diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index 426f4698c2b00..73e3f09394b72 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -84,13 +84,27 @@ fi # CMake 3.18 is needed to support CUDA17 language variant CMAKE_VERSION=3.18.5 -_UCX_COMMIT=00bcc6bb18fc282eb160623b4c0d300147f579af -_UCC_COMMIT=7cb07a76ccedad7e56ceb136b865eb9319c258ea +_UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb +_UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b # It's annoying to rename jobs every time you want to rewrite a # configuration, so we hardcode everything here rather than do it # from scratch case "$image" in + pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9) + CUDA_VERSION=12.4.0 + CUDNN_VERSION=8 + ANACONDA_PYTHON_VERSION=3.10 + GCC_VERSION=9 + PROTOBUF=yes + DB=yes + VISION=yes + KATEX=yes + UCX_COMMIT=${_UCX_COMMIT} + UCC_COMMIT=${_UCC_COMMIT} + CONDA_CMAKE=yes + TRITON=yes + ;; pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9) CUDA_VERSION=12.1.1 CUDNN_VERSION=8 @@ -105,6 +119,21 @@ case "$image" in CONDA_CMAKE=yes TRITON=yes ;; + pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9-inductor-benchmarks) + CUDA_VERSION=12.4.0 + CUDNN_VERSION=8 + ANACONDA_PYTHON_VERSION=3.10 + GCC_VERSION=9 + PROTOBUF=yes + DB=yes + VISION=yes + KATEX=yes + UCX_COMMIT=${_UCX_COMMIT} + UCC_COMMIT=${_UCC_COMMIT} + CONDA_CMAKE=yes + TRITON=yes + INDUCTOR_BENCHMARKS=yes + ;; pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks) CUDA_VERSION=12.1.1 CUDNN_VERSION=8 @@ -134,6 +163,20 @@ case "$image" in CONDA_CMAKE=yes TRITON=yes ;; + pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9) + CUDA_VERSION=12.4.0 + CUDNN_VERSION=8 + ANACONDA_PYTHON_VERSION=3.10 + GCC_VERSION=9 + PROTOBUF=yes + DB=yes + VISION=yes + KATEX=yes + UCX_COMMIT=${_UCX_COMMIT} + UCC_COMMIT=${_UCC_COMMIT} + CONDA_CMAKE=yes + TRITON=yes + ;; pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9) CUDA_VERSION=12.1.1 CUDNN_VERSION=8 @@ -226,7 +269,7 @@ case "$image" in PROTOBUF=yes DB=yes VISION=yes - BASEKIT_VERSION=2024.0.0-49522 + XPU_VERSION=0.5 NINJA_VERSION=1.9.0 CONDA_CMAKE=yes TRITON=yes @@ -403,7 +446,7 @@ docker build \ --build-arg "DOCS=${DOCS}" \ --build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \ --build-arg "EXECUTORCH=${EXECUTORCH}" \ - --build-arg "BASEKIT_VERSION=${BASEKIT_VERSION}" \ + --build-arg "XPU_VERSION=${XPU_VERSION}" \ --build-arg "ACL=${ACL:-}" \ --build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \ --build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \ diff --git a/.ci/docker/common/install_cudnn.sh b/.ci/docker/common/install_cudnn.sh index f654c9fee24e6..3afd2f28841f5 100644 --- a/.ci/docker/common/install_cudnn.sh +++ b/.ci/docker/common/install_cudnn.sh @@ -4,7 +4,10 @@ if [[ ${CUDNN_VERSION} == 8 ]]; then # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement mkdir tmp_cudnn pushd tmp_cudnn - if [[ ${CUDA_VERSION:0:4} == "12.1" ]]; then + if [[ ${CUDA_VERSION:0:4} == "12.4" ]]; then + CUDNN_NAME="cudnn-linux-x86_64-8.9.7.29_cuda12-archive" + curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${CUDNN_NAME}.tar.xz + elif [[ ${CUDA_VERSION:0:4} == "12.1" ]]; then CUDNN_NAME="cudnn-linux-x86_64-8.9.2.26_cuda12-archive" curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${CUDNN_NAME}.tar.xz elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then diff --git a/.ci/docker/common/install_cusparselt.sh b/.ci/docker/common/install_cusparselt.sh index d418f1c75610e..493982919f8a4 100644 --- 
a/.ci/docker/common/install_cusparselt.sh +++ b/.ci/docker/common/install_cusparselt.sh @@ -5,9 +5,14 @@ set -ex # cuSPARSELt license: https://docs.nvidia.com/cuda/cusparselt/license.html mkdir tmp_cusparselt && cd tmp_cusparselt -if [[ ${CUDA_VERSION:0:4} == "12.1" ]]; then - CUSPARSELT_NAME="libcusparse_lt-linux-x86_64-0.5.2.1-archive" - curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/${CUSPARSELT_NAME}.tar.xz +if [[ ${CUDA_VERSION:0:4} =~ ^12\.[1-4]$ ]]; then + arch_path='sbsa' + export TARGETARCH=${TARGETARCH:-$(uname -m)} + if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then + arch_path='x86_64' + fi + CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.5.2.1-archive" + curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then CUSPARSELT_NAME="libcusparse_lt-linux-x86_64-0.4.0.7-archive" curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/${CUSPARSELT_NAME}.tar.xz diff --git a/.ci/docker/common/install_triton.sh b/.ci/docker/common/install_triton.sh index de009c1a3adbf..d2db4cb76bfcf 100755 --- a/.ci/docker/common/install_triton.sh +++ b/.ci/docker/common/install_triton.sh @@ -15,7 +15,7 @@ conda_reinstall() { if [ -n "${ROCM_VERSION}" ]; then TRITON_REPO="https://github.com/openai/triton" TRITON_TEXT_FILE="triton-rocm" -elif [ -n "${BASEKIT_VERSION}" ]; then +elif [ -n "${XPU_VERSION}" ]; then TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton" TRITON_TEXT_FILE="triton-xpu" else diff --git a/.ci/docker/common/install_xpu.sh b/.ci/docker/common/install_xpu.sh index d98ad2049b47c..aa308010326a7 100644 --- a/.ci/docker/common/install_xpu.sh +++ b/.ci/docker/common/install_xpu.sh @@ -3,10 +3,7 @@ set -xe # Intel® software for general purpose GPU capabilities. -# Refer to https://dgpu-docs.intel.com/releases/LTS_803.29_20240131.html - -# Intel® oneAPI Base Toolkit (version 2024.0.0) has been updated to include functional and security updates. -# Refer to https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html +# Refer to https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html # Users should update to the latest version as it becomes available @@ -17,14 +14,16 @@ function install_ubuntu() { # Set up the repository. 
To do this, download the key to the system keyring wget -qO - https://repositories.intel.com/gpu/intel-graphics.key \ | gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg - wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \ | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null + wget -qO - https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \ | gpg --dearmor --output /usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg # Add the signed entry to APT sources and configure the APT client to use the Intel repository - echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" \ | tee /etc/apt/sources.list.d/intel-gpu-jammy.list + echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] \ + https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" \ | tee /etc/apt/sources.list.d/intel-gpu-jammy.list - echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" \ | tee /etc/apt/sources.list.d/oneAPI.list + echo "deb [signed-by=/usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg] \ + https://apt.repos.intel.com/intel-for-pytorch-gpu-dev all main" \ | tee /etc/apt/sources.list.d/intel-for-pytorch-gpu-dev.list # Update the packages list and repository index apt-get update @@ -40,11 +39,11 @@ function install_ubuntu() { mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo # Development Packages apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev - # Install Intel® oneAPI Base Toolkit - if [ -n "$BASEKIT_VERSION" ]; then - apt-get install intel-basekit=$BASEKIT_VERSION -y + # Install Intel Support Packages + if [ -n "$XPU_VERSION" ]; then + apt-get install -y intel-for-pytorch-gpu-dev-${XPU_VERSION} else - apt-get install intel-basekit -y + apt-get install -y intel-for-pytorch-gpu-dev fi # Cleanup diff --git a/.ci/docker/ubuntu-cuda/Dockerfile b/.ci/docker/ubuntu-cuda/Dockerfile index dcf7312c108fc..f96ee5e3b1070 100644 --- a/.ci/docker/ubuntu-cuda/Dockerfile +++ b/.ci/docker/ubuntu-cuda/Dockerfile @@ -152,6 +152,7 @@ RUN rm install_cusparselt.sh RUN if [ -h /usr/local/cuda-11.6/cuda-11.6 ]; then rm /usr/local/cuda-11.6/cuda-11.6; fi RUN if [ -h /usr/local/cuda-11.7/cuda-11.7 ]; then rm /usr/local/cuda-11.7/cuda-11.7; fi RUN if [ -h /usr/local/cuda-12.1/cuda-12.1 ]; then rm /usr/local/cuda-12.1/cuda-12.1; fi +RUN if [ -h /usr/local/cuda-12.4/cuda-12.4 ]; then rm /usr/local/cuda-12.4/cuda-12.4; fi USER jenkins CMD ["bash"] diff --git a/.ci/docker/ubuntu-xpu/Dockerfile b/.ci/docker/ubuntu-xpu/Dockerfile index e49358fcbd0d9..02cd1133a050c 100644 --- a/.ci/docker/ubuntu-xpu/Dockerfile +++ b/.ci/docker/ubuntu-xpu/Dockerfile @@ -62,7 +62,7 @@ RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_d RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt # Install XPU Dependencies -ARG BASEKIT_VERSION +ARG XPU_VERSION COPY ./common/install_xpu.sh install_xpu.sh RUN bash ./install_xpu.sh && rm install_xpu.sh diff --git a/.ci/pytorch/build.sh b/.ci/pytorch/build.sh index b81caa0513691..4aa5dc39d0f5f 100755 --- a/.ci/pytorch/build.sh +++ b/.ci/pytorch/build.sh @@ -44,11 +44,6 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then fi fi -if [[ ${BUILD_ENVIRONMENT} == *"caffe2"* ]]; then - echo "Caffe2 build is ON" - export BUILD_CAFFE2=ON -fi - if [[ ${BUILD_ENVIRONMENT} == *"paralleltbb"*
]]; then export ATEN_THREADING=TBB export USE_TBB=1 diff --git a/.ci/pytorch/docs-test.sh b/.ci/pytorch/docs-test.sh index 557f9d348772f..ffc00b623c14f 100755 --- a/.ci/pytorch/docs-test.sh +++ b/.ci/pytorch/docs-test.sh @@ -6,4 +6,4 @@ source "$(dirname "${BASH_SOURCE[0]}")/common.sh" echo "Testing pytorch docs" cd docs -make doctest +TERM=vt100 make doctest diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh index 19d28eeefd9a8..6af49bee7d05e 100755 --- a/.ci/pytorch/test.sh +++ b/.ci/pytorch/test.sh @@ -588,6 +588,15 @@ test_inductor_torchbench_smoketest_perf() { "$TEST_REPORTS_DIR/inductor_training_smoketest_$test.csv" \ --expected benchmarks/dynamo/expected_ci_perf_inductor_torchbench.csv done + + # Perform some "warm-start" runs for a few huggingface models. + for test in AlbertForQuestionAnswering AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do + python benchmarks/dynamo/huggingface.py --accuracy --training --amp --inductor --device cuda --warm-start-latency \ + --only $test --output "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv" + python benchmarks/dynamo/check_accuracy.py \ + --actual "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv" \ + --expected "benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_training.csv" + done } test_inductor_torchbench_cpu_smoketest_perf(){ diff --git a/.github/ci_commit_pins/audio.txt b/.github/ci_commit_pins/audio.txt index 9830a3ce9650e..98cd949f97130 100644 --- a/.github/ci_commit_pins/audio.txt +++ b/.github/ci_commit_pins/audio.txt @@ -1 +1 @@ -ea437b31ce316ea3d66fe73768c0dcb94edb79ad +1980f8af5bcd0bb2ce51965cf79d8d4c25dad8a0 diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml index 4bf7526e79141..ade85af096871 100644 --- a/.github/pytorch-probot.yml +++ b/.github/pytorch-probot.yml @@ -1,6 +1,5 @@ tracking_issue: 24422 ciflow_tracking_issue: 64124 -TD_rollout_issue: 123120 ciflow_push_tags: - ciflow/binaries - ciflow/binaries_conda diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 6d822165895eb..9f0dfe973dc9f 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -38,6 +38,8 @@ jobs: matrix: runner: [linux.12xlarge] docker-image-name: [ + pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9, + pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9-inductor-benchmarks, pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9, pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks, pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9, diff --git a/.lintrunner.toml b/.lintrunner.toml index 938e9521f72d2..988b1697c8455 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -1053,12 +1053,6 @@ exclude_patterns = [ 'test/quantization/fx/test_quantize_fx.py', 'test/quantization/fx/test_subgraph_rewriter.py', 'test/test_datapipe.py', - 'test/test_fake_tensor.py', - 'test/test_flop_counter.py', - 'test/test_function_schema.py', - 'test/test_functional_autograd_benchmark.py', - 'test/test_functional_optim.py', - 'test/test_functionalization_of_rng_ops.py', 'test/test_futures.py', 'test/test_fx.py', 'test/test_fx_experimental.py', @@ -1143,7 +1137,6 @@ exclude_patterns = [ 'test/test_transformers.py', 'test/test_type_promotion.py', 'test/test_unary_ufuncs.py', - 'test/test_utils.py', 'test/test_vulkan.py', 'test/test_xnnpack_integration.py', 'test/torch_np/numpy_test/**/*.py', diff --git a/CMakeLists.txt b/CMakeLists.txt index 79db67e7357b5..f7561d606cbdb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ 
-181,11 +181,7 @@ option(BUILD_BINARY "Build C++ binaries" OFF) option(BUILD_DOCS "Build Caffe2 documentation" OFF) option(BUILD_CUSTOM_PROTOBUF "Build and use Caffe2's own protobuf under third_party" ON) option(BUILD_PYTHON "Build Python binaries" ON) -option(BUILD_CAFFE2 "Master flag to build Caffe2" OFF) option(BUILD_LITE_INTERPRETER "Master flag to build Lite Interpreter" OFF) -cmake_dependent_option( - BUILD_CAFFE2_OPS "Build Caffe2 operators" ON - "BUILD_CAFFE2" OFF) option(BUILD_SHARED_LIBS "Build libcaffe2.so" ON) cmake_dependent_option( CAFFE2_LINK_LOCAL_PROTOBUF "If set, build protobuf inside libcaffe2.so." ON @@ -281,8 +277,8 @@ if(NOT DEFINED USE_VULKAN) endif() option(USE_SLEEF_FOR_ARM_VEC256 "Use sleef for arm" OFF) -option(USE_SOURCE_DEBUG_ON_MOBILE "Enable " ON) -option(USE_LITE_INTERPRETER_PROFILER "Enable " ON) +option(USE_SOURCE_DEBUG_ON_MOBILE "Enable" ON) +option(USE_LITE_INTERPRETER_PROFILER "Enable" ON) option(USE_VULKAN_FP16_INFERENCE "Vulkan - Use fp16 inference" OFF) option(USE_VULKAN_RELAXED_PRECISION "Vulkan - Use relaxed precision math in the kernels (mediump)" OFF) # option USE_XNNPACK: try to enable xnnpack by default. @@ -635,7 +631,6 @@ if(INTERN_BUILD_MOBILE) endif() set(BUILD_PYTHON OFF) set(BUILD_FUNCTORCH OFF) - set(BUILD_CAFFE2_OPS OFF) set(USE_DISTRIBUTED OFF) set(NO_API ON) set(USE_FBGEMM OFF) @@ -1208,13 +1203,6 @@ else() "shared libs.") endif() -# ---[ Modules -# If master flag for buildling Caffe2 is disabled, we also disable the -# build for Caffe2 related operator modules. -if(BUILD_CAFFE2) - add_subdirectory(modules) -endif() - # ---[ Binaries # Binaries will be built after the Caffe2 main libraries and the modules # are built. For the binaries, they will be linked to the Caffe2 main diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e665e3fb8bbf6..a37c5a3b405db 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -667,7 +667,6 @@ only interested in a specific component. - Working on a test binary? Run `(cd build && ninja bin/test_binary_name)` to rebuild only that test binary (without rerunning cmake). (Replace `ninja` with `make` if you don't have ninja installed). -- Don't need Caffe2? Pass `BUILD_CAFFE2=0` to disable Caffe2 build. On the initial build, you can also speed things up with the environment variables `DEBUG`, `USE_DISTRIBUTED`, `USE_MKLDNN`, `USE_CUDA`, `USE_FLASH_ATTENTION`, `USE_MEM_EFF_ATTENTION`, `BUILD_TEST`, `USE_FBGEMM`, `USE_NNPACK` and `USE_QNNPACK`. @@ -1196,7 +1195,7 @@ build_with_asan() LDFLAGS="-stdlib=libstdc++" \ CFLAGS="-fsanitize=address -fno-sanitize-recover=all -shared-libasan -pthread" \ CXX_FLAGS="-pthread" \ - USE_CUDA=0 USE_OPENMP=0 BUILD_CAFFE2_OPS=0 USE_DISTRIBUTED=0 DEBUG=1 \ + USE_CUDA=0 USE_OPENMP=0 USE_DISTRIBUTED=0 DEBUG=1 \ python setup.py develop } diff --git a/README.md b/README.md index 3ff42586109c3..eb291b1c97e00 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,9 @@ Our trunk health (Continuous Integration signals) can be found at [hud.pytorch.o - [NVIDIA Jetson Platforms](#nvidia-jetson-platforms) - [From Source](#from-source) - [Prerequisites](#prerequisites) + - [NVIDIA CUDA Support](#nvidia-cuda-support) + - [AMD ROCm Support](#amd-rocm-support) + - [Intel GPU Support](#intel-gpu-support) - [Install Dependencies](#install-dependencies) - [Get the PyTorch Source](#get-the-pytorch-source) - [Install PyTorch](#install-pytorch) @@ -162,6 +165,7 @@ If you are installing from source, you will need: We highly recommend installing an [Anaconda](https://www.anaconda.com/download) environment. 
You will get a high-quality BLAS library (MKL) and you get controlled dependency versions regardless of your Linux distro. +##### NVIDIA CUDA Support If you want to compile with CUDA support, [select a supported version of CUDA from our support matrix](https://pytorch.org/get-started/locally/), then install the following: - [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) - [NVIDIA cuDNN](https://developer.nvidia.com/cudnn) v8.5 or above @@ -174,6 +178,7 @@ Other potentially useful environment variables may be found in `setup.py`. If you are building for NVIDIA's Jetson platforms (Jetson Nano, TX1, TX2, AGX Xavier), Instructions to install PyTorch for Jetson Nano are [available here](https://devtalk.nvidia.com/default/topic/1049071/jetson-nano/pytorch-for-jetson-nano/) +##### AMD ROCm Support If you want to compile with ROCm support, install - [AMD ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html) 4.0 and above installation - ROCm is currently supported only for Linux systems. @@ -181,6 +186,14 @@ If you want to compile with ROCm support, install If you want to disable ROCm support, export the environment variable `USE_ROCM=0`. Other potentially useful environment variables may be found in `setup.py`. +##### Intel GPU Support +If you want to compile with Intel GPU support, follow these +- [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html) instructions. +- Intel GPU is currently supported only for Linux systems. + +If you want to disable Intel GPU support, export the environment variable `USE_XPU=0`. +Other potentially useful environment variables may be found in `setup.py`. + #### Install Dependencies **Common** @@ -379,7 +392,7 @@ You can also pass the `CMAKE_VARS="..."` environment variable to specify additio See [setup.py](./setup.py) for the list of available variables. 
```bash -CMAKE_VARS="BUILD_CAFFE2=ON BUILD_CAFFE2_OPS=ON" make -f docker.Makefile +make -f docker.Makefile ``` ### Building the Documentation diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index 583662e6c63d0..9ec458fda45e4 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -54,7 +54,7 @@ if(NOT BUILD_LITE_INTERPRETER) endif() EXCLUDE(ATen_CORE_SRCS "${ATen_CORE_SRCS}" ${ATen_CORE_TEST_SRCS}) # Exclude TensorImpl_test.cpp if compiling without Caffe2 -if(NOT BUILD_CAFFE2 AND NOT BUILD_LITE_INTERPRETER) +if(NOT BUILD_LITE_INTERPRETER) file(GLOB_RECURSE ATen_CORE_EXCLUDED_TEST_SRCS "core/TensorImpl_test.cpp") EXCLUDE(ATen_CORE_TEST_SRCS "${ATen_CORE_TEST_SRCS}" ${ATen_CORE_EXCLUDED_TEST_SRCS}) endif() diff --git a/aten/src/ATen/CPUGeneratorImpl.cpp b/aten/src/ATen/CPUGeneratorImpl.cpp index 2d086ebbe71fe..156a2b663c033 100644 --- a/aten/src/ATen/CPUGeneratorImpl.cpp +++ b/aten/src/ATen/CPUGeneratorImpl.cpp @@ -81,8 +81,8 @@ inline uint64_t make64BitsFrom32Bits(uint32_t hi, uint32_t lo) { CPUGeneratorImpl::CPUGeneratorImpl(uint64_t seed_in) : c10::GeneratorImpl{Device(DeviceType::CPU), DispatchKeySet(c10::DispatchKey::CPU)}, engine_{seed_in}, - next_float_normal_sample_{c10::optional()}, - next_double_normal_sample_{c10::optional()} { } + next_float_normal_sample_{std::optional()}, + next_double_normal_sample_{std::optional()} { } /** * Manually seeds the engine with the seed input @@ -151,8 +151,8 @@ void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) { detail::check_rng_state(new_state); at::mt19937 engine; - auto float_normal_sample = c10::optional(); - auto double_normal_sample = c10::optional(); + auto float_normal_sample = std::optional(); + auto double_normal_sample = std::optional(); // Construct the state of at::CPUGeneratorImpl based on input byte tensor size. CPUGeneratorImplStateLegacy* legacy_pod{nullptr}; @@ -160,7 +160,7 @@ void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) { if (new_state_size == size_legacy) { legacy_pod = (CPUGeneratorImplStateLegacy*)new_state.data(); // Note that in CPUGeneratorImplStateLegacy, we didn't have float version - // of normal sample and hence we leave the c10::optional as is + // of normal sample and hence we leave the std::optional as is // Update next_double_normal_sample. // Note that CPUGeneratorImplStateLegacy stores two uniform values (normal_x, normal_y) @@ -171,14 +171,14 @@ void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) { auto r = legacy_pod->normal_rho; auto theta = 2.0 * c10::pi * legacy_pod->normal_x; // we return the sin version of the normal sample when in caching mode - double_normal_sample = c10::optional(r * ::sin(theta)); + double_normal_sample = std::optional(r * ::sin(theta)); } } else if (new_state_size == size_current) { auto rng_state = (CPUGeneratorImplState*)new_state.data(); legacy_pod = &rng_state->legacy_pod; // update next_float_normal_sample if (rng_state->is_next_float_normal_sample_valid) { - float_normal_sample = c10::optional(rng_state->next_float_normal_sample); + float_normal_sample = std::optional(rng_state->next_float_normal_sample); } // Update next_double_normal_sample. @@ -186,7 +186,7 @@ void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) { // and if it's valid in normal_is_valid. The redundant normal_x and normal_rho // are squashed to 0.0. 
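The CPUGeneratorImpl hunks above switch the cached normal sample from c10::optional to std::optional; the set_state path rebuilds that cache from the legacy POD state via the Box-Muller identity (r * sin(theta)), as the surrounding code shows. A minimal standalone sketch of that idea, assuming a simplified LegacyState with only the fields visible in the diff; the real code also handles the float variant and the newer state layout.

```cpp
#include <cmath>
#include <optional>

// Simplified stand-in for CPUGeneratorImplStateLegacy; field names follow the diff.
struct LegacyState {
  bool normal_is_valid;
  double normal_x;    // cached uniform sample
  double normal_rho;  // cached radius
};

// Rebuild the cached Box-Muller sample as a std::optional<double>:
// empty when the legacy state has no valid cached normal.
std::optional<double> restore_cached_normal(const LegacyState& s) {
  if (!s.normal_is_valid) {
    return std::nullopt;
  }
  const double pi = 3.14159265358979323846;
  double theta = 2.0 * pi * s.normal_x;
  // The generator caches the sin() half of the Box-Muller pair.
  return s.normal_rho * std::sin(theta);
}
```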
if (legacy_pod->normal_is_valid) { - double_normal_sample = c10::optional(legacy_pod->normal_y); + double_normal_sample = std::optional(legacy_pod->normal_y); } } else { AT_ERROR("Expected either a CPUGeneratorImplStateLegacy of size ", size_legacy, @@ -283,14 +283,14 @@ uint64_t CPUGeneratorImpl::random64() { /** * Get the cached normal random in float */ -c10::optional CPUGeneratorImpl::next_float_normal_sample() { +std::optional CPUGeneratorImpl::next_float_normal_sample() { return next_float_normal_sample_; } /** * Get the cached normal random in double */ -c10::optional CPUGeneratorImpl::next_double_normal_sample() { +std::optional CPUGeneratorImpl::next_double_normal_sample() { return next_double_normal_sample_; } @@ -299,7 +299,7 @@ c10::optional CPUGeneratorImpl::next_double_normal_sample() { * * See Note [Acquire lock when using random generators] */ -void CPUGeneratorImpl::set_next_float_normal_sample(c10::optional randn) { +void CPUGeneratorImpl::set_next_float_normal_sample(std::optional randn) { next_float_normal_sample_ = randn; } @@ -308,7 +308,7 @@ void CPUGeneratorImpl::set_next_float_normal_sample(c10::optional randn) * * See Note [Acquire lock when using random generators] */ -void CPUGeneratorImpl::set_next_double_normal_sample(c10::optional randn) { +void CPUGeneratorImpl::set_next_double_normal_sample(std::optional randn) { next_double_normal_sample_ = randn; } diff --git a/aten/src/ATen/CPUGeneratorImpl.h b/aten/src/ATen/CPUGeneratorImpl.h index f74c42f44fda5..34dd33a475b91 100644 --- a/aten/src/ATen/CPUGeneratorImpl.h +++ b/aten/src/ATen/CPUGeneratorImpl.h @@ -24,18 +24,18 @@ struct TORCH_API CPUGeneratorImpl : public c10::GeneratorImpl { static c10::DeviceType device_type(); uint32_t random(); uint64_t random64(); - c10::optional next_float_normal_sample(); - c10::optional next_double_normal_sample(); - void set_next_float_normal_sample(c10::optional randn); - void set_next_double_normal_sample(c10::optional randn); + std::optional next_float_normal_sample(); + std::optional next_double_normal_sample(); + void set_next_float_normal_sample(std::optional randn); + void set_next_double_normal_sample(std::optional randn); at::mt19937 engine(); void set_engine(at::mt19937 engine); private: CPUGeneratorImpl* clone_impl() const override; at::mt19937 engine_; - c10::optional next_float_normal_sample_; - c10::optional next_double_normal_sample_; + std::optional next_float_normal_sample_; + std::optional next_double_normal_sample_; }; namespace detail { diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index b50f0479e2fab..a922bcd5922fc 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -59,7 +59,7 @@ class TORCH_API Context { } } const AcceleratorHooksInterface& getAcceleratorHooksInterface( - c10::optional opt_device_type = c10::nullopt) { + std::optional opt_device_type = c10::nullopt) { c10::DeviceType device_type = opt_device_type.has_value() ? 
opt_device_type.value() : at::getAccelerator(true).value(); @@ -395,7 +395,7 @@ class TORCH_API Context { bool release_original_weights = false; #endif bool display_vmap_fallback_warnings_ = false; - c10::optional quantized_engine = c10::nullopt; + std::optional quantized_engine = c10::nullopt; bool enable_sparse_tensor_invariant_checks = false; bool allow_fp16_reduction_cpu = false; diff --git a/aten/src/ATen/DeviceGuard.h b/aten/src/ATen/DeviceGuard.h index adc7f3efdbb6a..6c2f57e16c8ce 100644 --- a/aten/src/ATen/DeviceGuard.h +++ b/aten/src/ATen/DeviceGuard.h @@ -15,7 +15,7 @@ namespace at { // OptionalDeviceGuard guard(device_of(tensor)); /// Return the Device of a Tensor, if the Tensor is defined. -inline c10::optional device_of(const Tensor& t) { +inline std::optional device_of(const Tensor& t) { if (t.defined()) { return c10::make_optional(t.device()); } else { @@ -23,14 +23,14 @@ inline c10::optional device_of(const Tensor& t) { } } -inline c10::optional device_of(const c10::optional& t) { +inline std::optional device_of(const c10::optional& t) { return t.has_value() ? device_of(t.value()) : c10::nullopt; } /// Return the Device of a TensorList, if the list is non-empty and /// the first Tensor is defined. (This function implicitly assumes /// that all tensors in the list have the same device.) -inline c10::optional device_of(ITensorListRef t) { +inline std::optional device_of(ITensorListRef t) { if (!t.empty()) { return device_of(t.front()); } else { diff --git a/aten/src/ATen/EmptyTensor.cpp b/aten/src/ATen/EmptyTensor.cpp index 0b35fc67b53ac..1eb5c070b547c 100644 --- a/aten/src/ATen/EmptyTensor.cpp +++ b/aten/src/ATen/EmptyTensor.cpp @@ -163,7 +163,7 @@ TensorBase _empty_generic( c10::Allocator* allocator, c10::DispatchKeySet ks, ScalarType scalar_type, - c10::optional memory_format_opt) { + std::optional memory_format_opt) { at::detail::check_size_nonnegative(size); at::detail::raise_warning_for_complex_half(scalar_type); caffe2::TypeMeta dtype = scalarTypeToTypeMeta(scalar_type); @@ -197,7 +197,7 @@ TensorBase empty_generic( c10::Allocator* allocator, c10::DispatchKeySet ks, ScalarType scalar_type, - c10::optional memory_format_opt) { + std::optional memory_format_opt) { return _empty_generic(size, allocator, ks, scalar_type, memory_format_opt); } @@ -206,7 +206,7 @@ TensorBase empty_generic_symint( c10::Allocator* allocator, c10::DispatchKeySet ks, ScalarType scalar_type, - c10::optional memory_format_opt) { + std::optional memory_format_opt) { return _empty_generic(size, allocator, ks, scalar_type, memory_format_opt); } @@ -252,7 +252,7 @@ TensorBase empty_strided_symint_generic( } TensorBase empty_cpu(IntArrayRef size, ScalarType dtype, bool pin_memory, - c10::optional memory_format_opt) { + std::optional memory_format_opt) { auto allocator = GetCPUAllocatorMaybePinned(pin_memory); constexpr c10::DispatchKeySet cpu_ks(c10::DispatchKey::CPU); return empty_generic(size, allocator, cpu_ks, dtype, memory_format_opt); @@ -260,11 +260,11 @@ TensorBase empty_cpu(IntArrayRef size, ScalarType dtype, bool pin_memory, TensorBase empty_cpu( IntArrayRef size, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt, - c10::optional memory_format_opt) { + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt, + std::optional memory_format_opt) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device_or_default(device_opt).type() == DeviceType::CPU); 
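device_of in DeviceGuard.h now returns std::optional and gains an overload lifted over an optional input. A self-contained sketch of that shape, using hypothetical Device/Tensor stand-ins rather than the real c10/ATen types:

```cpp
#include <optional>
#include <string>

// Hypothetical stand-ins for c10::Device / at::Tensor, only to show the shape
// of the device_of overloads after the std::optional migration.
struct Device { std::string str; };
struct Tensor {
  bool defined() const { return has_device; }
  Device device() const { return Device{"cuda:0"}; }
  bool has_device = true;
};

// Mirrors at::device_of(const Tensor&): empty optional for an undefined tensor.
std::optional<Device> device_of(const Tensor& t) {
  if (t.defined()) {
    return t.device();
  }
  return std::nullopt;
}

// Mirrors the overload for an optional tensor: propagate emptiness.
std::optional<Device> device_of(const std::optional<Tensor>& t) {
  return t.has_value() ? device_of(*t) : std::nullopt;
}
```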
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout_or_default(layout_opt) == Layout::Strided); @@ -295,10 +295,10 @@ TensorBase empty_strided_cpu(IntArrayRef size, IntArrayRef stride, TensorBase empty_strided_cpu( IntArrayRef size, IntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt) { + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device_or_default(device_opt).type() == DeviceType::CPU); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout_or_default(layout_opt) == Layout::Strided); @@ -342,7 +342,7 @@ static MetaAllocator g_meta_alloc; REGISTER_ALLOCATOR(kMeta, &g_meta_alloc); TensorBase empty_meta(IntArrayRef size, ScalarType dtype, - c10::optional memory_format_opt) { + std::optional memory_format_opt) { auto *allocator = GetAllocator(kMeta); constexpr c10::DispatchKeySet meta_dks(c10::DispatchKey::Meta); return at::detail::empty_generic( @@ -351,11 +351,11 @@ TensorBase empty_meta(IntArrayRef size, ScalarType dtype, TensorBase empty_meta( IntArrayRef size, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt, - c10::optional memory_format_opt + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt, + std::optional memory_format_opt ) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device_or_default(device_opt).type() == DeviceType::Meta); // NB: because there is no SparseMeta (yet), non-strided layout is @@ -371,11 +371,11 @@ TensorBase empty_meta( TensorBase empty_symint_meta( SymIntArrayRef size, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt, - c10::optional memory_format_opt + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt, + std::optional memory_format_opt ) { auto *allocator = GetAllocator(kMeta); constexpr c10::DispatchKeySet ks(c10::DispatchKey::Meta); @@ -405,10 +405,10 @@ TensorBase empty_strided_meta(IntArrayRef size, IntArrayRef stride, TensorBase empty_strided_meta( IntArrayRef size, IntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt) { + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device_or_default(device_opt).type() == DeviceType::Meta); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout_or_default(layout_opt) == Layout::Strided); @@ -440,10 +440,10 @@ TensorBase empty_strided_symint_meta(SymIntArrayRef size, SymIntArrayRef stride, TensorBase empty_strided_symint_meta( SymIntArrayRef size, SymIntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt) { + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device_or_default(device_opt).type() == DeviceType::Meta); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout_or_default(layout_opt) == Layout::Strided); diff --git a/aten/src/ATen/EmptyTensor.h b/aten/src/ATen/EmptyTensor.h index f6e2e53bc99f5..e0e304ea8e8f6 100644 --- a/aten/src/ATen/EmptyTensor.h +++ b/aten/src/ATen/EmptyTensor.h @@ -49,14 +49,14 @@ TORCH_API TensorBase empty_generic( c10::Allocator* allocator, c10::DispatchKeySet ks, 
ScalarType scalar_type, - c10::optional memory_format_opt); + std::optional memory_format_opt); TORCH_API TensorBase empty_generic_symint( SymIntArrayRef size, c10::Allocator* allocator, c10::DispatchKeySet ks, ScalarType scalar_type, - c10::optional memory_format_opt); + std::optional memory_format_opt); TORCH_API TensorBase empty_strided_generic( IntArrayRef size, @@ -76,15 +76,15 @@ TORCH_API TensorBase empty_cpu( IntArrayRef size, ScalarType dtype, bool pin_memory = false, - c10::optional memory_format_opt = c10::nullopt); + std::optional memory_format_opt = c10::nullopt); TORCH_API TensorBase empty_cpu( IntArrayRef size, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt, - c10::optional memory_format_opt); + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt, + std::optional memory_format_opt); TORCH_API TensorBase empty_cpu(IntArrayRef size, const TensorOptions& options); @@ -97,10 +97,10 @@ TORCH_API TensorBase empty_strided_cpu( TORCH_API TensorBase empty_strided_cpu( IntArrayRef size, IntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt); + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt); TORCH_API TensorBase empty_strided_cpu( IntArrayRef size, @@ -110,23 +110,23 @@ TORCH_API TensorBase empty_strided_cpu( TORCH_API TensorBase empty_meta( IntArrayRef size, ScalarType dtype, - c10::optional memory_format_opt = c10::nullopt); + std::optional memory_format_opt = c10::nullopt); TORCH_API TensorBase empty_meta( IntArrayRef size, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt, - c10::optional memory_format_opt); + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt, + std::optional memory_format_opt); TORCH_API TensorBase empty_symint_meta( SymIntArrayRef size, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt, - c10::optional memory_format_opt); + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt, + std::optional memory_format_opt); TORCH_API TensorBase empty_meta(IntArrayRef size, const TensorOptions& options); @@ -136,10 +136,10 @@ empty_strided_meta(IntArrayRef size, IntArrayRef stride, ScalarType dtype); TORCH_API TensorBase empty_strided_meta( IntArrayRef size, IntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt); + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt); TORCH_API TensorBase empty_strided_meta( IntArrayRef size, @@ -154,10 +154,10 @@ TORCH_API TensorBase empty_strided_symint_meta( TORCH_API TensorBase empty_strided_symint_meta( SymIntArrayRef size, SymIntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt); + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt); TORCH_API TensorBase empty_strided_symint_meta( SymIntArrayRef size, diff --git a/aten/src/ATen/FunctionalInverses.cpp b/aten/src/ATen/FunctionalInverses.cpp index ebc24085a74a8..c70c8bd842f9e 100644 --- 
a/aten/src/ATen/FunctionalInverses.cpp +++ b/aten/src/ATen/FunctionalInverses.cpp @@ -145,7 +145,7 @@ Tensor FunctionalInverses::_neg_view_inverse(const Tensor& base, const Tensor& m } } -Tensor FunctionalInverses::as_strided_inverse(const Tensor& base, const Tensor& mutated_view, InverseReturnMode inverse_return_mode, at::SymIntArrayRef size, at::SymIntArrayRef stride, c10::optional storage_offset) { +Tensor FunctionalInverses::as_strided_inverse(const Tensor& base, const Tensor& mutated_view, InverseReturnMode inverse_return_mode, at::SymIntArrayRef size, at::SymIntArrayRef stride, std::optional storage_offset) { if (inverse_return_mode == InverseReturnMode::AlwaysView) { // NB: assumes mutated_view is a narrowed view of base. // We should NOT do this for functionalization @@ -220,7 +220,7 @@ Tensor FunctionalInverses::lift_fresh_inverse(const Tensor& base, const Tensor& return mutated_view; } -Tensor FunctionalInverses::slice_Tensor_inverse(const Tensor& base, const Tensor& mutated_view, InverseReturnMode inverse_return_mode, int64_t dim, c10::optional start, c10::optional end, c10::SymInt step) { +Tensor FunctionalInverses::slice_Tensor_inverse(const Tensor& base, const Tensor& mutated_view, InverseReturnMode inverse_return_mode, int64_t dim, std::optional start, c10::optional end, c10::SymInt step) { if (inverse_return_mode == InverseReturnMode::AlwaysView) { // NB: assumes mutated_view is a narrowed view of base. // We should NOT do this for functionalization diff --git a/aten/src/ATen/FunctionalTensorWrapper.cpp b/aten/src/ATen/FunctionalTensorWrapper.cpp index c9ef28dbf56e4..73edec07e2623 100644 --- a/aten/src/ATen/FunctionalTensorWrapper.cpp +++ b/aten/src/ATen/FunctionalTensorWrapper.cpp @@ -526,7 +526,7 @@ Tensor to_functional_tensor(const Tensor& tensor) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!isFunctionalTensor(tensor)); return at::detail::make_tensor(tensor); } -c10::optional to_functional_tensor(const c10::optional& tensor) { +std::optional to_functional_tensor(const c10::optional& tensor) { if (tensor.has_value()) { return c10::make_optional(to_functional_tensor(*tensor)); } @@ -564,7 +564,7 @@ Tensor from_functional_tensor(const Tensor& tensor, bool assert_functional) { return tensor; } } -c10::optional from_functional_tensor(const c10::optional& t, bool assert_functional) { +std::optional from_functional_tensor(const c10::optional& t, bool assert_functional) { if (t.has_value()) { return c10::make_optional(from_functional_tensor(*t, assert_functional)); } @@ -610,7 +610,7 @@ void sync(const Tensor& t) { auto functional_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(t); functional_impl->sync_(); } -void sync(const c10::optional& t) { +void sync(const std::optional& t) { if (t.has_value()) { sync(*t); } @@ -692,7 +692,7 @@ bool isFunctionalTensor(const at::Tensor& tensor) { return tensor.unsafeGetTensorImpl()->key_set().has(c10::DispatchKey::Functionalize); } -bool isFunctionalTensor(const c10::optional& t) { +bool isFunctionalTensor(const std::optional& t) { if (t.has_value()) { return isFunctionalTensor(*t); } else { diff --git a/aten/src/ATen/FunctionalTensorWrapper.h b/aten/src/ATen/FunctionalTensorWrapper.h index 95d6afe5f0be0..6ef890b772c1c 100644 --- a/aten/src/ATen/FunctionalTensorWrapper.h +++ b/aten/src/ATen/FunctionalTensorWrapper.h @@ -286,32 +286,32 @@ TORCH_API inline FunctionalTensorWrapper* unsafeGetFunctionalWrapper( } TORCH_API bool isFunctionalTensor(const at::Tensor& tensor); -TORCH_API bool isFunctionalTensor(const c10::optional& t); 
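Several of these wrappers follow one pattern: the optional-Tensor overload applies the underlying transform when a value is present and forwards the empty optional otherwise. A generic, standalone illustration of that pattern (map_optional is a made-up helper name, not an ATen API):

```cpp
#include <iostream>
#include <optional>

// Made-up helper capturing the pattern used by the optional overloads of
// to_functional_tensor / from_functional_tensor / sync: apply f when a value
// is present, otherwise keep the optional empty.
template <typename T, typename F>
auto map_optional(const std::optional<T>& value, F&& f)
    -> std::optional<decltype(f(*value))> {
  if (value.has_value()) {
    return f(*value);
  }
  return std::nullopt;
}

int main() {
  std::optional<int> some = 3, none;
  auto doubled = map_optional(some, [](int x) { return x * 2; });      // contains 6
  auto still_empty = map_optional(none, [](int x) { return x * 2; });  // empty
  std::cout << doubled.value_or(-1) << " " << still_empty.has_value() << "\n";
}
```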
+TORCH_API bool isFunctionalTensor(const std::optional& t); TORCH_API bool isFunctionalTensor( - const c10::List>& t_list); + const c10::List>& t_list); TORCH_API bool isFunctionalTensor(ITensorListRef list); TORCH_API Tensor to_functional_tensor(const Tensor& tensor); -TORCH_API c10::optional to_functional_tensor( - const c10::optional& tensor); -TORCH_API c10::List> to_functional_tensor( - const c10::List>& t_list); +TORCH_API std::optional to_functional_tensor( + const std::optional& tensor); +TORCH_API c10::List> to_functional_tensor( + const c10::List>& t_list); TORCH_API std::vector to_functional_tensor(ITensorListRef t_list); TORCH_API void freeze_functional_tensor(const Tensor& tensor); TORCH_API Tensor from_functional_tensor(const Tensor& tensor, bool assert_functional = true); -TORCH_API c10::optional from_functional_tensor( - const c10::optional& t, +TORCH_API std::optional from_functional_tensor( + const std::optional& t, bool assert_functional = true); -TORCH_API c10::List> from_functional_tensor( - const c10::List>& t_list); +TORCH_API c10::List> from_functional_tensor( + const c10::List>& t_list); TORCH_API std::vector from_functional_tensor(ITensorListRef t_list); TORCH_API void sync(const at::Tensor& t); -TORCH_API void sync(const c10::optional& t); -TORCH_API void sync(const c10::List>& t_list); +TORCH_API void sync(const std::optional& t); +TORCH_API void sync(const c10::List>& t_list); TORCH_API void sync(ITensorListRef t_list); TORCH_API void replace_(const Tensor& functional_tensor, const Tensor& other); diff --git a/aten/src/ATen/FunctionalizeFallbackKernel.cpp b/aten/src/ATen/FunctionalizeFallbackKernel.cpp index 8b26c875fc02c..1ffc268b7f79b 100644 --- a/aten/src/ATen/FunctionalizeFallbackKernel.cpp +++ b/aten/src/ATen/FunctionalizeFallbackKernel.cpp @@ -125,7 +125,7 @@ namespace { // - when we resize to a larger size, it acts as a mutation // - when we resize to a smaller size, it acts as a view // See Note [resize_ in Functionalization] for more dtails -static const at::Tensor & resize__functionalization(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, c10::optional memory_format) { +static const at::Tensor & resize__functionalization(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, std::optional memory_format) { // First unwrap the tensor arguments at::Tensor self_; if (at::functionalization::impl::isFunctionalTensor(self)) { @@ -216,7 +216,7 @@ static at::Tensor lift_fresh_functionalize_copy(const at::Tensor & self) { // in the local include TLS. As a result, when we redispatch here, // we will end up hitting PreDispatch stack first. So, we should // directly redispatch to the functionalize key manually. - static auto op = c10::Dispatcher::singleton().findSchemaOrThrow("aten::clone", "").typed)>(); + static auto op = c10::Dispatcher::singleton().findSchemaOrThrow("aten::clone", "").typed)>(); return op.redispatch(c10::DispatchKeySet({c10::DispatchKey::Functionalize}), self, c10::nullopt); } @@ -225,7 +225,7 @@ static at::Tensor lift_fresh_functionalize_copy(const at::Tensor & self) { return at::functionalization::impl::to_functional_tensor(out); } -static bool device_opted_into_functionalization(c10::Device self_device, c10::optional tgt_device) { +static bool device_opted_into_functionalization(c10::Device self_device, std::optional tgt_device) { // If the target device is empty, then the output tensor should be on the same device as the input auto real_tgt_device = tgt_device.has_value() ? 
tgt_device.value() : self_device; return real_tgt_device.type() == c10::DeviceType::XLA || real_tgt_device.type() == c10::DeviceType::Lazy; @@ -235,12 +235,12 @@ static bool device_opted_into_functionalization(c10::Device self_device, c10::op // We should probably get rid of this though. static at::Tensor _to_copy_functionalize( const at::Tensor & self, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, bool non_blocking, - c10::optional memory_format) { + std::optional memory_format) { at::Tensor self_; if (at::functionalization::impl::isFunctionalTensor(self)) { // sync any pending updates diff --git a/aten/src/ATen/InferSize.h b/aten/src/ATen/InferSize.h index caa8ec42003c9..411cf12d51341 100644 --- a/aten/src/ATen/InferSize.h +++ b/aten/src/ATen/InferSize.h @@ -23,7 +23,7 @@ inline void infer_size_impl( ResultVec& res) { NumelType newsize = 1; // N.B. this is an index, not a sym dim! - auto infer_dim = c10::optional(); + auto infer_dim = std::optional(); for (int64_t dim = 0, ndim = shape.size(); dim != ndim; dim++) { if (shape[dim] == -1) { if (infer_dim) { diff --git a/aten/src/ATen/LegacyBatchingRegistrations.cpp b/aten/src/ATen/LegacyBatchingRegistrations.cpp index bae40e3c8e51f..e0f7fce43f9e4 100644 --- a/aten/src/ATen/LegacyBatchingRegistrations.cpp +++ b/aten/src/ATen/LegacyBatchingRegistrations.cpp @@ -380,8 +380,8 @@ Tensor select_backward_batching_rule(const Tensor& grad, IntArrayRef input_sizes Tensor slice_batching_rule( const Tensor& self, int64_t dim, - c10::optional start, - c10::optional end, + std::optional start, + std::optional end, int64_t step) { auto self_physical = MultiBatchVmapTransform::logicalToPhysical(self); auto dim_physical = self_physical.getPhysicalDim(dim); @@ -996,10 +996,10 @@ Tensor new_zeros_batching_rule( Tensor new_empty_batching_rule( const Tensor& self, IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { auto physical_view = MultiBatchVmapTransform::logicalToPhysical(self); auto physical_size = physical_view.getPhysicalShape(size); auto result = physical_view.tensor().new_empty(physical_size, TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory)); @@ -1209,10 +1209,10 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) { BINARY_POINTWISE(mul); BINARY_POINTWISE(div); { - using Binop = Tensor (*)(const Tensor&, const Tensor&, c10::optional); - using Unop = Tensor (*)(const Tensor&, const Scalar&, c10::optional); - m.impl("div.Tensor_mode", binary_pointwise_batching_rule>); - m.impl("div.Scalar_mode", unwrap_and_call>); + using Binop = Tensor (*)(const Tensor&, const Tensor&, std::optional); + using Unop = Tensor (*)(const Tensor&, const Scalar&, std::optional); + m.impl("div.Tensor_mode", binary_pointwise_batching_rule>); + m.impl("div.Scalar_mode", unwrap_and_call>); } // at::pow has three out-of-place overloads diff --git a/aten/src/ATen/NamedTensorUtils.cpp b/aten/src/ATen/NamedTensorUtils.cpp index a76156c03402d..3e66ef7f74dea 100644 --- a/aten/src/ATen/NamedTensorUtils.cpp +++ b/aten/src/ATen/NamedTensorUtils.cpp @@ -128,7 +128,7 @@ static void assert_names_equal(DimnameList a, DimnameList b) { } const Tensor& propagate_names_if_present_and_nonempty(const Tensor& result, - c10::optional maybe_names, + 
std::optional maybe_names, bool validate_names) { auto maybe_name_list = maybe_names.value_or(at::ArrayRef{}); propagate_names_if_nonempty(result.unsafeGetTensorImpl(), maybe_name_list, validate_names); diff --git a/aten/src/ATen/NamedTensorUtils.h b/aten/src/ATen/NamedTensorUtils.h index c1443b7eaa01b..47dcd6dd76851 100644 --- a/aten/src/ATen/NamedTensorUtils.h +++ b/aten/src/ATen/NamedTensorUtils.h @@ -81,7 +81,7 @@ namespace namedinference { const Tensor& propagate_names_if_present_and_nonempty( const Tensor& result, - c10::optional maybe_names, + std::optional maybe_names, bool validate_names = false); // Propagates `names` to `result` if `names` is not empty. // `names` can be empty; see [NOTE] Writing name inference rules diff --git a/aten/src/ATen/NestedTensorImpl.cpp b/aten/src/ATen/NestedTensorImpl.cpp index 2f73b7b304ee3..534e4e71e657f 100644 --- a/aten/src/ATen/NestedTensorImpl.cpp +++ b/aten/src/ATen/NestedTensorImpl.cpp @@ -236,7 +236,7 @@ NestedTensorImpl::NestedTensorImpl( set_custom_sizes_strides(c10::TensorImpl::SizesStridesPolicy::CustomSizes); } -c10::optional NestedTensorImpl::opt_size(int64_t d) const { +std::optional NestedTensorImpl::opt_size(int64_t d) const { if (C10_UNLIKELY(!opt_sizes_.has_value())) { // Cache the metadata to avoid recomputing it each time. opt_sizes_ = c10::make_optional(construct_opt_sizes(nested_sizes_)); diff --git a/aten/src/ATen/NestedTensorImpl.h b/aten/src/ATen/NestedTensorImpl.h index 0bd3d98e73c5c..697969edbbd44 100644 --- a/aten/src/ATen/NestedTensorImpl.h +++ b/aten/src/ATen/NestedTensorImpl.h @@ -61,10 +61,10 @@ struct TORCH_API NestedTensorImpl : public c10::TensorImpl { // Returns nullopt if the ith dimension is irregular. The ith dimension // of a NestedTensor is regular if the unbound tensors match in // size at the (i-1)th dimension. - c10::optional opt_size(int64_t d) const; + std::optional opt_size(int64_t d) const; int64_t size(int64_t d) const { - c10::optional optional_size = this->opt_size(d); + std::optional optional_size = this->opt_size(d); TORCH_CHECK( optional_size.has_value(), "Given dimension ", @@ -171,7 +171,7 @@ struct TORCH_API NestedTensorImpl : public c10::TensorImpl { // Optional to allow it to be computed lazily from nested. 
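NestedTensorImpl::opt_size returns std::optional<int64_t>: a concrete size when every nested constituent agrees on that dimension, nullopt when the dimension is ragged, with the answers cached lazily. A simplified standalone sketch of the regular-vs-irregular check, omitting the real caching and the (i-1)-dim indexing details:

```cpp
#include <cstdint>
#include <optional>
#include <vector>

// Simplified illustration of the opt_size idea: a per-dimension size is only
// defined when all nested constituents agree on it; otherwise return nullopt.
struct RaggedSizes {
  std::vector<std::vector<int64_t>> per_tensor_sizes;  // one entry per nested tensor

  std::optional<int64_t> opt_size(size_t d) const {
    if (per_tensor_sizes.empty() || d >= per_tensor_sizes.front().size()) {
      return std::nullopt;
    }
    int64_t candidate = per_tensor_sizes.front()[d];
    for (const auto& sizes : per_tensor_sizes) {
      if (d >= sizes.size() || sizes[d] != candidate) {
        return std::nullopt;  // irregular along this dimension
      }
    }
    return candidate;
  }
};
```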
// TODO: maybe we can remove this metadata since // we can compute it from `nested_sizes_` - mutable c10::optional> opt_sizes_; + mutable std::optional> opt_sizes_; template c10::intrusive_ptr shallow_copy_and_detach_core( diff --git a/aten/src/ATen/SavedTensorHooks.cpp b/aten/src/ATen/SavedTensorHooks.cpp index c1c963409f40e..f2fb0642eb34c 100644 --- a/aten/src/ATen/SavedTensorHooks.cpp +++ b/aten/src/ATen/SavedTensorHooks.cpp @@ -35,7 +35,7 @@ void SavedTensorDefaultHooks::enable() { tls.disabled_error_message = c10::nullopt; } -const c10::optional& SavedTensorDefaultHooks::get_disabled_error_message() { +const std::optional& SavedTensorDefaultHooks::get_disabled_error_message() { return tls.disabled_error_message; } diff --git a/aten/src/ATen/SavedTensorHooks.h b/aten/src/ATen/SavedTensorHooks.h index af821cb908c6a..6ad46a8334c3f 100644 --- a/aten/src/ATen/SavedTensorHooks.h +++ b/aten/src/ATen/SavedTensorHooks.h @@ -21,7 +21,7 @@ struct TORCH_API SavedTensorDefaultHooksTLS { // disabled_error_message is nullopt IFF Saved Tensor hooks is enabled // We did this for efficiency (so we didn't have to keep a separate bool // around) - c10::optional disabled_error_message; + std::optional disabled_error_message; }; } // namespace impl @@ -46,7 +46,7 @@ struct TORCH_API SavedTensorDefaultHooks { static void disable(const std::string& error_message); static void enable(); static bool is_enabled(); - static const c10::optional& get_disabled_error_message(); + static const std::optional& get_disabled_error_message(); }; } // namespace at diff --git a/aten/src/ATen/ScalarOps.cpp b/aten/src/ATen/ScalarOps.cpp index 13a1754fa53a1..f931af0ad445e 100644 --- a/aten/src/ATen/ScalarOps.cpp +++ b/aten/src/ATen/ScalarOps.cpp @@ -23,7 +23,7 @@ Tensor& scalar_fill(Tensor& self, const Scalar& value) { return self; } -Tensor scalar_tensor_static(const Scalar& s, c10::optional dtype_opt, c10::optional device_opt) { +Tensor scalar_tensor_static(const Scalar& s, std::optional dtype_opt, c10::optional device_opt) { at::tracer::impl::NoTracerDispatchMode tracer_guard; at::AutoDispatchBelowAutograd mode; Tensor result = at::detail::empty_cpu( diff --git a/aten/src/ATen/ScalarOps.h b/aten/src/ATen/ScalarOps.h index 943ac161d4c18..ed591955dd876 100644 --- a/aten/src/ATen/ScalarOps.h +++ b/aten/src/ATen/ScalarOps.h @@ -18,8 +18,8 @@ namespace at::detail { Tensor& scalar_fill(Tensor& self, const Scalar& value); TORCH_API Tensor scalar_tensor_static( const Scalar& s, - c10::optional dtype_opt, - c10::optional device_opt); + std::optional dtype_opt, + std::optional device_opt); } // namespace at::detail // This is in the c10 namespace because we use ADL to find the functions in it. 
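The SavedTensorDefaultHooks TLS keeps the enabled/disabled state and the error message in a single std::optional<std::string> (nullopt iff enabled), avoiding a separate bool, as the comment in the diff notes. A small standalone sketch of that idiom (FeatureGate is an illustrative name, not a PyTorch class):

```cpp
#include <optional>
#include <stdexcept>
#include <string>
#include <utility>

// One optional doubles as the enabled/disabled flag and the error payload:
// empty means enabled, a stored message means disabled with that reason.
class FeatureGate {
 public:
  void disable(std::string message) { disabled_message_ = std::move(message); }
  void enable() { disabled_message_ = std::nullopt; }
  bool is_enabled() const { return !disabled_message_.has_value(); }
  void check() const {
    if (disabled_message_.has_value()) {
      throw std::runtime_error(*disabled_message_);
    }
  }

 private:
  std::optional<std::string> disabled_message_;
};
```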
diff --git a/aten/src/ATen/TensorIndexing.h b/aten/src/ATen/TensorIndexing.h index eb29b4d5ad739..b2ef33ffc058d 100644 --- a/aten/src/ATen/TensorIndexing.h +++ b/aten/src/ATen/TensorIndexing.h @@ -39,9 +39,9 @@ TORCH_API extern const EllipsisIndexType Ellipsis; struct TORCH_API Slice final { public: Slice( - c10::optional start_index = c10::nullopt, - c10::optional stop_index = c10::nullopt, - c10::optional step_index = c10::nullopt) { + std::optional start_index = c10::nullopt, + std::optional stop_index = c10::nullopt, + std::optional step_index = c10::nullopt) { if (!step_index.has_value()) { step_ = c10::SymInt(1); } else { @@ -205,7 +205,7 @@ static inline Tensor applySlice( c10::SymInt step, bool disable_slice_optimization, const at::Device& self_device, - const c10::optional& self_sizes) { + const std::optional& self_sizes) { // TODO: implement negative step TORCH_CHECK_VALUE(step > 0, "step must be greater than zero"); @@ -233,7 +233,7 @@ static inline Tensor applySelect( SymInt index, int64_t real_dim, const at::Device& /*self_device*/, - const c10::optional& self_sizes) { + const std::optional& self_sizes) { // See NOTE [nested tensor size for indexing] if (self_sizes.has_value()) { auto maybe_index = index.maybe_as_int(); @@ -431,7 +431,7 @@ static inline Tensor handleDimInMultiDimIndexing( std::vector& outIndices, bool disable_slice_optimization, const at::Device& original_tensor_device, - const c10::optional& prev_dim_result_sizes) { + const std::optional& prev_dim_result_sizes) { if (index.is_integer()) { return impl::applySelect( prev_dim_result, @@ -515,7 +515,7 @@ static inline Tensor applySlicing( std::vector& outIndices, bool disable_slice_optimization, const at::Device& self_device, - const c10::optional& self_sizes) { + const std::optional& self_sizes) { int64_t dim = 0; int64_t specified_dims = impl::count_specified_dimensions(indices); @@ -531,9 +531,9 @@ static inline Tensor applySlicing( for (const auto i : c10::irange(indices.size())) { auto& obj = indices[i]; // See NOTE [nested tensor size for indexing] - c10::optional result_sizes = result.is_nested() - ? c10::optional(c10::nullopt) - : c10::optional(result.sym_sizes()); + std::optional result_sizes = result.is_nested() + ? std::optional(c10::nullopt) + : std::optional(result.sym_sizes()); result = handleDimInMultiDimIndexing( /*prev_dim_result=*/result, /*original_tensor=*/self, @@ -607,9 +607,9 @@ static inline Tensor get_item( // nested tensor does not have a size (yet) so for now we represent its size // as null may need to be changed after we reach a better solution for nested // tensor size - c10::optional self_sizes = self.is_nested() - ? c10::optional(c10::nullopt) - : c10::optional(self.sym_sizes()); + std::optional self_sizes = self.is_nested() + ? std::optional(c10::nullopt) + : std::optional(self.sym_sizes()); // handle simple types: integers, slices, none, ellipsis, bool if (indices.size() == 1) { diff --git a/aten/src/ATen/TensorIterator.h b/aten/src/ATen/TensorIterator.h index a241244a5744c..fb61ca65146a3 100644 --- a/aten/src/ATen/TensorIterator.h +++ b/aten/src/ATen/TensorIterator.h @@ -147,7 +147,7 @@ struct TORCH_API OperandInfo { /// promotion target_dtype value can become different from tensor's dtype /// also, during type promotion target_dtype and device can be set for an /// undefined tensor so that tensor can be properly constructed later. 
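TensorIndexing's Slice constructor takes optional start/stop/step arguments and resolves them to concrete values up front (the diff shows the step branch defaulting to 1). A simplified standalone version using plain int64_t instead of c10::SymInt, and ignoring the negative-step handling of the real class:

```cpp
#include <cstdint>
#include <limits>
#include <optional>

// Simplified take on Slice: optional indices are resolved to concrete defaults
// at construction time (step -> 1, start -> 0, stop -> INT64_MAX), so the rest
// of the indexing code never re-checks has_value().
struct SimpleSlice {
  int64_t start;
  int64_t stop;
  int64_t step;

  explicit SimpleSlice(
      std::optional<int64_t> start_index = std::nullopt,
      std::optional<int64_t> stop_index = std::nullopt,
      std::optional<int64_t> step_index = std::nullopt)
      : start(start_index.value_or(0)),
        stop(stop_index.value_or(std::numeric_limits<int64_t>::max())),
        step(step_index.value_or(1)) {}
};
```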
- c10::optional device = c10::nullopt; + std::optional device = c10::nullopt; ScalarType target_dtype = ScalarType::Undefined; // Caches dtype of the tensor, because scalar_type is an expensive operation // If dtype of the tensor is changed (e.g. as a result of type promotion or in @@ -971,9 +971,9 @@ class TORCH_API TensorIteratorConfig final { int num_outputs_ = 0; int num_inputs_ = 0; - c10::optional static_shape_ = c10::nullopt; - c10::optional static_dtype_ = c10::nullopt; - c10::optional static_device_ = c10::nullopt; + std::optional static_shape_ = c10::nullopt; + std::optional static_dtype_ = c10::nullopt; + std::optional static_device_ = c10::nullopt; bool check_mem_overlap_ = true; bool allow_cpu_scalars_ = false; bool is_reduction_ = false; diff --git a/aten/src/ATen/TensorSubclassLikeUtils.h b/aten/src/ATen/TensorSubclassLikeUtils.h index a9a0b4ecdcf8b..10c26dfe35eca 100644 --- a/aten/src/ATen/TensorSubclassLikeUtils.h +++ b/aten/src/ATen/TensorSubclassLikeUtils.h @@ -61,7 +61,7 @@ inline bool areAnyTensorSubclassLike(TensorList tensors) { } inline bool areAnyOptionalTensorSubclassLike( - const c10::List>& tensors) { + const c10::List>& tensors) { if (c10::impl::dispatch_mode_enabled()) return true; return std::any_of( diff --git a/aten/src/ATen/TensorUtils.cpp b/aten/src/ATen/TensorUtils.cpp index e425a0a8ed130..14e81d6504179 100644 --- a/aten/src/ATen/TensorUtils.cpp +++ b/aten/src/ATen/TensorUtils.cpp @@ -327,7 +327,7 @@ std::vector defaultStrides(IntArrayRef sizes) { // see overloads of computeStride() below. // template -inline c10::optional computeStride_impl( +inline std::optional computeStride_impl( const NewShapeVec& oldshape, const NewShapeVec& oldstride, const NewShapeVec& newshape, @@ -395,7 +395,7 @@ inline c10::optional computeStride_impl( return newstride; } -c10::optional> computeStride( +std::optional> computeStride( IntArrayRef oldshape, IntArrayRef oldstride, IntArrayRef newshape) { @@ -403,7 +403,7 @@ c10::optional> computeStride( return computeStride_impl, IntArrayRef, int64_t>(oldshape, oldstride, newshape, toResult); } -c10::optional computeStride( +std::optional computeStride( c10::SymIntArrayRef oldshape, c10::SymIntArrayRef oldstride, c10::SymIntArrayRef newshape) { @@ -411,7 +411,7 @@ c10::optional computeStride( return computeStride_impl(oldshape, oldstride, newshape, toResult); } -c10::optional computeStride( +std::optional computeStride( IntArrayRef oldshape, IntArrayRef oldstride, const DimVector& newshape) { diff --git a/aten/src/ATen/TensorUtils.h b/aten/src/ATen/TensorUtils.h index 4615ab50606ee..4a81dc280e242 100644 --- a/aten/src/ATen/TensorUtils.h +++ b/aten/src/ATen/TensorUtils.h @@ -171,17 +171,17 @@ TORCH_API void check_dim_size( namespace detail { TORCH_API std::vector defaultStrides(IntArrayRef sizes); -TORCH_API c10::optional> computeStride( +TORCH_API std::optional> computeStride( IntArrayRef oldshape, IntArrayRef oldstride, IntArrayRef newshape); -TORCH_API c10::optional computeStride( +TORCH_API std::optional computeStride( c10::SymIntArrayRef oldshape, c10::SymIntArrayRef oldstride, c10::SymIntArrayRef newshape); -TORCH_API c10::optional computeStride( +TORCH_API std::optional computeStride( IntArrayRef oldshape, IntArrayRef oldstride, const DimVector& newshape); diff --git a/aten/src/ATen/VmapModeRegistrations.cpp b/aten/src/ATen/VmapModeRegistrations.cpp index ab4556c8c4155..3b6198778a353 100644 --- a/aten/src/ATen/VmapModeRegistrations.cpp +++ b/aten/src/ATen/VmapModeRegistrations.cpp @@ -39,7 +39,7 @@ TORCH_LIBRARY_IMPL(aten, 
VmapMode, m) { // CppFunction::makeNamedNotSupported() to avoid listing out the types of everything. // However, registering e.g. CppFunction::makeNamedNotSupported() as an implementation // only works for operators that support boxing. -#define TENSOROPTIONS c10::optional, c10::optional, c10::optional, c10::optional +#define TENSOROPTIONS std::optional, c10::optional, c10::optional, c10::optional // random operations (out-of-place) m.impl("bernoulli", unsupportedRandomOp>); diff --git a/aten/src/ATen/ZeroTensorFallback.cpp b/aten/src/ATen/ZeroTensorFallback.cpp index bc012f8cde909..329216cf3789f 100644 --- a/aten/src/ATen/ZeroTensorFallback.cpp +++ b/aten/src/ATen/ZeroTensorFallback.cpp @@ -16,7 +16,7 @@ namespace at { const auto num_arguments = arguments.size(); const auto stack_start = stack->size() - num_arguments; - c10::optional is_write; + std::optional is_write; for (const auto i : c10::irange(num_arguments)) { const auto& alias_info = arguments[i].alias_info(); if (alias_info != nullptr) { diff --git a/aten/src/ATen/autocast_mode.cpp b/aten/src/ATen/autocast_mode.cpp index c233f17b44580..2d01bdeca500b 100644 --- a/aten/src/ATen/autocast_mode.cpp +++ b/aten/src/ATen/autocast_mode.cpp @@ -144,7 +144,7 @@ Tensor cached_cast(at::ScalarType to_type, const Tensor& arg, DeviceType device_ Banned functions *******************************/ -static Tensor binary_cross_entropy_banned(const Tensor &, const Tensor &, const c10::optional&, int64_t) { +static Tensor binary_cross_entropy_banned(const Tensor &, const Tensor &, const std::optional&, int64_t) { AT_ERROR("torch.nn.functional.binary_cross_entropy and torch.nn.BCELoss are unsafe to autocast.\n" "Many models use a sigmoid layer right before the binary cross entropy layer.\n" "In this case, combine the two layers using torch.nn.functional.binary_cross_entropy_with_logits\n" diff --git a/aten/src/ATen/autocast_mode.h b/aten/src/ATen/autocast_mode.h index 59a91848a5175..c36030db5b048 100644 --- a/aten/src/ATen/autocast_mode.h +++ b/aten/src/ATen/autocast_mode.h @@ -297,9 +297,9 @@ TORCH_API Tensor cached_cast( c10::DeviceType device_type = c10::DeviceType::CUDA); // Overload to process optional -inline c10::optional cached_cast( +inline std::optional cached_cast( at::ScalarType to_type, - const c10::optional& arg, + const std::optional& arg, c10::DeviceType device_type = c10::DeviceType::CUDA) { if (arg.has_value()) { return cached_cast(to_type, *arg, device_type); @@ -353,9 +353,9 @@ Otherwise, set it to the autocast type. ********************************************************/ // Overload to catch dtype flags -c10::optional inline set_opt_dtype( +std::optional inline set_opt_dtype( at::ScalarType to_type, - const c10::optional& dtype) { + const std::optional& dtype) { return dtype.has_value() ? dtype : to_type; } @@ -392,7 +392,7 @@ enum class CastPolicy : uint8_t { fp32, // Cast all inputs to at::kFloat before running the op. fp32_set_opt_dtype, // Treats functions (like softmax) that // 1. we'd like to run in fp32 and - // 2. have a c10::optional arg that controls + // 2. have a std::optional arg that controls // the output type. // fp32_set_opt_dtype wrappers' policy is: if the output // type is already set, don't touch it, otherwise, set @@ -865,24 +865,24 @@ copy pasted in from VariableTypeEverything.cpp with appropriate substitutions. 
_(ADD_NS(norm), \ "norm.Scalar", \ Tensor(const Tensor&, const Scalar&), \ - Tensor(const Tensor&, const c10::optional&, ScalarType), \ + Tensor(const Tensor&, const std::optional&, ScalarType), \ fp32_append_dtype) \ _(ADD_NS(norm), \ "norm.ScalarOpt_dim", \ - Tensor(const Tensor&, const c10::optional&, IntArrayRef, bool), \ + Tensor(const Tensor&, const std::optional&, IntArrayRef, bool), \ Tensor( \ const Tensor&, \ - const c10::optional&, \ + const std::optional&, \ IntArrayRef, \ bool, \ ScalarType), \ fp32_append_dtype) \ _(ADD_NS(norm), \ "norm.names_ScalarOpt_dim", \ - Tensor(const Tensor&, const c10::optional&, DimnameList, bool), \ + Tensor(const Tensor&, const std::optional&, DimnameList, bool), \ Tensor( \ const Tensor&, \ - const c10::optional&, \ + const std::optional&, \ DimnameList, \ bool, \ ScalarType), \ @@ -895,6 +895,7 @@ copy pasted in from VariableTypeEverything.cpp with appropriate substitutions. _(bilinear) \ _(cross) \ _(dot) \ + _(vdot) \ _(grid_sampler) \ _(index_put) \ _(tensordot) \ diff --git a/aten/src/ATen/core/CachingHostAllocator.h b/aten/src/ATen/core/CachingHostAllocator.h index d04cb1c6b8a70..449f8d743157b 100644 --- a/aten/src/ATen/core/CachingHostAllocator.h +++ b/aten/src/ATen/core/CachingHostAllocator.h @@ -152,7 +152,7 @@ struct CachingHostAllocatorImpl { // do not need to look up the ctx in blocks_. auto* block = reinterpret_cast(ctx); - c10::optional> events; + std::optional> events; { std::lock_guard g(block->mutex_); block->allocated_ = false; @@ -263,7 +263,7 @@ struct CachingHostAllocatorImpl { // Avoid calling cudaEventDestroy while holding a mutex, so move // intermediate events out of the lock into this object. // process the last event - c10::optional> processed; + std::optional> processed; { std::lock_guard g(events_mutex_); if (!events_.empty()) { @@ -324,7 +324,7 @@ struct CachingHostAllocatorImpl { } // Record an event on stream and store event into events. 
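The two std::optional<std::vector<...>> locals changed above exist so that pending events can be moved out from under the allocator's mutex and then processed or destroyed without holding the lock. A generic sketch of that idiom, using only standard types (take_pending is an illustrative helper, not the ATen API):

#include <mutex>
#include <optional>
#include <vector>

template <typename Event>
std::optional<std::vector<Event>> take_pending(std::mutex& m,
                                               std::vector<Event>& pending) {
  std::lock_guard<std::mutex> g(m);
  if (pending.empty()) {
    return std::nullopt;
  }
  std::optional<std::vector<Event>> out(std::in_place);
  out->swap(pending);  // contents are now handled outside the lock
  return out;
}

The virtual record_stream hook declared next follows the same convention: it receives the optional event list by reference and may populate it.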
- virtual void record_stream(c10::optional>& events, S stream) { + virtual void record_stream(std::optional>& events, S stream) { TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for record_stream"); } diff --git a/aten/src/ATen/core/CheckMemoryFormat.h b/aten/src/ATen/core/CheckMemoryFormat.h index 442889e2eec6f..8add9509f4d5f 100644 --- a/aten/src/ATen/core/CheckMemoryFormat.h +++ b/aten/src/ATen/core/CheckMemoryFormat.h @@ -2,10 +2,10 @@ namespace c10::impl { -inline c10::optional +inline std::optional check_tensor_options_and_extract_memory_format( const TensorOptions& options, - c10::optional memory_format) { + std::optional memory_format) { TORCH_CHECK( options.requires_grad_opt() == c10::nullopt || options.requires_grad_opt().value() == false, diff --git a/aten/src/ATen/core/DeprecatedTypeProperties.cpp b/aten/src/ATen/core/DeprecatedTypeProperties.cpp index 15231f965aefd..a97a6828571e7 100644 --- a/aten/src/ATen/core/DeprecatedTypeProperties.cpp +++ b/aten/src/ATen/core/DeprecatedTypeProperties.cpp @@ -14,7 +14,7 @@ Storage DeprecatedTypeProperties::unsafeStorageFromTH(void * th_pointer, bool re return at::unsafeStorageFromTH(th_pointer, retain); } -Tensor DeprecatedTypeProperties::copy(const Tensor & src, bool non_blocking, c10::optional to_device) const { +Tensor DeprecatedTypeProperties::copy(const Tensor & src, bool non_blocking, std::optional to_device) const { if (to_device) { return src.to(src.options().dtype(scalarType()).device(to_device), non_blocking, /*copy=*/true); } diff --git a/aten/src/ATen/core/DeprecatedTypeProperties.h b/aten/src/ATen/core/DeprecatedTypeProperties.h index 222465eac56f2..a945761e8ff97 100644 --- a/aten/src/ATen/core/DeprecatedTypeProperties.h +++ b/aten/src/ATen/core/DeprecatedTypeProperties.h @@ -107,7 +107,7 @@ class TORCH_API DeprecatedTypeProperties { /// Constructs the `TensorOptions` from a type and a Device. Asserts that /// the device type matches the device type of the type. 
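The overload declared below takes the device as std::optional<Device> and does little more than the usual has_value()/dereference dance before delegating. A reduced sketch of that dispatch (resolve_device_index is an illustrative name, not the real member):

#include <optional>
#include <c10/core/Device.h>
#include <c10/util/Exception.h>

int16_t resolve_device_index(std::optional<c10::Device> device_opt,
                             c10::DeviceType expected_type) {
  if (!device_opt.has_value()) {
    return -1;  // sentinel for "current device", matching the integer overload
  }
  TORCH_CHECK(device_opt->type() == expected_type, "device type mismatch");
  return device_opt->index();
}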
- TensorOptions options(c10::optional device_opt) const { + TensorOptions options(std::optional device_opt) const { if (!device_opt.has_value()) { return options(-1); } else { @@ -129,7 +129,7 @@ class TORCH_API DeprecatedTypeProperties { Tensor unsafeTensorFromTH(void * th_pointer, bool retain) const; Storage unsafeStorageFromTH(void * th_pointer, bool retain) const; - Tensor copy(const Tensor & src, bool non_blocking=false, c10::optional to_device={}) const; + Tensor copy(const Tensor & src, bool non_blocking=false, std::optional to_device={}) const; private: Backend backend_; diff --git a/aten/src/ATen/core/Dimname.h b/aten/src/ATen/core/Dimname.h index e53db14732c89..d3bc5a45abb7a 100644 --- a/aten/src/ATen/core/Dimname.h +++ b/aten/src/ATen/core/Dimname.h @@ -21,7 +21,7 @@ struct TORCH_API Dimname { bool isWildcard() const { return type_ == NameType::WILDCARD; } bool matches(Dimname other) const; - c10::optional unify(Dimname other) const; + std::optional unify(Dimname other) const; private: Dimname(Symbol name) diff --git a/aten/src/ATen/core/DistributionsHelper.h b/aten/src/ATen/core/DistributionsHelper.h index 8b399510e94aa..a46608200e5b9 100644 --- a/aten/src/ATen/core/DistributionsHelper.h +++ b/aten/src/ATen/core/DistributionsHelper.h @@ -144,7 +144,7 @@ template next_##TYPE##_normal_sample()) { \ *ret = *(generator->next_##TYPE##_normal_sample()); \ - generator->set_next_##TYPE##_normal_sample(c10::optional()); \ + generator->set_next_##TYPE##_normal_sample(std::optional()); \ return true; \ } \ return false; \ diff --git a/aten/src/ATen/core/Generator.h b/aten/src/ATen/core/Generator.h index b237c571b22d3..6b76db5d06864 100644 --- a/aten/src/ATen/core/Generator.h +++ b/aten/src/ATen/core/Generator.h @@ -150,7 +150,7 @@ Generator make_generator(Args&&... args) { * the backend generator type (CPU/CUDAGeneratorImpl etc.) */ template -static inline T * check_generator(c10::optional gen) { +static inline T * check_generator(std::optional gen) { TORCH_CHECK(gen.has_value(), "Expected Generator but received nullopt"); TORCH_CHECK(gen->defined(), "Generator with undefined implementation is not allowed"); TORCH_CHECK(T::device_type() == gen->device().type(), "Expected a '", T::device_type(), "' device type for generator but found '", gen->device().type(), "'"); @@ -164,7 +164,7 @@ static inline T * check_generator(c10::optional gen) { * the backend generator type (CPU/CUDAGeneratorImpl etc.) */ template -static inline T* get_generator_or_default(const c10::optional& gen, const Generator& default_gen) { +static inline T* get_generator_or_default(const std::optional& gen, const Generator& default_gen) { return gen.has_value() && gen->defined() ? 
check_generator(gen) : check_generator(default_gen); } diff --git a/aten/src/ATen/core/GeneratorForPrivateuseone.cpp b/aten/src/ATen/core/GeneratorForPrivateuseone.cpp index 1e8d8daa9fc8f..35b1dd9fdd4eb 100644 --- a/aten/src/ATen/core/GeneratorForPrivateuseone.cpp +++ b/aten/src/ATen/core/GeneratorForPrivateuseone.cpp @@ -5,8 +5,8 @@ namespace at { static std::mutex _generator_mutex_lock; -c10::optional& GetGeneratorPrivate() { - static c10::optional generator_privateuse1 = c10::nullopt; +std::optional& GetGeneratorPrivate() { + static std::optional generator_privateuse1 = c10::nullopt; return generator_privateuse1; } diff --git a/aten/src/ATen/core/GeneratorForPrivateuseone.h b/aten/src/ATen/core/GeneratorForPrivateuseone.h index 9b84f162a7652..747c77897ff9b 100644 --- a/aten/src/ATen/core/GeneratorForPrivateuseone.h +++ b/aten/src/ATen/core/GeneratorForPrivateuseone.h @@ -7,7 +7,7 @@ namespace at { using GeneratorFuncType = std::function; -c10::optional& GetGeneratorPrivate(); +std::optional& GetGeneratorPrivate(); class TORCH_API _GeneratorRegister { public: diff --git a/aten/src/ATen/core/List.h b/aten/src/ATen/core/List.h index 68ecf5ed343f8..53560b9666ae3 100644 --- a/aten/src/ATen/core/List.h +++ b/aten/src/ATen/core/List.h @@ -58,10 +58,10 @@ struct ListElementConstReferenceTraits { using const_reference = typename c10::detail::ivalue_to_const_ref_overload_return::type; }; -// There is no to() overload for c10::optional. +// There is no to() overload for std::optional. template<> -struct ListElementConstReferenceTraits> { - using const_reference = c10::optional>; +struct ListElementConstReferenceTraits> { + using const_reference = std::optional>; }; template diff --git a/aten/src/ATen/core/List_inl.h b/aten/src/ATen/core/List_inl.h index f8ce73eb3f9cc..64760b5f782b4 100644 --- a/aten/src/ATen/core/List_inl.h +++ b/aten/src/ATen/core/List_inl.h @@ -168,8 +168,8 @@ list_element_to_const_ref(const IValue& element) { } template<> -inline typename ListElementConstReferenceTraits>::const_reference -list_element_to_const_ref>(const IValue& element) { +inline typename ListElementConstReferenceTraits>::const_reference +list_element_to_const_ref>(const IValue& element) { return element.toOptionalStringRef(); } diff --git a/aten/src/ATen/core/List_test.cpp b/aten/src/ATen/core/List_test.cpp index 56da3cf299e90..808cbe2d8b63a 100644 --- a/aten/src/ATen/core/List_test.cpp +++ b/aten/src/ATen/core/List_test.cpp @@ -1127,13 +1127,13 @@ TEST(ListTest, canAccessStringByReference) { } TEST(ListTest, canAccessOptionalStringByReference) { - List> list({"one", "two", c10::nullopt}); + List> list({"one", "two", c10::nullopt}); const auto& listRef = list; static_assert( - std::is_same_v>>, - "List> access should be by const reference"); - c10::optional str1 = list[1]; - c10::optional str2 = list[2]; + std::is_same_v>>, + "List> access should be by const reference"); + std::optional str1 = list[1]; + std::optional str2 = list[2]; decltype(auto) strRef1 = listRef[1]; decltype(auto) strRef2 = listRef[2]; // NOLINTNEXTLINE(bugprone-unchecked-optional-access) diff --git a/aten/src/ATen/core/NamedTensor.h b/aten/src/ATen/core/NamedTensor.h index d6ff30ce00838..7eed27e4f1a61 100644 --- a/aten/src/ATen/core/NamedTensor.h +++ b/aten/src/ATen/core/NamedTensor.h @@ -100,7 +100,7 @@ void check_names_valid_for(const TensorBase& tensor, DimnameList names); void check_names_valid_for(size_t tensor_dim, DimnameList names); // Sets the names of `tensor` to be `names`. 
-TORCH_API const TensorBase& internal_set_names_inplace(const TensorBase& tensor, c10::optional names); +TORCH_API const TensorBase& internal_set_names_inplace(const TensorBase& tensor, std::optional names); TORCH_API const TensorBase& internal_set_names_inplace(const TensorBase& tensor, std::vector&& names, bool validate_names); constexpr size_t kMaxNamedTensorDim = 64; @@ -111,7 +111,7 @@ namespace impl { // Some helper functions on TensorImpl. Useful for working with names in TH. // XXX: Ideally these would exist as methods on TensorImpl -TORCH_API void internal_set_names_inplace(TensorImpl* impl, c10::optional names, bool validate_names); +TORCH_API void internal_set_names_inplace(TensorImpl* impl, std::optional names, bool validate_names); TORCH_API void internal_set_names_inplace(TensorImpl* impl, std::vector&& names, bool validate_names); void check_names_valid_for(TensorImpl* impl, DimnameList names); @@ -132,7 +132,7 @@ TORCH_API DimnameList get_names(const TensorImpl* impl); // Returns the names of the tensor if they have been allocated; returns nullopt // instead if the haven't been. The names of a tensor are not allocated if a // tensor is constructed with names=None. -TORCH_API c10::optional get_opt_names(const TensorImpl* impl); +TORCH_API std::optional get_opt_names(const TensorImpl* impl); } // namespace impl diff --git a/aten/src/ATen/core/NestedIntSymNodeImpl.cpp b/aten/src/ATen/core/NestedIntSymNodeImpl.cpp index b703f76773b46..7cdc7aa2cbe8f 100644 --- a/aten/src/ATen/core/NestedIntSymNodeImpl.cpp +++ b/aten/src/ATen/core/NestedIntSymNodeImpl.cpp @@ -7,7 +7,7 @@ namespace c10 { namespace { bool _eq(const char* op, c10::SymNodeImpl* lhs, c10::SymNodeImpl* rhs) { TORCH_INTERNAL_ASSERT(lhs->is_nested_int()); - c10::optional c = rhs->nested_int(); + std::optional c = rhs->nested_int(); return ( c.has_value() && lhs->nested_int() == *c && lhs->nested_int_coeff() == rhs->nested_int_coeff()); @@ -68,7 +68,7 @@ c10::SymNode NestedIntSymNodeImpl::le(const c10::SymNode& other) { c10::SymNode NestedIntSymNodeImpl::mul(const c10::SymNode& other) { TORCH_CHECK(!other->nested_int(), "nested int cannot be multiplied by nested int"); - c10::optional c = other->constant_int(); + std::optional c = other->constant_int(); TORCH_CHECK(c.has_value()); return SymNode(c10::make_intrusive(val_, coeff_ * *c)); } diff --git a/aten/src/ATen/core/NestedIntSymNodeImpl.h b/aten/src/ATen/core/NestedIntSymNodeImpl.h index 228f4310a38fc..786464c4c3ea8 100644 --- a/aten/src/ATen/core/NestedIntSymNodeImpl.h +++ b/aten/src/ATen/core/NestedIntSymNodeImpl.h @@ -134,11 +134,11 @@ class TORCH_API NestedIntSymNodeImpl : public SymNodeImpl { c10::SymNode le(const c10::SymNode& other) override; c10::SymNode mul(const c10::SymNode& other) override; - c10::optional nested_int() override { + std::optional nested_int() override { return val_; } - c10::optional nested_int_coeff() override { + std::optional nested_int_coeff() override { return coeff_; } diff --git a/aten/src/ATen/core/PythonFallbackKernel.cpp b/aten/src/ATen/core/PythonFallbackKernel.cpp index a34341b4a9437..caef951ed1268 100644 --- a/aten/src/ATen/core/PythonFallbackKernel.cpp +++ b/aten/src/ATen/core/PythonFallbackKernel.cpp @@ -14,7 +14,7 @@ namespace { // To achieve this, we ensure that the tls is empty by default and emptied again both when // we call into user torch_dispatch or returning back to python after this call. 
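The thread_local declared below is the whole mechanism: the ambient dispatch-key TLS is stashed into the optional on entry and reset to std::nullopt before control returns to Python. A stripped-down sketch of that stash/clear idiom (StashGuard is an illustrative name; the guard actually used in this file is not reproduced here):

#include <optional>
#include <c10/core/impl/LocalDispatchKeySet.h>

thread_local std::optional<c10::impl::LocalDispatchKeySet> saved_tls;

struct StashGuard {
  StashGuard() {
    // stash the caller's TLS so the Python handler starts from a clean slate
    saved_tls = c10::impl::tls_local_dispatch_key_set();
  }
  ~StashGuard() {
    // emptied again when control is handed back to Python
    saved_tls = std::nullopt;
  }
};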
-thread_local c10::optional tls_on_entry; +thread_local std::optional tls_on_entry; c10::impl::LocalDispatchKeySet safe_get_tls_on_entry() { TORCH_CHECK(tls_on_entry.has_value(), "Accessing torch dispatch state outside of '__torch_dispatch__' " diff --git a/aten/src/ATen/core/Tensor.cpp b/aten/src/ATen/core/Tensor.cpp index ed19144d0eaff..2ddd9b4e65bac 100644 --- a/aten/src/ATen/core/Tensor.cpp +++ b/aten/src/ATen/core/Tensor.cpp @@ -42,7 +42,7 @@ TensorBase TensorBase::to( at::TensorOptions options, bool non_blocking, bool copy, - c10::optional memory_format) const { + std::optional memory_format) const { Tensor self(*this); return at::_ops::to_dtype_layout::call( self, optTypeMetaToScalarType(options.dtype_opt()), @@ -134,8 +134,8 @@ bool TensorBase::retains_grad() const { } void Tensor::_backward(TensorList inputs, - const c10::optional& gradient, - c10::optional keep_graph, + const std::optional& gradient, + std::optional keep_graph, bool create_graph) const { return impl::GetVariableHooks()->_backward(*this, inputs, gradient, keep_graph, create_graph); } diff --git a/aten/src/ATen/core/TensorBase.h b/aten/src/ATen/core/TensorBase.h index e03c6bdf2bd10..87d5937cf9ebc 100644 --- a/aten/src/ATen/core/TensorBase.h +++ b/aten/src/ATen/core/TensorBase.h @@ -147,7 +147,7 @@ class TORCH_API TensorBase { const TensorBase& fill_(const c10::Scalar& scalar) const; const TensorBase& zero_() const; - TensorBase to(at::TensorOptions options={}, bool non_blocking=false, bool copy=false, c10::optional memory_format=c10::nullopt) const; + TensorBase to(at::TensorOptions options={}, bool non_blocking=false, bool copy=false, std::optional memory_format=c10::nullopt) const; bool is_complex() const { return at::isComplexType(this->scalar_type()); @@ -249,7 +249,7 @@ class TORCH_API TensorBase { return impl_->strides(); } // See impl::get_opt_names in ATen/NamedTensor.h for docs. - c10::optional opt_names() const { + std::optional opt_names() const { return impl::get_opt_names(unsafeGetTensorImpl()); } // See impl::get_names in ATen/NamedTensor.h for docs. @@ -712,7 +712,7 @@ class TORCH_API TensorBase { /// // f requires grad, has no operation creating it /// @endcode - /// \fn void backward(const Tensor & gradient={}, c10::optional retain_graph=c10::nullopt, bool create_graph=false, c10::optional inputs=c10::nullopt) const; + /// \fn void backward(const Tensor & gradient={}, std::optional retain_graph=c10::nullopt, bool create_graph=false, c10::optional inputs=c10::nullopt) const; /// /// Computes the gradient of current tensor with respect to graph leaves. /// @@ -1010,7 +1010,7 @@ struct ExclusivelyOwnedTraits : public c10::ExclusivelyOwnedTens namespace at { inline c10::MaybeOwned borrow_from_optional_tensor( - const c10::optional& opt) { + const std::optional& opt) { return opt.has_value() ? 
c10::MaybeOwned::borrowed(*opt) : c10::MaybeOwned::owned(std::in_place); diff --git a/aten/src/ATen/core/TorchDispatchUtils.cpp b/aten/src/ATen/core/TorchDispatchUtils.cpp index 8f666e5a476ab..32085a9f70627 100644 --- a/aten/src/ATen/core/TorchDispatchUtils.cpp +++ b/aten/src/ATen/core/TorchDispatchUtils.cpp @@ -17,7 +17,7 @@ bool tensorlist_has_dispatch(at::ITensorListRef li) { return false; } -bool tensorlist_has_dispatch(const c10::List>& li) { +bool tensorlist_has_dispatch(const c10::List>& li) { for (auto i : c10::irange(li.size())) { auto t = li.get(i); if (t && tensor_has_dispatch(*t)) { diff --git a/aten/src/ATen/core/TorchDispatchUtils.h b/aten/src/ATen/core/TorchDispatchUtils.h index 0ead779360097..4f5d9e22e4692 100644 --- a/aten/src/ATen/core/TorchDispatchUtils.h +++ b/aten/src/ATen/core/TorchDispatchUtils.h @@ -10,7 +10,7 @@ namespace at::impl { TORCH_API bool tensor_has_dispatch(const at::Tensor& t); TORCH_API bool tensorlist_has_dispatch(at::ITensorListRef li); -TORCH_API bool tensorlist_has_dispatch(const c10::List>& li); +TORCH_API bool tensorlist_has_dispatch(const c10::List>& li); using c10::impl::dispatch_mode_enabled; } diff --git a/aten/src/ATen/core/VariableHooksInterface.h b/aten/src/ATen/core/VariableHooksInterface.h index 47d74f5433ac2..f9c0aa4a5fc14 100644 --- a/aten/src/ATen/core/VariableHooksInterface.h +++ b/aten/src/ATen/core/VariableHooksInterface.h @@ -60,8 +60,8 @@ struct TORCH_API VariableHooksInterface { virtual void _backward( const Tensor&, TensorList, - const c10::optional&, - c10::optional, + const std::optional&, + std::optional, bool) const = 0; virtual void requires_grad_(const TensorBase&, bool) const = 0; virtual void basic_autograd_not_implemented_fallback( diff --git a/aten/src/ATen/core/boxing/KernelFunction.h b/aten/src/ATen/core/boxing/KernelFunction.h index c950f4c80ffc7..7b55c2323a2ff 100644 --- a/aten/src/ATen/core/boxing/KernelFunction.h +++ b/aten/src/ATen/core/boxing/KernelFunction.h @@ -22,7 +22,7 @@ using has_symint = std::is_same, std::is_same, std::is_same, - std::is_same, T> + std::is_same, T> >; template @@ -46,8 +46,8 @@ struct remove_symint { }; template <> -struct remove_symint> { - using type = c10::optional; +struct remove_symint> { + using type = std::optional; }; diff --git a/aten/src/ATen/core/boxing/KernelFunction_impl.h b/aten/src/ATen/core/boxing/KernelFunction_impl.h index 0d6149c8090a9..0ad79b00be56b 100644 --- a/aten/src/ATen/core/boxing/KernelFunction_impl.h +++ b/aten/src/ATen/core/boxing/KernelFunction_impl.h @@ -71,7 +71,7 @@ inline typename remove_symint::type unpackSymInt(c10::SymIn } template <> -inline typename remove_symint>::type unpackSymInt(c10::optional x) { +inline typename remove_symint>::type unpackSymInt(c10::optional x) { return x.has_value() ? 
c10::make_optional(x->guard_int(__FILE__, __LINE__)) : c10::nullopt; } diff --git a/aten/src/ATen/core/boxing/KernelFunction_test.cpp b/aten/src/ATen/core/boxing/KernelFunction_test.cpp index 6453e5e00b5c4..a0f990e87aafe 100644 --- a/aten/src/ATen/core/boxing/KernelFunction_test.cpp +++ b/aten/src/ATen/core/boxing/KernelFunction_test.cpp @@ -6,7 +6,7 @@ using std::vector; using std::tuple; -using c10::optional; +using std::optional; using c10::IValue; using c10::OperatorKernel; using c10::OperatorHandle; diff --git a/aten/src/ATen/core/boxing/impl/kernel_function_legacy_test.cpp b/aten/src/ATen/core/boxing/impl/kernel_function_legacy_test.cpp index 7eb0137b283fc..fa562c1d7ca4f 100644 --- a/aten/src/ATen/core/boxing/impl/kernel_function_legacy_test.cpp +++ b/aten/src/ATen/core/boxing/impl/kernel_function_legacy_test.cpp @@ -207,15 +207,15 @@ TEST(OperatorRegistrationTestLegacyFunctionBasedKernel, givenKernelWithIntListOu EXPECT_EQ(6, result[0].toIntVector()[2]); } -std::tuple, c10::optional, Dict> kernelWithMultipleOutputs(Tensor) { +std::tuple, std::optional, Dict> kernelWithMultipleOutputs(Tensor) { Dict dict; dict.insert("first", dummyTensor(DispatchKey::CPU)); dict.insert("second", dummyTensor(DispatchKey::CUDA)); - return std::tuple, c10::optional, Dict>( + return std::tuple, std::optional, Dict>( dummyTensor(DispatchKey::CUDA), 5, {dummyTensor(DispatchKey::CPU), dummyTensor(DispatchKey::CUDA)}, - c10::optional(std::in_place, 0), + std::optional(std::in_place, 0), dict ); } @@ -808,11 +808,11 @@ TEST(OperatorRegistrationTestLegacyFunctionBasedKernel, givenFallbackKernelWitho EXPECT_EQ(4, outputs[0].toInt()); } -c10::optional called_arg2 = c10::nullopt; -c10::optional called_arg3 = c10::nullopt; -c10::optional called_arg4 = c10::nullopt; +std::optional called_arg2 = c10::nullopt; +std::optional called_arg3 = c10::nullopt; +std::optional called_arg4 = c10::nullopt; -void kernelWithOptInputWithoutOutput(Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { +void kernelWithOptInputWithoutOutput(Tensor arg1, const std::optional& arg2, c10::optional arg3, c10::optional arg4) { called = true; called_arg2 = arg2; called_arg3 = arg3; @@ -846,7 +846,7 @@ TEST(OperatorRegistrationTestLegacyFunctionBasedKernel, givenKernelWithOptionalI EXPECT_FALSE(called_arg4.has_value()); } -c10::optional kernelWithOptInputWithOutput(Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { +std::optional kernelWithOptInputWithOutput(Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { called = true; called_arg2 = arg2; called_arg3 = arg3; @@ -883,8 +883,8 @@ TEST(OperatorRegistrationTestLegacyFunctionBasedKernel, givenKernelWithOptionalI EXPECT_FALSE(called_arg4.has_value()); } -std::tuple, c10::optional, c10::optional> -kernelWithOptInputWithMultipleOutputs(Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { +std::tuple, c10::optional, c10::optional> +kernelWithOptInputWithMultipleOutputs(Tensor arg1, const std::optional& arg2, c10::optional arg3, c10::optional arg4) { return std::make_tuple(arg2, arg3, arg4); } @@ -936,7 +936,7 @@ TEST(OperatorRegistrationTestLegacyFunctionBasedKernel, givenKernel_whenRegister auto op = c10::Dispatcher::singleton().findSchema({"_test::no_schema_specified", ""}); ASSERT_TRUE(op.has_value()); - c10::optional differences = c10::findSchemaDifferences(torch::jit::parseSchema("_test::no_schema_specified(Tensor arg1, int arg2, Tensor[] arg3) -> (int, Tensor)"), 
op->schema()); + std::optional differences = c10::findSchemaDifferences(torch::jit::parseSchema("_test::no_schema_specified(Tensor arg1, int arg2, Tensor[] arg3) -> (int, Tensor)"), op->schema()); EXPECT_FALSE(differences.has_value()); } diff --git a/aten/src/ATen/core/boxing/impl/kernel_function_test.cpp b/aten/src/ATen/core/boxing/impl/kernel_function_test.cpp index 15f7caae529b4..ed448d054c713 100644 --- a/aten/src/ATen/core/boxing/impl/kernel_function_test.cpp +++ b/aten/src/ATen/core/boxing/impl/kernel_function_test.cpp @@ -223,15 +223,15 @@ TEST(OperatorRegistrationTestFunctionBasedKernel, givenKernelWithIntListOutput_w EXPECT_EQ(6, result[0].toIntVector()[2]); } -std::tuple, c10::optional, Dict> kernelWithMultipleOutputs(Tensor) { +std::tuple, std::optional, Dict> kernelWithMultipleOutputs(Tensor) { Dict dict; dict.insert("first", dummyTensor(DispatchKey::CPU)); dict.insert("second", dummyTensor(DispatchKey::CUDA)); - return std::tuple, c10::optional, Dict>( + return std::tuple, std::optional, Dict>( dummyTensor(DispatchKey::CUDA), 5, c10::List({dummyTensor(DispatchKey::CPU), dummyTensor(DispatchKey::CUDA)}), - c10::optional(std::in_place, 0), + std::optional(std::in_place, 0), dict ); } @@ -550,11 +550,11 @@ TEST(OperatorRegistrationTestFunctionBasedKernel, givenFallbackKernelWithoutTens EXPECT_EQ(4, outputs[0].toInt()); } -c10::optional called_arg2 = c10::nullopt; -c10::optional called_arg3 = c10::nullopt; -c10::optional called_arg4 = c10::nullopt; +std::optional called_arg2 = c10::nullopt; +std::optional called_arg3 = c10::nullopt; +std::optional called_arg4 = c10::nullopt; -void kernelWithOptInputWithoutOutput(Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { +void kernelWithOptInputWithoutOutput(Tensor arg1, const std::optional& arg2, c10::optional arg3, c10::optional arg4) { called = true; called_arg2 = arg2; called_arg3 = arg3; @@ -588,7 +588,7 @@ TEST(OperatorRegistrationTestFunctionBasedKernel, givenKernelWithOptionalInputs_ EXPECT_FALSE(called_arg4.has_value()); } -c10::optional kernelWithOptInputWithOutput(Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { +std::optional kernelWithOptInputWithOutput(Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { called = true; called_arg2 = arg2; called_arg3 = arg3; @@ -625,8 +625,8 @@ TEST(OperatorRegistrationTestFunctionBasedKernel, givenKernelWithOptionalInputs_ EXPECT_FALSE(called_arg4.has_value()); } -std::tuple, c10::optional, c10::optional> -kernelWithOptInputWithMultipleOutputs(Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { +std::tuple, c10::optional, c10::optional> +kernelWithOptInputWithMultipleOutputs(Tensor arg1, const std::optional& arg2, c10::optional arg3, c10::optional arg4) { return std::make_tuple(arg2, arg3, arg4); } @@ -690,7 +690,7 @@ TEST(OperatorRegistrationTestFunctionBasedKernel, givenKernel_whenRegisteredWith auto op = c10::Dispatcher::singleton().findSchema({"_test::no_schema_specified", ""}); ASSERT_TRUE(op.has_value()); - c10::optional differences = c10::findSchemaDifferences(torch::jit::parseSchema("_test::no_schema_specified(Tensor arg1, int arg2, Tensor[] arg3) -> (int, Tensor)"), op->schema()); + std::optional differences = c10::findSchemaDifferences(torch::jit::parseSchema("_test::no_schema_specified(Tensor arg1, int arg2, Tensor[] arg3) -> (int, Tensor)"), op->schema()); EXPECT_FALSE(differences.has_value()); } diff --git 
a/aten/src/ATen/core/boxing/impl/kernel_lambda_legacy_test.cpp b/aten/src/ATen/core/boxing/impl/kernel_lambda_legacy_test.cpp index a1a1b37e2d83e..22203b7326f38 100644 --- a/aten/src/ATen/core/boxing/impl/kernel_lambda_legacy_test.cpp +++ b/aten/src/ATen/core/boxing/impl/kernel_lambda_legacy_test.cpp @@ -188,15 +188,15 @@ TEST(OperatorRegistrationTestLegacyLambdaBasedKernel, givenKernelWithIntListOutp TEST(OperatorRegistrationTestLegacyLambdaBasedKernel, givenKernelWithMultipleOutputs_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::multiple_outputs(Tensor dummy) -> (Tensor, int, Tensor[], int?, Dict(str, Tensor))", [] (Tensor) -> std::tuple, c10::optional, Dict> { + .op("_test::multiple_outputs(Tensor dummy) -> (Tensor, int, Tensor[], int?, Dict(str, Tensor))", [] (Tensor) -> std::tuple, std::optional, Dict> { Dict dict; dict.insert("first", dummyTensor(DispatchKey::CPU)); dict.insert("second", dummyTensor(DispatchKey::CUDA)); - return std::tuple, c10::optional, Dict>( + return std::tuple, std::optional, Dict>( dummyTensor(DispatchKey::CUDA), 5, {dummyTensor(DispatchKey::CPU), dummyTensor(DispatchKey::CUDA)}, - c10::optional(std::in_place, 0), + std::optional(std::in_place, 0), dict ); }); @@ -733,13 +733,13 @@ TEST(OperatorRegistrationTestLegacyLambdaBasedKernel, givenFallbackKernelWithout TEST(OperatorRegistrationTestLegacyLambdaBasedKernel, givenKernelWithOptionalInputs_withoutOutput_whenRegistered_thenCanBeCalled) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) bool called; - c10::optional called_arg2 = c10::nullopt; - c10::optional called_arg3 = c10::nullopt; - c10::optional called_arg4 = c10::nullopt; + std::optional called_arg2 = c10::nullopt; + std::optional called_arg3 = c10::nullopt; + std::optional called_arg4 = c10::nullopt; auto registrar = RegisterOperators().op( "_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> ()", - [&] (Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { + [&] (Tensor arg1, const std::optional& arg2, c10::optional arg3, c10::optional arg4) { called = true; called_arg2 = arg2; called_arg3 = arg3; @@ -773,13 +773,13 @@ TEST(OperatorRegistrationTestLegacyLambdaBasedKernel, givenKernelWithOptionalInp TEST(OperatorRegistrationTestLegacyLambdaBasedKernel, givenKernelWithOptionalInputs_withOutput_whenRegistered_thenCanBeCalled) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) bool called; - c10::optional called_arg2 = c10::nullopt; - c10::optional called_arg3 = c10::nullopt; - c10::optional called_arg4 = c10::nullopt; + std::optional called_arg2 = c10::nullopt; + std::optional called_arg3 = c10::nullopt; + std::optional called_arg4 = c10::nullopt; auto registrar = RegisterOperators().op( "_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? 
arg4) -> Tensor?", - [&] (Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { + [&] (Tensor arg1, const std::optional& arg2, c10::optional arg3, c10::optional arg4) { called = true; called_arg2 = arg2; called_arg3 = arg3; @@ -816,13 +816,13 @@ TEST(OperatorRegistrationTestLegacyLambdaBasedKernel, givenKernelWithOptionalInp TEST(OperatorRegistrationTestLegacyLambdaBasedKernel, givenKernelWithOptionalInputs_withMultipleOutputs_whenRegistered_thenCanBeCalled) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) bool called; - c10::optional called_arg2 = c10::nullopt; - c10::optional called_arg3 = c10::nullopt; - c10::optional called_arg4 = c10::nullopt; + std::optional called_arg2 = c10::nullopt; + std::optional called_arg3 = c10::nullopt; + std::optional called_arg4 = c10::nullopt; auto registrar = RegisterOperators().op( "_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> (Tensor?, int?, str?)", - [] (Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { + [] (Tensor arg1, const std::optional& arg2, c10::optional arg3, c10::optional arg4) { return std::make_tuple(arg2, arg3, arg4); }); auto op = c10::Dispatcher::singleton().findSchema({"_test::opt_input", ""}); @@ -866,7 +866,7 @@ TEST(OperatorRegistrationTestLegacyLambdaBasedKernel, givenKernel_whenRegistered auto op = c10::Dispatcher::singleton().findSchema({"_test::no_schema_specified", ""}); ASSERT_TRUE(op.has_value()); - c10::optional differences = c10::findSchemaDifferences(torch::jit::parseSchema("_test::no_schema_specified(Tensor arg1, int arg2, Tensor[] arg3) -> (int, Tensor)"), op->schema()); + std::optional differences = c10::findSchemaDifferences(torch::jit::parseSchema("_test::no_schema_specified(Tensor arg1, int arg2, Tensor[] arg3) -> (int, Tensor)"), op->schema()); EXPECT_FALSE(differences.has_value()); } diff --git a/aten/src/ATen/core/boxing/impl/kernel_lambda_test.cpp b/aten/src/ATen/core/boxing/impl/kernel_lambda_test.cpp index dc463cb3fe180..ea06bbccc7bd6 100644 --- a/aten/src/ATen/core/boxing/impl/kernel_lambda_test.cpp +++ b/aten/src/ATen/core/boxing/impl/kernel_lambda_test.cpp @@ -187,15 +187,15 @@ TEST(OperatorRegistrationTestLambdaBasedKernel, givenKernelWithIntListOutput_whe TEST(OperatorRegistrationTestLambdaBasedKernel, givenKernelWithMultipleOutputs_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() .op("_test::multiple_outputs(Tensor dummy) -> (Tensor, int, Tensor[], int?, Dict(str, Tensor))", - RegisterOperators::options().kernel(DispatchKey::CPU, [] (Tensor) -> std::tuple, c10::optional, Dict> { + RegisterOperators::options().kernel(DispatchKey::CPU, [] (Tensor) -> std::tuple, std::optional, Dict> { Dict dict; dict.insert("first", dummyTensor(DispatchKey::CPU)); dict.insert("second", dummyTensor(DispatchKey::CUDA)); - return std::tuple, c10::optional, Dict>( + return std::tuple, std::optional, Dict>( dummyTensor(DispatchKey::CUDA), 5, c10::List({dummyTensor(DispatchKey::CPU), dummyTensor(DispatchKey::CUDA)}), - c10::optional(std::in_place, 0), + std::optional(std::in_place, 0), dict ); })); @@ -466,14 +466,14 @@ TEST(OperatorRegistrationTestLambdaBasedKernel, givenFallbackKernelWithoutTensor EXPECT_EQ(4, outputs[0].toInt()); } -c10::optional called_arg2 = c10::nullopt; -c10::optional called_arg3 = c10::nullopt; -c10::optional called_arg4 = c10::nullopt; +std::optional called_arg2 = c10::nullopt; +std::optional called_arg3 = c10::nullopt; +std::optional called_arg4 = c10::nullopt; 
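The lambda-kernel tests that follow all register the same shape of operator: schema arguments spelled Tensor?, int?, and str? arrive in C++ as std::optional parameters. A condensed, self-contained version of that registration (the _illustrative:: operator name is made up; the real tests use _test::opt_input together with their own fixtures and assertions):

#include <optional>
#include <string>
#include <ATen/core/Tensor.h>
#include <ATen/core/op_registration/op_registration.h>

static auto registry = c10::RegisterOperators().op(
    "_illustrative::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> ()",
    c10::RegisterOperators::options().kernel(
        c10::DispatchKey::CPU,
        [](at::Tensor /*arg1*/,
           const std::optional<at::Tensor>& /*arg2*/,
           std::optional<int64_t> /*arg3*/,
           std::optional<std::string> /*arg4*/) {}));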
TEST(OperatorRegistrationTestLambdaBasedKernel, givenKernelWithOptionalInputs_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators().op( "_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> ()", - RegisterOperators::options().kernel(DispatchKey::CPU, [] (Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { + RegisterOperators::options().kernel(DispatchKey::CPU, [] (Tensor arg1, const std::optional& arg2, c10::optional arg3, c10::optional arg4) { called = true; called_arg2 = arg2; called_arg3 = arg3; @@ -507,7 +507,7 @@ TEST(OperatorRegistrationTestLambdaBasedKernel, givenKernelWithOptionalInputs_wi TEST(OperatorRegistrationTestLambdaBasedKernel, givenKernelWithOptionalInputs_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators().op( "_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> Tensor?", - RegisterOperators::options().kernel(DispatchKey::CPU, [] (Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { + RegisterOperators::options().kernel(DispatchKey::CPU, [] (Tensor arg1, const std::optional& arg2, c10::optional arg3, c10::optional arg4) { called = true; called_arg2 = arg2; called_arg3 = arg3; @@ -544,7 +544,7 @@ TEST(OperatorRegistrationTestLambdaBasedKernel, givenKernelWithOptionalInputs_wi TEST(OperatorRegistrationTestLambdaBasedKernel, givenKernelWithOptionalInputs_withMultipleOutputs_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators().op( "_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> (Tensor?, int?, str?)", - RegisterOperators::options().kernel(DispatchKey::CPU, [] (Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { + RegisterOperators::options().kernel(DispatchKey::CPU, [] (Tensor arg1, const std::optional& arg2, c10::optional arg3, c10::optional arg4) { return std::make_tuple(arg2, arg3, arg4); })); auto op = c10::Dispatcher::singleton().findSchema({"_test::opt_input", ""}); @@ -588,7 +588,7 @@ TEST(OperatorRegistrationTestLambdaBasedKernel, givenKernel_whenRegisteredWithou auto op = c10::Dispatcher::singleton().findSchema({"_test::no_schema_specified", ""}); ASSERT_TRUE(op.has_value()); - c10::optional differences = c10::findSchemaDifferences(torch::jit::parseSchema("_test::no_schema_specified(Tensor arg1, int arg2, Tensor[] arg3) -> (int, Tensor)"), op->schema()); + std::optional differences = c10::findSchemaDifferences(torch::jit::parseSchema("_test::no_schema_specified(Tensor arg1, int arg2, Tensor[] arg3) -> (int, Tensor)"), op->schema()); EXPECT_FALSE(differences.has_value()); } diff --git a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h index ccd94ff1de2be..4642be5d689a5 100644 --- a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h +++ b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h @@ -116,7 +116,7 @@ namespace impl { }; template - struct assert_is_valid_input_type, AllowDeprecatedTypes> + struct assert_is_valid_input_type, AllowDeprecatedTypes> : assert_is_valid_input_type {}; template @@ -226,7 +226,7 @@ namespace impl { }; template - struct assert_is_valid_output_type, AllowDeprecatedTypes> + struct assert_is_valid_output_type, AllowDeprecatedTypes> : assert_is_valid_output_type {}; template diff --git a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor_test.cpp 
b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor_test.cpp index 337f0d4c0cad3..1609e014f43f0 100644 --- a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor_test.cpp +++ b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor_test.cpp @@ -205,15 +205,15 @@ TEST(OperatorRegistrationTestFunctorBasedKernel, givenKernelWithIntListOutput_wh } struct KernelWithMultipleOutputs final : OperatorKernel { - std::tuple, c10::optional, Dict> operator()(Tensor) { + std::tuple, std::optional, Dict> operator()(Tensor) { Dict dict; dict.insert("first", dummyTensor(DispatchKey::CPU)); dict.insert("second", dummyTensor(DispatchKey::CUDA)); - return std::tuple, c10::optional, Dict>( + return std::tuple, std::optional, Dict>( dummyTensor(DispatchKey::CUDA), 5, c10::List({dummyTensor(DispatchKey::CPU), dummyTensor(DispatchKey::CUDA)}), - c10::optional(std::in_place, 0), + std::optional(std::in_place, 0), dict ); } @@ -679,12 +679,12 @@ TEST(OperatorRegistrationTestFunctorBasedKernel, givenFallbackKernelWithoutTenso EXPECT_EQ(4, outputs[0].toInt()); } -c10::optional called_arg2 = c10::nullopt; -c10::optional called_arg3 = c10::nullopt; -c10::optional called_arg4 = c10::nullopt; +std::optional called_arg2 = c10::nullopt; +std::optional called_arg3 = c10::nullopt; +std::optional called_arg4 = c10::nullopt; struct KernelWithOptInputWithoutOutput final : OperatorKernel { - void operator()(Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { + void operator()(Tensor arg1, const std::optional& arg2, c10::optional arg3, c10::optional arg4) { called = true; called_arg2 = arg2; called_arg3 = arg3; @@ -720,7 +720,7 @@ TEST(OperatorRegistrationTestFunctorBasedKernel, givenKernelWithOptionalInputs_w } struct KernelWithOptInputWithOutput final : OperatorKernel { - c10::optional operator()(Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { + std::optional operator()(Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { called = true; called_arg2 = arg2; called_arg3 = arg3; @@ -759,8 +759,8 @@ TEST(OperatorRegistrationTestFunctorBasedKernel, givenKernelWithOptionalInputs_w } struct KernelWithOptInputWithMultipleOutputs final : OperatorKernel { - std::tuple, c10::optional, c10::optional> - operator()(Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { + std::tuple, c10::optional, c10::optional> + operator()(Tensor arg1, const std::optional& arg2, c10::optional arg3, c10::optional arg4) { return std::make_tuple(arg2, arg3, arg4); } }; @@ -821,7 +821,7 @@ TEST(OperatorRegistrationTestFunctorBasedKernel, givenKernel_whenRegisteredWitho auto op = c10::Dispatcher::singleton().findSchema({"_test::no_schema_specified", ""}); ASSERT_TRUE(op.has_value()); - c10::optional differences = c10::findSchemaDifferences(torch::jit::parseSchema("_test::no_schema_specified(Tensor arg1, int arg2, Tensor[] arg3) -> (int, Tensor)"), op->schema()); + std::optional differences = c10::findSchemaDifferences(torch::jit::parseSchema("_test::no_schema_specified(Tensor arg1, int arg2, Tensor[] arg3) -> (int, Tensor)"), op->schema()); EXPECT_FALSE(differences.has_value()); } @@ -832,7 +832,7 @@ TEST(OperatorRegistrationTestFunctorBasedKernel, givenKernel_whenRegisteredCatch auto op = c10::Dispatcher::singleton().findSchema({"_test::no_schema_specified", ""}); ASSERT_TRUE(op.has_value()); - c10::optional differences = c10::findSchemaDifferences(torch::jit::parseSchema("_test::no_schema_specified(Tensor arg1, 
int arg2, Tensor[] arg3) -> (int, Tensor)"), op->schema()); + std::optional differences = c10::findSchemaDifferences(torch::jit::parseSchema("_test::no_schema_specified(Tensor arg1, int arg2, Tensor[] arg3) -> (int, Tensor)"), op->schema()); EXPECT_FALSE(differences.has_value()); } diff --git a/aten/src/ATen/core/builtin_function.h b/aten/src/ATen/core/builtin_function.h index b25ca55c16851..9aef3a0f62cf5 100644 --- a/aten/src/ATen/core/builtin_function.h +++ b/aten/src/ATen/core/builtin_function.h @@ -63,7 +63,7 @@ struct BuiltinOpFunction : public Function { bool call( Stack& stack, - c10::optional, + std::optional, c10::function_ref) override { run(stack); return false; diff --git a/aten/src/ATen/core/class_type.cpp b/aten/src/ATen/core/class_type.cpp index b4ef2979738f9..0a9a8074067ee 100644 --- a/aten/src/ATen/core/class_type.cpp +++ b/aten/src/ATen/core/class_type.cpp @@ -469,7 +469,7 @@ bool ClassType::isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const { } ClassTypePtr ClassType::create( - c10::optional qualifiedName, + std::optional qualifiedName, std::weak_ptr cu, bool is_module, std::string doc_string, @@ -483,7 +483,7 @@ ClassTypePtr ClassType::create( } ClassType::ClassType( - c10::optional name, + std::optional name, std::weak_ptr cu, bool is_module, std::string doc_string, @@ -620,7 +620,7 @@ IValue ClassType::getConstant(size_t slot) const { return constantValues_[slot]; } -c10::optional ClassType::findConstant(const std::string& name) const { +std::optional ClassType::findConstant(const std::string& name) const { TORCH_INTERNAL_ASSERT(constantNames_.size() == constantValues_.size()); size_t pos = 0; for (const auto& c : constantNames_) { @@ -652,7 +652,7 @@ std::shared_ptr ClassType::compilation_unit() const { return cu; } -c10::optional ClassType::getProperty(const std::string& name) { +std::optional ClassType::getProperty(const std::string& name) { for (auto& prop : properties_) { if (name == prop.name) { return prop; @@ -667,7 +667,7 @@ void ClassType::addProperty(const std::string& name, torch::jit::Function* gette properties_.push_back({name, getter, setter}); } -c10::optional ClassType::findConstantSlot(const std::string& name) const { +std::optional ClassType::findConstantSlot(const std::string& name) const { TORCH_CHECK(constantNames_.size() == constantValues_.size()); size_t slot = 0; for (const auto& constant : constantNames_) { diff --git a/aten/src/ATen/core/class_type.h b/aten/src/ATen/core/class_type.h index 99fd27bba5426..b137f0ed208a1 100644 --- a/aten/src/ATen/core/class_type.h +++ b/aten/src/ATen/core/class_type.h @@ -74,7 +74,7 @@ struct TORCH_API ClassType : public NamedType { // Create a class type with name `name` and its methods stored in `cu`. static ClassTypePtr create( - c10::optional qualifiedName, + std::optional qualifiedName, std::weak_ptr cu, bool is_module = false, std::string doc_string = "", @@ -152,7 +152,7 @@ struct TORCH_API ClassType : public NamedType { // Attributes are stored in a specific slot at runtime for effiency. // When emitting instructions we specify the slot so that attribute access is // a constant lookup - c10::optional findAttributeSlot(const std::string& name) const { + std::optional findAttributeSlot(const std::string& name) const { size_t slot = 0; for (const auto& attr : attributes_) { if (name == attr.getName()) { @@ -239,7 +239,7 @@ struct TORCH_API ClassType : public NamedType { } // Get the property with the given \p name, if it exists on the class. 
- c10::optional getProperty(const std::string& name); + std::optional getProperty(const std::string& name); // Add a property named \p name with \p getter and \p setter as its getter and setter. void addProperty(const std::string& name, torch::jit::Function* getter, torch::jit::Function* setter); // Get a list of all properties. @@ -257,7 +257,7 @@ struct TORCH_API ClassType : public NamedType { size_t addConstant(const std::string& name, const IValue& value); - c10::optional findConstantSlot(const std::string& name) const; + std::optional findConstantSlot(const std::string& name) const; size_t getConstantSlot(const std::string& name) const { if (auto r = findConstantSlot(name)) { @@ -281,7 +281,7 @@ struct TORCH_API ClassType : public NamedType { IValue getConstant(size_t slot) const; - c10::optional findConstant(const std::string& name) const; + std::optional findConstant(const std::string& name) const; size_t numConstants() const; @@ -384,7 +384,7 @@ struct TORCH_API ClassType : public NamedType { private: ClassType( - c10::optional name, + std::optional name, std::weak_ptr cu, bool is_module = false, std::string doc_string = "", diff --git a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h index 33e910591de0a..46c291bada308 100644 --- a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h +++ b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h @@ -56,7 +56,7 @@ namespace detail { void operator()(const at::Tensor& x) { ts = ts | x.key_set(); } - void operator()(const c10::optional& x) { + void operator()(const std::optional& x) { if (x.has_value()) { ts = ts | x->key_set(); } @@ -67,8 +67,8 @@ namespace detail { } } // Tensor?[] translates to this case. - void operator()(const c10::List>& xs) { - for (c10::optional x : xs) { + void operator()(const c10::List>& xs) { + for (std::optional x : xs) { if (x.has_value()) { ts = ts | x.value().key_set(); } @@ -80,7 +80,7 @@ namespace detail { ts = ts | x.key_set(); } } - [[noreturn]] void operator()(at::ArrayRef>) { + [[noreturn]] void operator()(at::ArrayRef>) { // Just checking that the handling of Tensor?[] didn't change. 
TORCH_INTERNAL_ASSERT(false); } @@ -89,7 +89,7 @@ namespace detail { ts = ts | gen.key_set(); } } - void operator()(const c10::optional& gen) { + void operator()(const std::optional& gen) { if (gen.has_value() && gen->defined()) { ts = ts | gen->key_set(); } diff --git a/aten/src/ATen/core/dispatch/Dispatcher.cpp b/aten/src/ATen/core/dispatch/Dispatcher.cpp index 6077ac8e34cc8..85897f7653ee6 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.cpp +++ b/aten/src/ATen/core/dispatch/Dispatcher.cpp @@ -76,8 +76,8 @@ C10_EXPORT Dispatcher& Dispatcher::realSingleton() { return _singleton; } -c10::optional Dispatcher::findOp(const OperatorName& overload_name) { - return operatorLookupTable_.read([&] (const ska::flat_hash_map& operatorLookupTable) -> c10::optional { +std::optional Dispatcher::findOp(const OperatorName& overload_name) { + return operatorLookupTable_.read([&] (const ska::flat_hash_map& operatorLookupTable) -> std::optional { auto found = operatorLookupTable.find(overload_name); if (found == operatorLookupTable.end()) { return c10::nullopt; @@ -103,7 +103,7 @@ void Dispatcher::waitForDef(const FunctionSchema& schema) { "the same dependencies."); } -void Dispatcher::waitForImpl(const OperatorName& op_name, c10::optional maybe_dk) { +void Dispatcher::waitForImpl(const OperatorName& op_name, std::optional maybe_dk) { using namespace std::chrono_literals; std::unique_lock lock(guard_->mutex); auto dk = maybe_dk.value_or(DispatchKey::CompositeImplicitAutograd); @@ -121,7 +121,7 @@ void Dispatcher::waitForImpl(const OperatorName& op_name, c10::optional Dispatcher::findSchema(const OperatorName& overload_name) { +std::optional Dispatcher::findSchema(const OperatorName& overload_name) { auto it = findOp(overload_name); if (it.has_value()) { if (it->hasSchema()) { @@ -275,7 +275,7 @@ PythonModuleMapType& pythonModulesSingleton() { } -c10::optional> Dispatcher::getPyStub(OperatorName op_name) { +std::optional> Dispatcher::getPyStub(OperatorName op_name) { std::lock_guard lock(guard_->mutex); auto found = pythonModulesSingleton().find(op_name); if (found == pythonModulesSingleton().end()) { @@ -332,9 +332,9 @@ void Dispatcher::throwIfHasPythonModule(OperatorName op_name) { RegistrationHandleRAII Dispatcher::registerImpl( OperatorName op_name, - c10::optional dispatch_key, + std::optional dispatch_key, KernelFunction kernel, - c10::optional cpp_signature, + std::optional cpp_signature, std::unique_ptr inferred_function_schema, std::string debug ) { @@ -364,7 +364,7 @@ RegistrationHandleRAII Dispatcher::registerImpl( }); } -void Dispatcher::deregisterImpl_(const OperatorHandle& op, const OperatorName& op_name, c10::optional dispatch_key, impl::OperatorEntry::AnnotatedKernelContainerIterator handle) { +void Dispatcher::deregisterImpl_(const OperatorHandle& op, const OperatorName& op_name, std::optional dispatch_key, impl::OperatorEntry::AnnotatedKernelContainerIterator handle) { op.operatorDef_->op.deregisterKernel_(*this, dispatch_key, handle); TORCH_INTERNAL_ASSERT(op.operator_name() == op_name); @@ -486,7 +486,7 @@ std::vector Dispatcher::findDanglingImpls() const { }); } -std::vector Dispatcher::getRegistrationsForDispatchKey(c10::optional k) const { +std::vector Dispatcher::getRegistrationsForDispatchKey(std::optional k) const { return operatorLookupTable_.read([&] (const ska::flat_hash_map& operatorLookupTable) -> std::vector { std::vector op_names; for (const auto& op : operatorLookupTable) { diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h 
index caf73d7cebb21..6e679992a9f2d 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.h +++ b/aten/src/ATen/core/dispatch/Dispatcher.h @@ -137,7 +137,7 @@ class TORCH_API Dispatcher final { * and returns it if it is registered WITH A SCHEMA. * Returns nullopt otherwise. */ - c10::optional findSchema(const OperatorName& operator_name); + std::optional findSchema(const OperatorName& operator_name); /** * Variant of findSchema that results in less code generated at the call site. @@ -155,7 +155,7 @@ class TORCH_API Dispatcher final { OperatorHandle findSchemaOrThrow(const char* name, const char* overload_name); // Like findSchema, but also returns OperatorHandle even if there is no schema - c10::optional findOp(const OperatorName& operator_name); + std::optional findOp(const OperatorName& operator_name); // Returns a list of all operator names present in the operatorLookupTable_ const std::vector getAllOpNames(); @@ -196,7 +196,7 @@ class TORCH_API Dispatcher final { // Used by torchdeploy/multipy for multiple interpreters racing. void waitForDef(const FunctionSchema& schema); - void waitForImpl(const OperatorName& op_name, c10::optional dispatch_key); + void waitForImpl(const OperatorName& op_name, std::optional dispatch_key); // ------------------------------------------------------------------------ // @@ -221,7 +221,7 @@ class TORCH_API Dispatcher final { */ // NB: steals the inferred function schema, as we may need to hold on to // it for a bit until the real schema turns up - RegistrationHandleRAII registerImpl(OperatorName op_name, c10::optional dispatch_key, KernelFunction kernel, c10::optional cpp_signature, std::unique_ptr inferred_function_schema, std::string debug); + RegistrationHandleRAII registerImpl(OperatorName op_name, std::optional dispatch_key, KernelFunction kernel, c10::optional cpp_signature, std::unique_ptr inferred_function_schema, std::string debug); /** * Given an operator, tells the Dispatcher that we have implemented a fake impl @@ -234,7 +234,7 @@ class TORCH_API Dispatcher final { */ void throwIfHasPythonModule(OperatorName op_name); - c10::optional> getPyStub(OperatorName op_name); + std::optional> getPyStub(OperatorName op_name); /** * Register a new operator by name. @@ -299,7 +299,7 @@ class TORCH_API Dispatcher final { * Returns the names of all operators with a kernel registered for the specified DispatchKey. * If no DispatchKey is specified, it returns all registered operators. 
*/ - std::vector getRegistrationsForDispatchKey(c10::optional k) const; + std::vector getRegistrationsForDispatchKey(std::optional k) const; private: Dispatcher(); @@ -321,7 +321,7 @@ class TORCH_API Dispatcher final { void deregisterImpl_( const OperatorHandle& op, const OperatorName& op_name, - c10::optional dispatch_key, + std::optional dispatch_key, impl::OperatorEntry::AnnotatedKernelContainerIterator kernel_handle); void deregisterName_(const OperatorHandle& op, const OperatorName& op_name); void deregisterFallback_(DispatchKey dispatchKey); diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp index 5f4538f2c9790..74e5a7e2cf955 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp +++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp @@ -7,7 +7,7 @@ namespace c10 { namespace impl { namespace { - std::string toString(c10::optional k) { + std::string toString(std::optional k) { if (k.has_value()) { return toString(*k); } else { @@ -39,7 +39,7 @@ namespace { // TODO: figure out if we can just directly save real schema at def time FunctionSchema from_def = from_def_.cloneWithRealTypes(kernel.isValidSymUnboxed()); FunctionSchema inferred = inferred_.cloneWithRealTypes(); - c10::optional schema_difference = findSchemaDifferences(from_def, inferred); + std::optional schema_difference = findSchemaDifferences(from_def, inferred); if (schema_difference.has_value()) { TORCH_CHECK(false, "Inferred operator schema for a C++ kernel function doesn't match the expected function schema.\n" @@ -101,9 +101,9 @@ void OperatorEntry::deregisterSchema() { OperatorEntry::AnnotatedKernelContainerIterator OperatorEntry::registerKernel( const c10::Dispatcher& dispatcher, - c10::optional dispatch_key, + std::optional dispatch_key, KernelFunction kernel, - c10::optional cpp_signature, + std::optional cpp_signature, std::unique_ptr inferred_function_schema, std::string debug ) { @@ -181,7 +181,7 @@ OperatorEntry::AnnotatedKernelContainerIterator OperatorEntry::registerKernel( void OperatorEntry::deregisterKernel_( const c10::Dispatcher& dispatcher, - c10::optional dispatch_key, + std::optional dispatch_key, AnnotatedKernelContainerIterator kernel ) { // Redirect catchAll deregistrations to CompositeImplicitAutograd. 
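That last comment is the key to the std::optional<DispatchKey> parameters threaded through registerImpl, registerKernel and deregisterKernel_ above: an empty optional means "no specific key", i.e. a catch-all registration, which the dispatcher folds into CompositeImplicitAutograd. A small sketch of the convention, in the spirit of the file-local toString() changed earlier (describe_key is an illustrative name):

#include <optional>
#include <string>
#include <c10/core/DispatchKey.h>

std::string describe_key(std::optional<c10::DispatchKey> k) {
  // nullopt is the "register for every key" case
  return k.has_value() ? std::string(c10::toString(*k))
                       : std::string("(catch all)");
}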
diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.h b/aten/src/ATen/core/dispatch/OperatorEntry.h index 903ff043799b2..873b385845ed3 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.h +++ b/aten/src/ATen/core/dispatch/OperatorEntry.h @@ -129,9 +129,9 @@ class TORCH_API OperatorEntry final { // Postcondition: caller is responsible for disposing of the kernel AnnotatedKernelContainerIterator registerKernel( const Dispatcher& dispatcher, - c10::optional dispatch_key, + std::optional dispatch_key, KernelFunction kernel, - c10::optional cpp_signature, + std::optional cpp_signature, std::unique_ptr inferred_function_schema, std::string debug ); @@ -139,7 +139,7 @@ class TORCH_API OperatorEntry final { // Precondition: Dispatcher::mutex_ is held void deregisterKernel_( const Dispatcher& dispatcher, - c10::optional dispatch_key, + std::optional dispatch_key, AnnotatedKernelContainerIterator kernel ); @@ -221,7 +221,7 @@ class TORCH_API OperatorEntry final { private: OperatorName name_; - c10::optional schema_; + std::optional schema_; #ifndef C10_MOBILE std::vector tags_; #endif @@ -282,10 +282,10 @@ class TORCH_API OperatorEntry final { struct CppSignatureWithDebug { CppSignature signature; std::string debug; - c10::optional dispatch_key; + std::optional dispatch_key; }; - c10::optional cpp_signature_; - c10::optional sym_cpp_signature_; + std::optional cpp_signature_; + std::optional sym_cpp_signature_; // A Python custom error handler for OperatorEntry::reportError std::unique_ptr report_error_callback_; diff --git a/aten/src/ATen/core/dynamic_type.h b/aten/src/ATen/core/dynamic_type.h index 25b75b9e51114..fe4f0b4dfe602 100644 --- a/aten/src/ATen/core/dynamic_type.h +++ b/aten/src/ATen/core/dynamic_type.h @@ -121,7 +121,7 @@ class DynamicType : public SharedType { * A implementation detail to support NamedTuple. */ struct LabeledDynamicType { - c10::optional label; + std::optional label; DynamicTypePtr ty; explicit LabeledDynamicType(DynamicTypePtr t) : ty(std::move(t)) {} @@ -163,7 +163,7 @@ class DynamicType : public SharedType { Tag tag() const { return tag_; } - const c10::optional& name() const { + const std::optional& name() const { return name_; } const Arguments& arguments() const { @@ -200,7 +200,7 @@ class DynamicType : public SharedType { } Tag tag_; - c10::optional name_; + std::optional name_; union { Arguments arguments_; ClassTypePtr class_; diff --git a/aten/src/ATen/core/function.h b/aten/src/ATen/core/function.h index f55e15e50b4fa..01e395bcf6106 100644 --- a/aten/src/ATen/core/function.h +++ b/aten/src/ATen/core/function.h @@ -97,7 +97,7 @@ struct TORCH_API Function { // executor. 
virtual bool call( Stack&, - c10::optional, + std::optional, c10::function_ref) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false); return false; diff --git a/aten/src/ATen/core/function_schema.cpp b/aten/src/ATen/core/function_schema.cpp index 6e119ae25cc72..6f6cc8ed68557 100644 --- a/aten/src/ATen/core/function_schema.cpp +++ b/aten/src/ATen/core/function_schema.cpp @@ -30,7 +30,7 @@ FunctionSchema FunctionSchema::cloneWithRealTypes(bool with_symint) const { // NB: keep this in sync with unpackSymInt in KernelFunction_impl.h if ( *a.real_type() == *getTypePtr() || - *a.real_type() == *getTypePtr>() || + *a.real_type() == *getTypePtr>() || *a.real_type() == *getTypePtr() || *a.real_type() == *getTypePtr() ) { @@ -53,7 +53,7 @@ FunctionSchema FunctionSchema::cloneWithRealTypes(bool with_symint) const { is_varret()); } -bool FunctionSchema::canAliasTypeSetsAlias(const c10::optional &lhs, const c10::optional &rhs) const { +bool FunctionSchema::canAliasTypeSetsAlias(const std::optional &lhs, const c10::optional &rhs) const { if (!lhs || !rhs) { return false; } @@ -67,7 +67,7 @@ bool FunctionSchema::canAliasTypeSetsAlias(const c10::optional &lh return false; } -c10::optional FunctionSchema::getAliasTypeSetContainedTypes(const c10::optional &aliasTypeSet) const { +std::optional FunctionSchema::getAliasTypeSetContainedTypes(const c10::optional &aliasTypeSet) const { if (!aliasTypeSet) { return c10::nullopt; } @@ -95,7 +95,7 @@ c10::optional FunctionSchema::getAliasTypeSetContainedTypes(const return AliasTypeSet(containedTypes.begin(), containedTypes.end()); } -c10::optional FunctionSchema::mapTypeToAliasTypeSet(const TypePtr& type) const { +std::optional FunctionSchema::mapTypeToAliasTypeSet(const TypePtr& type) const { switch(type->kind()) { case TypeKind::ListType: case TypeKind::DictType: @@ -155,8 +155,8 @@ bool FunctionSchema::may_alias(const SchemaArgument& lhs, const SchemaArgument& const Argument lhsArg = getCorrectList(lhs.type)[lhs.index]; const Argument rhsArg = getCorrectList(rhs.type)[rhs.index]; - c10::optional lhsTypes = mapTypeToAliasTypeSet(lhsArg.type()); - c10::optional rhsTypes = mapTypeToAliasTypeSet(rhsArg.type()); + std::optional lhsTypes = mapTypeToAliasTypeSet(lhsArg.type()); + std::optional rhsTypes = mapTypeToAliasTypeSet(rhsArg.type()); // Check to see if lhs and rhs have the same alias set if (canAliasTypeSetsAlias(lhsTypes, rhsTypes)) { @@ -182,10 +182,10 @@ bool FunctionSchema::may_contain_alias(const SchemaArgument& lhs, const SchemaAr const c10::Argument lhsArg = getCorrectList(lhs.type)[lhs.index]; const c10::Argument rhsArg = getCorrectList(rhs.type)[rhs.index]; - c10::optional lhsTypes = mapTypeToAliasTypeSet(lhsArg.type()); - c10::optional rhsTypes = mapTypeToAliasTypeSet(rhsArg.type()); - c10::optional lhsContainedTypes = getAliasTypeSetContainedTypes(lhsTypes); - c10::optional rhsContainedTypes = getAliasTypeSetContainedTypes(rhsTypes); + std::optional lhsTypes = mapTypeToAliasTypeSet(lhsArg.type()); + std::optional rhsTypes = mapTypeToAliasTypeSet(rhsArg.type()); + std::optional lhsContainedTypes = getAliasTypeSetContainedTypes(lhsTypes); + std::optional rhsContainedTypes = getAliasTypeSetContainedTypes(rhsTypes); // Checks if one side is wildcard and the other side is a container of the same type bool lhsWildcard = lhsArg.alias_info() && lhsArg.alias_info()->isWildcardAfter() && canAliasTypeSetsAlias(lhsTypes, rhsContainedTypes); diff --git a/aten/src/ATen/core/function_schema.h b/aten/src/ATen/core/function_schema.h index 79e7ffed1a14f..801bd43c84c01 100644 --- 
a/aten/src/ATen/core/function_schema.h +++ b/aten/src/ATen/core/function_schema.h @@ -29,20 +29,20 @@ struct Argument { Argument( std::string name = "", const TypePtr& type = nullptr, - c10::optional N = c10::nullopt, - c10::optional default_value = c10::nullopt, + std::optional N = c10::nullopt, + std::optional default_value = c10::nullopt, bool kwarg_only = false, - c10::optional alias_info = c10::nullopt) + std::optional alias_info = c10::nullopt) : Argument(std::move(name), type, type, N, std::move(default_value), kwarg_only, std::move(alias_info)) {} Argument( std::string name, TypePtr fake_type, TypePtr real_type, - c10::optional N = c10::nullopt, - c10::optional default_value = c10::nullopt, + std::optional N = c10::nullopt, + std::optional default_value = c10::nullopt, bool kwarg_only = false, - c10::optional alias_info = c10::nullopt) + std::optional alias_info = c10::nullopt) : name_(std::move(name)), type_(fake_type ? std::move(fake_type) : TensorType::get()), real_type_(real_type ? std::move(real_type) : type_), @@ -94,10 +94,10 @@ struct Argument { const TypePtr& real_type() const { return real_type_; } - c10::optional N() const { + std::optional N() const { return N_; } - const c10::optional& default_value() const { + const std::optional& default_value() const { return default_value_; } bool kwarg_only() const { @@ -150,7 +150,7 @@ struct Argument { N_, default_value_, kwarg_only_, - alias_info_ ? c10::optional(*alias_info_) : c10::nullopt); + alias_info_ ? std::optional(*alias_info_) : c10::nullopt); } // this function checks whether this Argument is backward compatible with @@ -179,9 +179,9 @@ struct Argument { // e.g. for int[3]: type = ListType::ofInts(), N = 3 // If present, this will allow scalars to be broadcast to this length to // become a list. - c10::optional N_; + std::optional N_; - c10::optional default_value_; + std::optional default_value_; // AliasInfo is huge, so let's only allocate memory for it if // necessary (which it isn't during schema parsing on startup, to // give a pertinent example). @@ -322,7 +322,7 @@ struct TORCH_API FunctionSchema { // alias information should we infer? // NB: due to alias analysis kind merging, this may be nullopt. Eventually // this should always be set no matter what - c10::optional alias_kind_; + std::optional alias_kind_; template void checkArg(const IValue& value, const Argument& argument, optional pos) const; @@ -395,7 +395,7 @@ struct TORCH_API FunctionSchema { return aliasInfo && aliasInfo->isWrite(); } bool is_mutable(c10::string_view name) const { - c10::optional index = argumentIndexWithName(name); + std::optional index = argumentIndexWithName(name); TORCH_INTERNAL_ASSERT( index != c10::nullopt, "Schema has no argument named ", name); @@ -416,22 +416,22 @@ struct TORCH_API FunctionSchema { // Returns whether the two AliasTypeSets contain any similarities // ie: whether the two type sets can alias. - bool canAliasTypeSetsAlias(const c10::optional &lhs, const c10::optional &rhs) const; + bool canAliasTypeSetsAlias(const std::optional &lhs, const c10::optional &rhs) const; // Recursively Finds all contained types within the AliasTypeSet. - c10::optional getAliasTypeSetContainedTypes(const c10::optional &aliasTypeSet) const; + std::optional getAliasTypeSetContainedTypes(const c10::optional &aliasTypeSet) const; // Similar to mapTypeToAliasTypeSet defined in alias_analysis.cpp. // Used to map types to a type such that all types that can alias will be mapped to the same type. 
// For example, calling this method on 'Optional[List[int]]' is the same as calling this method // on 'List[int]'. - c10::optional mapTypeToAliasTypeSet(const TypePtr& type) const; + std::optional mapTypeToAliasTypeSet(const TypePtr& type) const; // Returns either arguments() or returns() depending on the SchemaArgType // output => returns(), input => arguments() const std::vector& getCorrectList(SchemaArgType type) const; - c10::optional argumentIndexWithName(c10::string_view name) const { + std::optional argumentIndexWithName(c10::string_view name) const { for (const auto i : c10::irange(arguments().size())) { if(name == arguments()[i].name()) return i; @@ -470,8 +470,8 @@ struct TORCH_API FunctionSchema { std::string formatTypeMismatchMsg( const Argument& expected, const std::string& actual_type, - c10::optional position = c10::nullopt, - c10::optional value = c10::nullopt) const; + std::optional position = c10::nullopt, + std::optional value = c10::nullopt) const; FunctionSchema cloneWithRemappedTypes( const std::function type_map) const; @@ -514,7 +514,7 @@ struct TORCH_API FunctionSchema { alias_kind_ = v; } - c10::optional getNamespace() const { + std::optional getNamespace() const { return name_.getNamespace(); } diff --git a/aten/src/ATen/core/function_schema_inl.h b/aten/src/ATen/core/function_schema_inl.h index a6959c661af15..182d7a181cde4 100644 --- a/aten/src/ATen/core/function_schema_inl.h +++ b/aten/src/ATen/core/function_schema_inl.h @@ -162,8 +162,8 @@ inline bool Argument::isForwardCompatibleWith( inline std::string FunctionSchema::formatTypeMismatchMsg( const Argument& expected, const std::string& actual_type, - c10::optional position, - c10::optional value) const { + std::optional position, + std::optional value) const { std::string position_str; if (position) { position_str = c10::str("Position: ", *position, "\n"); diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp index 7343d66fcb97d..6c505f8b656cf 100644 --- a/aten/src/ATen/core/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -471,7 +471,7 @@ bool IValue::isOptionalTensorList() const { return false; } const auto& ty = static_cast(payload.u.as_intrusive_ptr)->elementType; - const auto& expected_ty = c10::getTypePtr>(); + const auto& expected_ty = c10::getTypePtr>(); return expected_ty == ty; } @@ -886,14 +886,14 @@ c10::intrusive_ptr ivalue::Object::create( StrongTypePtr(nullptr, std::move(classType)), numSlots); } -IValue IValue::deepcopy(c10::optional device) const { +IValue IValue::deepcopy(std::optional device) const { IValue::HashAliasedIValueMap memo; return deepcopy(memo, device); } IValue IValue::deepcopy( IValue::HashAliasedIValueMap& memo, - c10::optional device) const { + std::optional device) const { if (memo.count(*this)) { return memo.at(*this); } @@ -1027,14 +1027,14 @@ c10::intrusive_ptr ivalue::Object::copy_to_weak_compilation_ref( } c10::intrusive_ptr ivalue::Object::deepcopy( - c10::optional device) const { + std::optional device) const { IValue::HashAliasedIValueMap memo; return deepcopy(memo, device); } c10::intrusive_ptr ivalue::Object::deepcopy( IValue::HashAliasedIValueMap& memo, - c10::optional device) const { + std::optional device) const { auto cu = type_.cu_; auto object = ivalue::Object::create(WeakOrStrongTypePtr(type_.cu_, type_.type_), type()->numAttributes()); for (const auto i : c10::irange(slots_.size())) { diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index 07e85677c3c75..7715ffbe3c31d 100644 --- a/aten/src/ATen/core/ivalue.h +++ 
b/aten/src/ATen/core/ivalue.h @@ -86,20 +86,20 @@ struct StreamData3Holder : c10::intrusive_ptr_target { } // namespace ivalue -// This is an owning wrapper for a c10::optional> +// This is an owning wrapper for a std::optional> // that can be implicitly converted to a (non-owning) optional>. // Its purpose is to be used in generated code to keep the vector alive // either until the end of a statement (as a temporary), or as a saved arg // in autograd. template struct OptionalArray { - c10::optional> list; + std::optional> list; OptionalArray() = default; OptionalArray(std::vector val) : list(std::move(val)) {} // Used when saving an argument for the backwards pass. - OptionalArray& operator=(c10::optional> ref) { + OptionalArray& operator=(std::optional> ref) { if (ref) { list = std::vector(ref->begin(), ref->end()); } else { @@ -118,7 +118,7 @@ struct OptionalArray { return *this; } - operator c10::optional>() { + operator std::optional>() { if (!list) { return nullopt; } @@ -697,7 +697,7 @@ struct TORCH_API IValue final { c10::intrusive_ptr toString() &&; c10::intrusive_ptr toString() const&; const std::string& toStringRef() const; - c10::optional> toOptionalStringRef() + std::optional> toOptionalStringRef() const; c10::string_view toStringView() const; @@ -726,9 +726,9 @@ struct TORCH_API IValue final { // OptionalTensorList bool isOptionalTensorList() const; - c10::List> toOptionalTensorList() &&; - c10::List> toOptionalTensorList() const&; - std::vector> toOptionalTensorVector() const; + c10::List> toOptionalTensorList() &&; + c10::List> toOptionalTensorList() const&; + std::vector> toOptionalTensorVector() const; // GenericList IValue(c10::List v); @@ -817,7 +817,7 @@ struct TORCH_API IValue final { IValue(std::unordered_map v); template = nullptr> - IValue(c10::optional v); + IValue(std::optional v); template = nullptr> IValue(c10::OptionalArrayRef v); IValue(c10::nullopt_t); @@ -1128,10 +1128,10 @@ struct TORCH_API IValue final { // TODO: There are several places that recurse over IValue. This is fragile. // This visitor should be used to recurse over ivalues. void visit(const std::function& visitor) const; - IValue deepcopy(c10::optional device = c10::nullopt) const; + IValue deepcopy(std::optional device = c10::nullopt) const; IValue deepcopy( HashAliasedIValueMap& memo, - c10::optional device = c10::nullopt) const; + std::optional device = c10::nullopt) const; private: static c10::intrusive_ptr_target* null_to_undefined_tensor( @@ -1530,8 +1530,8 @@ struct WeakOrStrongCompilationUnit { return holdingStrongRef() && *strong_ptr_ == nullptr; } - c10::optional> strong_ptr_; - c10::optional> weak_ptr_; + std::optional> strong_ptr_; + std::optional> weak_ptr_; }; // An Object will hold a non-owning Compilation Unit reference if it is a diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h index 3e3525c274118..b1124c12cfb34 100644 --- a/aten/src/ATen/core/ivalue_inl.h +++ b/aten/src/ATen/core/ivalue_inl.h @@ -909,7 +909,7 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target { using WeakStorage = c10::weak_intrusive_ptr; void markCompleted( IValue value, - c10::optional> storages = c10::nullopt) { + std::optional> storages = c10::nullopt) { // Start by performing all steps that can throw, before setting any field. // Do this before even acquiring the mutex, because extractStorages might // acquire the GIL, which could lead to a lock inversion with our mutex. 
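The OptionalArray comment in the ivalue.h hunk above describes an owning std::optional<std::vector<T>> that can hand out a non-owning optional view. A simplified sketch of that pattern — Span and OwningOptionalArray are hypothetical stand-ins for c10::ArrayRef and the real c10::OptionalArray, trimmed down to show only the ownership/view round trip:

#include <cstddef>
#include <iostream>
#include <optional>
#include <vector>

template <class T>
struct Span {  // non-owning view, analogous to ArrayRef
  const T* data = nullptr;
  std::size_t size = 0;
};

template <class T>
struct OwningOptionalArray {
  std::optional<std::vector<T>> list;

  OwningOptionalArray() = default;
  explicit OwningOptionalArray(std::vector<T> val) : list(std::move(val)) {}

  // Copy a non-owning view into owned storage (the "saved arg" direction).
  OwningOptionalArray& operator=(std::optional<Span<T>> ref) {
    if (ref) {
      list = std::vector<T>(ref->data, ref->data + ref->size);
    } else {
      list = std::nullopt;
    }
    return *this;
  }

  // Hand out a non-owning view over the owned storage.
  operator std::optional<Span<T>>() const {
    if (!list) {
      return std::nullopt;
    }
    return Span<T>{list->data(), list->size()};
  }
};

int main() {
  OwningOptionalArray<int> arr(std::vector<int>{1, 2, 3});
  std::optional<Span<int>> view = arr;  // non-owning, valid while arr lives
  std::cout << (view ? view->size : 0) << '\n';  // prints: 3
  return 0;
}

Keeping the owning optional inside the wrapper is what lets generated code bind a temporary view for the duration of a statement, or stash the values as a saved autograd argument, as the comment above states.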
@@ -1586,11 +1586,11 @@ struct C10_EXPORT ivalue::Object final : c10::intrusive_ptr_target { c10::intrusive_ptr copy() const; c10::intrusive_ptr deepcopy( - c10::optional device = c10::nullopt) const; + std::optional device = c10::nullopt) const; c10::intrusive_ptr deepcopy( IValue::HashAliasedIValueMap& memo, - c10::optional device = c10::nullopt) const; + std::optional device = c10::nullopt) const; bool is_weak_compilation_ref() const { return !type_.holds_strong_ref(); @@ -1613,7 +1613,7 @@ struct ivalue::PyObjectHolder : c10::intrusive_ptr_target { public: virtual PyObject* getPyObject() = 0; virtual c10::InferredType tryToInferType() = 0; - virtual IValue toIValue(const TypePtr& type, c10::optional N = c10::nullopt) = 0; + virtual IValue toIValue(const TypePtr& type, std::optional N = c10::nullopt) = 0; virtual std::string toStr() = 0; virtual std::vector extractTensors() = 0; @@ -1909,7 +1909,7 @@ std::unordered_map generic_to( } template -c10::optional generic_to(IValue ivalue, _fake_type>) { +std::optional generic_to(IValue ivalue, _fake_type>) { if (ivalue.isNone()) { return c10::nullopt; } @@ -1946,11 +1946,11 @@ inline T IValue::to() && { } template <> -inline c10::optional IValue::to() && { +inline std::optional IValue::to() && { // In the default implementation, the IValue is destroyed with std::move. // But if the unboxed type is optional we cannot destroy // the IValue. - return generic_to(*this, _fake_type>{}); + return generic_to(*this, _fake_type>{}); } template @@ -2046,20 +2046,20 @@ inline std::vector IValue::toTensorVector() const { return createVectorFromList( static_cast(payload.u.as_intrusive_ptr)); } -inline c10::List> IValue::toOptionalTensorList() && { +inline c10::List> IValue::toOptionalTensorList() && { AT_ASSERT(isOptionalTensorList(), "Expected OptionalTensorList but got ", tagKind()); - return c10::List>(moveToIntrusivePtr()); + return c10::List>(moveToIntrusivePtr()); } -inline c10::List> IValue::toOptionalTensorList() const& { +inline c10::List> IValue::toOptionalTensorList() const& { AT_ASSERT(isOptionalTensorList(), "Expected OptionalTensorList but got ", tagKind()); - return c10::List>(toIntrusivePtr()); + return c10::List>(toIntrusivePtr()); } -inline std::vector> IValue::toOptionalTensorVector() const { +inline std::vector> IValue::toOptionalTensorVector() const { AT_ASSERT(isOptionalTensorList(), "Expected OptionalTensorList but got ", tagKind()); TORCH_INTERNAL_ASSERT_DEBUG_ONLY( payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(), "called toOptionalTensorVector on null intrusive_ptr IValue"); - return createVectorFromList>( + return createVectorFromList>( static_cast(payload.u.as_intrusive_ptr)); } inline c10::List IValue::toList() && { @@ -2274,7 +2274,7 @@ inline IValue::IValue(std::unordered_map v) } template > -inline IValue::IValue(c10::optional v) : IValue() { +inline IValue::IValue(std::optional v) : IValue() { if (v.has_value()) { *this = IValue(std::move(*v)); } @@ -2360,7 +2360,7 @@ inline const std::string& IValue::toStringRef() const { payload.u.as_intrusive_ptr) ->string(); } -inline c10::optional> IValue:: +inline std::optional> IValue:: toOptionalStringRef() const { if (isNone()) { return c10::nullopt; diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index 05f7242855417..be4414e8fe5b0 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -32,7 +32,7 @@ class Dict; struct IValue; struct FunctionSchema; struct NamedType; -using OptNameList = c10::optional>; +using 
OptNameList = std::optional>; void standardizeVectorForUnion(std::vector& reference, std::vector* to_fill); void standardizeVectorForUnion(std::vector* to_flatten); @@ -164,9 +164,9 @@ struct TORCH_API UnionType : public SharedType { return has_free_variables_; } - c10::optional toOptional() const; + std::optional toOptional() const; - c10::optional subtractTypeSet(std::vector& to_subtract) const; + std::optional subtractTypeSet(std::vector& to_subtract) const; protected: explicit UnionType(std::vector types, TypeKind kind=TypeKind::UnionType); @@ -247,13 +247,13 @@ struct TORCH_API OptionalType : public UnionType { }; template -inline c10::optional merge_primitive( - const c10::optional& a, - const c10::optional& b) { +inline std::optional merge_primitive( + const std::optional& a, + const std::optional& b) { if (a.has_value() && b.has_value() && a.value() == b.value()) { return a; } - return c10::optional{}; + return std::optional{}; } // If we see `a + b + c` and know that a, b, and c are the same size and have @@ -274,9 +274,9 @@ inline c10::optional merge_primitive( struct TORCH_API Stride { Stride() = default; Stride( - const c10::optional& stride_index, - c10::optional contiguous, - const c10::optional& stride) + const std::optional& stride_index, + std::optional contiguous, + const std::optional& stride) : stride_index_(stride_index), contiguous_(contiguous), stride_(stride) {} bool operator==(const Stride& b) const { @@ -288,17 +288,17 @@ struct TORCH_API Stride { return stride_index_ && contiguous_ && stride_; } - c10::optional stride_index_; - c10::optional contiguous_; - c10::optional stride_; + std::optional stride_index_; + std::optional contiguous_; + std::optional stride_; }; template <> -inline c10::optional merge_primitive( - const c10::optional& a, - const c10::optional& b) { - c10::optional left = a; - c10::optional right = b; +inline std::optional merge_primitive( + const std::optional& a, + const std::optional& b) { + std::optional left = a; + std::optional right = b; if (!left.has_value()) { left = {Stride()}; } @@ -314,7 +314,7 @@ inline c10::optional merge_primitive( // normalize if (!r.stride_index_.has_value() && !r.contiguous_.has_value() && !r.stride_.has_value()) { - return c10::optional{}; + return std::optional{}; } return r; @@ -375,7 +375,7 @@ struct TORCH_API SymbolicShape { SymbolicShape() : dims_(c10::nullopt) {} // Known rank but unknown dimentions. - SymbolicShape(c10::optional rank) : dims_(c10::nullopt) { + SymbolicShape(std::optional rank) : dims_(c10::nullopt) { if(!rank) { return; } @@ -389,10 +389,10 @@ struct TORCH_API SymbolicShape { } // Mix of known and unknown ranks - SymbolicShape(const std::vector>& dims) { + SymbolicShape(const std::vector>& dims) { std::vector shape_symbols; shape_symbols.reserve(dims.size()); - for(c10::optional dim: dims) { + for(std::optional dim: dims) { if(!dim) { shape_symbols.push_back(ShapeSymbol::newSymbol()); } else { @@ -430,18 +430,18 @@ struct TORCH_API SymbolicShape { } // Returns rank or nullopt in case of unranked shape. 
- c10::optional rank() const { + std::optional rank() const { if(!dims_) { return c10::nullopt; } return dims_->size(); } - c10::optional> sizes() const { + std::optional> sizes() const { return dims_; } - c10::optional> symbolicDims() const { + std::optional> symbolicDims() const { if (!dims_) { return c10::nullopt; } @@ -482,7 +482,7 @@ struct TORCH_API SymbolicShape { } private: - c10::optional> dims_; + std::optional> dims_; }; namespace detail { @@ -498,14 +498,14 @@ inline bool isComplete(const T& /*t*/) { template struct VaryingShape { - using ListOfOptionalElements = std::vector>; + using ListOfOptionalElements = std::vector>; VaryingShape(const std::vector& vec) : VaryingShape(ListOfOptionalElements(vec.begin(), vec.end())) {} VaryingShape(c10::ArrayRef vec) : VaryingShape(ListOfOptionalElements(vec.begin(), vec.end())) {} - VaryingShape(c10::optional size = c10::nullopt) : dims_(c10::nullopt) { + VaryingShape(std::optional size = c10::nullopt) : dims_(c10::nullopt) { if (size) { dims_ = ListOfOptionalElements(*size); } @@ -513,20 +513,20 @@ struct VaryingShape { VaryingShape(ListOfOptionalElements dims) : dims_(std::move(dims)) {} - VaryingShape(size_t size) : VaryingShape(c10::optional(size)) {} + VaryingShape(size_t size) : VaryingShape(std::optional(size)) {} bool operator==(const VaryingShape& other) const { return dims_ == other.dims_; } - const c10::optional &operator[](size_t i) const { + const std::optional &operator[](size_t i) const { if (!dims_) { throw std::runtime_error("Rank isn't fixed"); } return (*dims_).at(i); } - c10::optional size() const { + std::optional size() const { if (!dims_) { return c10::nullopt; } @@ -534,13 +534,13 @@ struct VaryingShape { return dims.size(); } - const c10::optional& sizes() const { + const std::optional& sizes() const { return dims_; } TORCH_API VaryingShape merge(const VaryingShape& other) const; - c10::optional> concrete_sizes() const { + std::optional> concrete_sizes() const { if (!dims_) { return c10::nullopt; } @@ -568,7 +568,7 @@ struct VaryingShape { } private: - c10::optional dims_; + std::optional dims_; }; struct TensorType; @@ -581,27 +581,27 @@ struct TORCH_API TensorType : public SharedType { // used by TensorType::create(size_t dim) which in turn used by // shape_analysis.cpp static TensorTypePtr create( - c10::optional scalar_type, - c10::optional device, + std::optional scalar_type, + std::optional device, const VaryingShape& sizes, const VaryingShape& strides, - c10::optional requires_grad, - c10::optional undefined = false, + std::optional requires_grad, + std::optional undefined = false, bool tensor_contiguity = false); static TensorTypePtr create( - c10::optional scalar_type, - c10::optional device, + std::optional scalar_type, + std::optional device, const SymbolicShape& sizes, const VaryingShape& stride_, - c10::optional requires_grad, - c10::optional undefined = false); + std::optional requires_grad, + std::optional undefined = false); static TensorTypePtr create( - c10::optional scalar_type, - c10::optional device, - c10::optional dim, - c10::optional requires_grad); + std::optional scalar_type, + std::optional device, + std::optional dim, + std::optional requires_grad); // overloaded create variadic template argument as it could not distinguish // initializer list @@ -613,7 +613,7 @@ struct TORCH_API TensorType : public SharedType { static TypePtr fromNumberType(const Type& typ); static TypePtr fromBoolType(); - c10::optional dim() const { + std::optional dim() const { return sizes().size(); } @@ -625,13 
+625,13 @@ struct TORCH_API TensorType : public SharedType { return strides_; } - c10::optional device() const { + std::optional device() const { return device_; } - c10::optional scalarType() const { + std::optional scalarType() const { return scalar_type_; } - c10::optional requiresGrad() const { + std::optional requiresGrad() const { return requires_grad_; } bool requires_grad() const override { @@ -651,32 +651,32 @@ struct TORCH_API TensorType : public SharedType { } } - c10::optional numel() const { + std::optional numel() const { size_t prod = 1; const auto& shape = sizes(); for (size_t i = 0; i < shape.size(); i++) { if (!shape[i]) { - return c10::optional{}; + return std::optional{}; } prod *= shape[i].value(); } return prod; } - TensorTypePtr withRequiresGrad(c10::optional s) { + TensorTypePtr withRequiresGrad(std::optional s) { auto copy = clone(); copy->requires_grad_ = s; return copy; } - TensorTypePtr withScalarType(c10::optional st) { + TensorTypePtr withScalarType(std::optional st) { auto copy = clone(); copy->scalar_type_ = st; return copy; } - TensorTypePtr withDim(c10::optional d) { + TensorTypePtr withDim(std::optional d) { auto copy = clone(); // withDim is only used by the legacy executor // that only cares about the rank, so create dummy symbols)) : @@ -712,7 +712,7 @@ struct TORCH_API TensorType : public SharedType { sizes, contiguousStridesOf(sizes)); } - TensorTypePtr withDevice(const c10::optional device) const { + TensorTypePtr withDevice(const std::optional device) const { auto copy = clone(); copy->device_ = device; return copy; @@ -784,7 +784,7 @@ struct TORCH_API TensorType : public SharedType { return r; } - c10::optional undefined() const { return undefined_; } + std::optional undefined() const { return undefined_; } static const TensorTypePtr& get(); @@ -824,12 +824,12 @@ struct TORCH_API TensorType : public SharedType { private: TensorType( - c10::optional scalar_type, - c10::optional device, + std::optional scalar_type, + std::optional device, SymbolicShape sizes, VaryingShape strides, - c10::optional requires_grad, - c10::optional undefined = false); + std::optional requires_grad, + std::optional undefined = false); TensorTypePtr clone() const { return TensorTypePtr(new TensorType( @@ -841,11 +841,11 @@ struct TORCH_API TensorType : public SharedType { at::IntArrayRef strides, bool tensor_contiguity = false); - c10::optional scalar_type_; - c10::optional device_; + std::optional scalar_type_; + std::optional device_; SymbolicShape sizes_; VaryingShape strides_; - c10::optional requires_grad_; + std::optional requires_grad_; // we exploit the fact certain tensors must be zero in the autograd to // optimize gradient computation. Such zero tensors are currently implemented // with `UndefinedTensorImpl.` They can be handled only by special operators @@ -857,7 +857,7 @@ struct TORCH_API TensorType : public SharedType { // undefined_ may become `c10::nullopt` if the tensor was observed to be both // defined and undefined. However, no tensor type starts out with // `undefined_` set to `c10::nullopt` - c10::optional undefined_; + std::optional undefined_; // Represents whether or not this type was inferred. 
bool is_inferred_ = false; }; @@ -1144,16 +1144,16 @@ using NameList = std::vector; // This type represents a Tuple struct TORCH_API TupleType : public NamedType { - static TupleTypePtr createNamed(const c10::optional& name, + static TupleTypePtr createNamed(const std::optional& name, const std::vector& field_names, const std::vector& field_types, std::vector& field_defaults); - static TupleTypePtr createNamed(const c10::optional& name, + static TupleTypePtr createNamed(const std::optional& name, const std::vector& field_names, const std::vector& field_types); - static TupleTypePtr createNamed(const c10::optional& name, + static TupleTypePtr createNamed(const std::optional& name, const std::vector& field_names, const std::vector& field_types); @@ -1190,21 +1190,21 @@ struct TORCH_API TupleType : public NamedType { const std::shared_ptr& schema() const { return schema_; } - c10::optional> names() const; + std::optional> names() const; static const TypeKind Kind = TypeKind::TupleType; private: template static TupleTypePtr createWithSpec( - const c10::optional& name, + const std::optional& name, const std::vector& field_names, const std::vector& field_types, std::vector& field_defaults); TupleType( std::vector elements_, - c10::optional name, + std::optional name, std::shared_ptr schema); bool compare( @@ -1747,7 +1747,7 @@ inline TypePtr TensorType::fromBoolType() { return TensorType::createContiguous(at::kBool, at::kCPU, {}); } -inline c10::optional tryScalarTypeFromJitType(const Type& type) { +inline std::optional tryScalarTypeFromJitType(const Type& type) { if (type == *FloatType::get()) { return at::typeMetaToScalarType(c10::get_default_dtype()); } else if (type == *IntType::get()) { @@ -1782,13 +1782,13 @@ inline at::ScalarType scalarTypeFromJitType(const Type& type) { // If `type_hint` is an `InterfaceType`, then we can use that as a // potential supertype for `ClassType`s in the list. Otherwise, we have // no way to find and use some common interface type -TORCH_API c10::optional unifyTypes( +TORCH_API std::optional unifyTypes( const TypePtr& t1, const TypePtr& t2, bool default_to_union = false, const TypePtr& type_hint = nullptr); -TORCH_API c10::optional unifyTypeList( +TORCH_API std::optional unifyTypeList( at::ArrayRef elements, std::ostream& why_not, bool default_to_union = false, @@ -2132,7 +2132,7 @@ struct MatchTypeReturn { private: MatchTypeReturn() : reason_(c10::nullopt) {} - c10::optional reason_; // is there is no match, this contains the reason + std::optional reason_; // is there is no match, this contains the reason }; // attempt to match the type variables in formal to actual, adding them to type_env. diff --git a/aten/src/ATen/core/jit_type_base.h b/aten/src/ATen/core/jit_type_base.h index 21692db56dd87..ac2cb0528245c 100644 --- a/aten/src/ATen/core/jit_type_base.h +++ b/aten/src/ATen/core/jit_type_base.h @@ -75,7 +75,7 @@ struct SharedType; // Use this to customize how a Type is printed using `annotation_str()`. If // c10::nullopt is returned, `annotation_str()` falls through to its default // implementation. 
-using TypePrinter = std::function(const Type&)>; +using TypePrinter = std::function(const Type&)>; namespace detail { template @@ -688,7 +688,7 @@ using NamedTypePtr = std::shared_ptr; using ConstNamedTypePtr = std::shared_ptr; struct TORCH_API NamedType : public SharedType { - NamedType(TypeKind tk, c10::optional name) + NamedType(TypeKind tk, std::optional name) : SharedType(tk), name_(std::move(name)) { TORCH_INTERNAL_ASSERT( tk == TypeKind::TupleType || tk == TypeKind::FunctionType || @@ -700,12 +700,12 @@ struct TORCH_API NamedType : public SharedType { // Fully qualified name of type // Looks like: "foo.bar.Baz". - const c10::optional& name() const { + const std::optional& name() const { return name_; } private: - c10::optional name_; + std::optional name_; }; } // namespace c10 diff --git a/aten/src/ATen/core/library.cpp b/aten/src/ATen/core/library.cpp index fd349da2f8b0c..6a910d7b60a57 100644 --- a/aten/src/ATen/core/library.cpp +++ b/aten/src/ATen/core/library.cpp @@ -42,7 +42,7 @@ namespace { constexpr auto CatchAll = c10::DispatchKey::CatchAll; } // anonymous namespace -CppFunction::CppFunction(c10::KernelFunction func, c10::optional cpp_signature, std::unique_ptr schema) +CppFunction::CppFunction(c10::KernelFunction func, std::optional cpp_signature, std::unique_ptr schema) : func_(std::move(func)) , cpp_signature_(cpp_signature) , schema_(std::move(schema)) @@ -57,10 +57,10 @@ void Library::reset() { #define ERROR_CONTEXT "(Error occurred while processing ", toString(kind_), " block at ", file_, ":", line_, ")" -Library::Library(Kind kind, std::string ns, c10::optional k, const char* file, uint32_t line) +Library::Library(Kind kind, std::string ns, std::optional k, const char* file, uint32_t line) : kind_(kind) , ns_(ns == "_" ? c10::nullopt : c10::make_optional(std::move(ns))) - , dispatch_key_(k.value_or(CatchAll) == CatchAll ? c10::optional() : k) + , dispatch_key_(k.value_or(CatchAll) == CatchAll ? 
std::optional() : k) , file_(file) , line_(line) { diff --git a/aten/src/ATen/core/op_registration/infer_schema.cpp b/aten/src/ATen/core/op_registration/infer_schema.cpp index 7e0fd28f9a7b1..e280bb140220b 100644 --- a/aten/src/ATen/core/op_registration/infer_schema.cpp +++ b/aten/src/ATen/core/op_registration/infer_schema.cpp @@ -43,7 +43,7 @@ FunctionSchema make_function_schema( } // namespace infer_schema } // namespace detail -c10::optional findSchemaDifferences( +std::optional findSchemaDifferences( const FunctionSchema& lhs, const FunctionSchema& rhs) { if (lhs.arguments().size() != rhs.arguments().size()) { diff --git a/aten/src/ATen/core/op_registration/infer_schema.h b/aten/src/ATen/core/op_registration/infer_schema.h index 57409442950f2..2f845f7c4c10f 100644 --- a/aten/src/ATen/core/op_registration/infer_schema.h +++ b/aten/src/ATen/core/op_registration/infer_schema.h @@ -155,6 +155,6 @@ FunctionSchema inferFunctionSchemaSingleReturn(std::string&& name, std::string&& return detail::infer_schema::createFunctionSchemaFromTraitsSingleReturn>(std::move(name), std::move(overload_name)); } -TORCH_API c10::optional findSchemaDifferences(const FunctionSchema& inferred, const FunctionSchema& specified); +TORCH_API std::optional findSchemaDifferences(const FunctionSchema& inferred, const FunctionSchema& specified); } diff --git a/aten/src/ATen/core/op_registration/op_registration.cpp b/aten/src/ATen/core/op_registration/op_registration.cpp index 8a516e68bd0dc..0a64e0f44d7e5 100644 --- a/aten/src/ATen/core/op_registration/op_registration.cpp +++ b/aten/src/ATen/core/op_registration/op_registration.cpp @@ -17,9 +17,9 @@ void build_feature_required_feature_not_available(const char* feature) { } // namespace impl static_assert(std::is_nothrow_move_constructible< - c10::optional>::value); + std::optional>::value); static_assert(std::is_nothrow_move_assignable< - c10::optional>::value); + std::optional>::value); void RegisterOperators::checkSchemaAndRegisterOp_(Options&& options) { TORCH_CHECK( @@ -71,7 +71,7 @@ c10::FunctionSchema RegisterOperators::inferSchemaFromKernels_( opName, " because there is no kernel specified."); - c10::optional inferred_schema = c10::nullopt; + std::optional inferred_schema = c10::nullopt; for (const auto& kernel : options.kernels) { if (nullptr != kernel.inferred_function_schema.get()) { if (!inferred_schema.has_value()) { diff --git a/aten/src/ATen/core/op_registration/op_registration.h b/aten/src/ATen/core/op_registration/op_registration.h index 0b083dc6b6759..b1b1e2c47bc45 100644 --- a/aten/src/ATen/core/op_registration/op_registration.h +++ b/aten/src/ATen/core/op_registration/op_registration.h @@ -399,7 +399,7 @@ class TORCH_API RegisterOperators final { } private: - Options&& kernel(c10::optional dispatch_key, KernelFunction&& func, c10::optional cpp_signature, std::unique_ptr&& inferred_function_schema) && { + Options&& kernel(std::optional dispatch_key, KernelFunction&& func, c10::optional cpp_signature, std::unique_ptr&& inferred_function_schema) && { KernelRegistrationConfig config; config.dispatch_key = dispatch_key; config.func = std::move(func); @@ -425,13 +425,13 @@ class TORCH_API RegisterOperators final { , inferred_function_schema(nullptr) {} - c10::optional dispatch_key; + std::optional dispatch_key; KernelFunction func; - c10::optional cpp_signature; + std::optional cpp_signature; std::unique_ptr inferred_function_schema; }; - c10::optional> schemaOrName_; + std::optional> schemaOrName_; std::vector kernels; optional aliasAnalysisKind_; diff --git 
a/aten/src/ATen/core/op_registration/op_registration_test.cpp b/aten/src/ATen/core/op_registration/op_registration_test.cpp index 377cb403cdcfd..d1305ac6d9491 100644 --- a/aten/src/ATen/core/op_registration/op_registration_test.cpp +++ b/aten/src/ATen/core/op_registration/op_registration_test.cpp @@ -882,56 +882,56 @@ TEST(OperatorRegistrationTest, testAvailableArgTypes) { // optional types (with has_value() == true) - testArgTypes>::test( - c10::optional(1.5), [] (const c10::optional& v) {EXPECT_EQ(1.5, v.value());}, - c10::optional(2.5), [] (const IValue& v) {EXPECT_EQ(2.5, v.toDouble());}, + testArgTypes>::test( + std::optional(1.5), [] (const c10::optional& v) {EXPECT_EQ(1.5, v.value());}, + std::optional(2.5), [] (const IValue& v) {EXPECT_EQ(2.5, v.toDouble());}, "(float? a) -> float?"); - testArgTypes>::test( - c10::optional(1), [] (const c10::optional& v) {EXPECT_EQ(1, v.value());}, - c10::optional(2), [] (const IValue& v) {EXPECT_EQ(2, v.toInt());}, + testArgTypes>::test( + std::optional(1), [] (const c10::optional& v) {EXPECT_EQ(1, v.value());}, + std::optional(2), [] (const IValue& v) {EXPECT_EQ(2, v.toInt());}, "(int? a) -> int?"); - testArgTypes>::test( - c10::optional(true), [] (const c10::optional& v) {EXPECT_EQ(true, v.value());}, - c10::optional(false), [] (const IValue& v) {EXPECT_EQ(false, v.toBool());}, + testArgTypes>::test( + std::optional(true), [] (const c10::optional& v) {EXPECT_EQ(true, v.value());}, + std::optional(false), [] (const IValue& v) {EXPECT_EQ(false, v.toBool());}, "(bool? a) -> bool?"); - testArgTypes>::test( - c10::optional(false), [] (const c10::optional& v) {EXPECT_EQ(false, v.value());}, - c10::optional(true), [] (const IValue& v) {EXPECT_EQ(true, v.toBool());}, + testArgTypes>::test( + std::optional(false), [] (const c10::optional& v) {EXPECT_EQ(false, v.value());}, + std::optional(true), [] (const IValue& v) {EXPECT_EQ(true, v.toBool());}, "(bool? a) -> bool?"); - testArgTypes>::test( - c10::optional("string1"), [] (const c10::optional& v) {EXPECT_EQ("string1", v.value());}, - c10::optional("string2"), [] (const IValue& v) {EXPECT_EQ("string2", v.toStringRef());}, + testArgTypes>::test( + std::optional("string1"), [] (const c10::optional& v) {EXPECT_EQ("string1", v.value());}, + std::optional("string2"), [] (const IValue& v) {EXPECT_EQ("string2", v.toStringRef());}, "(str? a) -> str?"); - testArgTypes>::test( - c10::optional(dummyTensor(c10::DispatchKey::CPU)), [] (const c10::optional& v) {EXPECT_EQ(c10::DispatchKey::CPU, extractDispatchKey(v.value()));}, - c10::optional(dummyTensor(c10::DispatchKey::CUDA)), [] (const IValue& v) {EXPECT_EQ(c10::DispatchKey::CUDA, extractDispatchKey(v.toTensor()));}, + testArgTypes>::test( + std::optional(dummyTensor(c10::DispatchKey::CPU)), [] (const c10::optional& v) {EXPECT_EQ(c10::DispatchKey::CPU, extractDispatchKey(v.value()));}, + std::optional(dummyTensor(c10::DispatchKey::CUDA)), [] (const IValue& v) {EXPECT_EQ(c10::DispatchKey::CUDA, extractDispatchKey(v.toTensor()));}, "(Tensor? a) -> Tensor?"); // optional types (with has_value() == false) - testArgTypes>::test( - c10::optional(c10::nullopt), [] (const c10::optional& v) {EXPECT_FALSE(v.has_value());}, - c10::optional(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());}, + testArgTypes>::test( + std::optional(c10::nullopt), [] (const c10::optional& v) {EXPECT_FALSE(v.has_value());}, + std::optional(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());}, "(float? 
a) -> float?"); - testArgTypes>::test( - c10::optional(c10::nullopt), [] (const c10::optional& v) {EXPECT_FALSE(v.has_value());}, - c10::optional(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());}, + testArgTypes>::test( + std::optional(c10::nullopt), [] (const c10::optional& v) {EXPECT_FALSE(v.has_value());}, + std::optional(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());}, "(int? a) -> int?"); - testArgTypes>::test( - c10::optional(c10::nullopt), [] (const c10::optional& v) {EXPECT_FALSE(v.has_value());}, - c10::optional(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());}, + testArgTypes>::test( + std::optional(c10::nullopt), [] (const c10::optional& v) {EXPECT_FALSE(v.has_value());}, + std::optional(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());}, "(bool? a) -> bool?"); - testArgTypes>::test( - c10::optional(c10::nullopt), [] (const c10::optional& v) {EXPECT_FALSE(v.has_value());}, - c10::optional(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());}, + testArgTypes>::test( + std::optional(c10::nullopt), [] (const c10::optional& v) {EXPECT_FALSE(v.has_value());}, + std::optional(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());}, "(bool? a) -> bool?"); - testArgTypes>::test( - c10::optional(c10::nullopt), [] (const c10::optional& v) {EXPECT_FALSE(v.has_value());}, - c10::optional(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());}, + testArgTypes>::test( + std::optional(c10::nullopt), [] (const c10::optional& v) {EXPECT_FALSE(v.has_value());}, + std::optional(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());}, "(str? a) -> str?"); - testArgTypes>::test( - c10::optional(c10::nullopt), [] (const c10::optional& v) {EXPECT_FALSE(v.has_value());}, - c10::optional(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());}, + testArgTypes>::test( + std::optional(c10::nullopt), [] (const c10::optional& v) {EXPECT_FALSE(v.has_value());}, + std::optional(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());}, "(Tensor? a) -> Tensor?"); @@ -1136,21 +1136,21 @@ TEST(OperatorRegistrationTest, testAvailableArgTypes) { "(Tensor[] a) -> Tensor[]"); // Test optional of list (with nullopt) - testArgTypes>>::test( - c10::optional>(c10::nullopt), [] (const c10::optional>& v) {EXPECT_FALSE(v.has_value());}, - c10::optional>(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());}, + testArgTypes>>::test( + std::optional>(c10::nullopt), [] (const c10::optional>& v) {EXPECT_FALSE(v.has_value());}, + std::optional>(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());}, "(int[]? a) -> int[]?"); // Test optional of list (with empty list) - testArgTypes>>::test( - c10::optional>(c10::List({})), [] (const c10::optional>& v) {EXPECT_EQ(0, v.value().size());}, - c10::optional>(c10::List({})), [] (const IValue& v) {EXPECT_EQ(0, v.to>().size());}, + testArgTypes>>::test( + std::optional>(c10::List({})), [] (const c10::optional>& v) {EXPECT_EQ(0, v.value().size());}, + std::optional>(c10::List({})), [] (const IValue& v) {EXPECT_EQ(0, v.to>().size());}, "(int[]? 
a) -> int[]?"); // Test optional of list (with values) - testArgTypes>>::test( - c10::optional>(c10::List({1, 2})), [] (const c10::optional>& v) {expectListEquals({1, 2}, v.value());}, - c10::optional>(c10::List({3, 4})), [] (const IValue& v) {expectListEquals({3, 4}, v.to>());}, + testArgTypes>>::test( + std::optional>(c10::List({1, 2})), [] (const c10::optional>& v) {expectListEquals({1, 2}, v.value());}, + std::optional>(c10::List({3, 4})), [] (const IValue& v) {expectListEquals({3, 4}, v.to>());}, "(int[]? a) -> int[]?"); // Test list of optional (with empty list) @@ -1161,8 +1161,8 @@ TEST(OperatorRegistrationTest, testAvailableArgTypes) { // Test list of optional (with values) testArgTypes>>::test( - c10::List<::std::optional>(c10::List<::std::optional>({3, c10::nullopt, 2})), [] (const c10::List<::std::optional>& v) {expectListEquals>({3, c10::nullopt, 2}, v);}, - c10::List<::std::optional>(c10::List<::std::optional>({3, c10::nullopt, 2})), [] (const IValue& v) {expectListEquals>({3, c10::nullopt, 2}, v.to>>());}, + c10::List<::std::optional>(c10::List<::std::optional>({3, c10::nullopt, 2})), [] (const c10::List<::std::optional>& v) {expectListEquals>({3, c10::nullopt, 2}, v);}, + c10::List<::std::optional>(c10::List<::std::optional>({3, c10::nullopt, 2})), [] (const IValue& v) {expectListEquals>({3, c10::nullopt, 2}, v.to>>());}, "(int?[] a) -> int?[]"); // dict types diff --git a/aten/src/ATen/core/operator_name.h b/aten/src/ATen/core/operator_name.h index 6440a695b55ec..5ba01b4a7df58 100644 --- a/aten/src/ATen/core/operator_name.h +++ b/aten/src/ATen/core/operator_name.h @@ -23,7 +23,7 @@ struct OperatorName final { // Return the namespace of this OperatorName, if it exists. The // returned string_view is only live as long as the OperatorName // exists and name is not mutated - c10::optional getNamespace() const { + std::optional getNamespace() const { auto pos = name.find("::"); if (pos == std::string::npos) { return c10::nullopt; diff --git a/aten/src/ATen/core/tensor_type.cpp b/aten/src/ATen/core/tensor_type.cpp index c7f8c8b05f91e..9110b4261d396 100644 --- a/aten/src/ATen/core/tensor_type.cpp +++ b/aten/src/ATen/core/tensor_type.cpp @@ -274,12 +274,12 @@ TensorTypePtr TensorType::create(const at::Tensor& t) { } TensorTypePtr TensorType::create( - c10::optional scalar_type, - c10::optional device, + std::optional scalar_type, + std::optional device, const VaryingShape& sizes, const VaryingShape& strides, - c10::optional requires_grad, - c10::optional undefined, bool tensor_contiguity) { + std::optional requires_grad, + std::optional undefined, bool tensor_contiguity) { if(strides.concrete_sizes() && strides.concrete_sizes().has_value()){ // handles case where strides are set // NOLINTNEXTLINE(bugprone-unchecked-optional-access) @@ -304,22 +304,22 @@ TensorTypePtr TensorType::create( } TensorTypePtr TensorType::create( - c10::optional scalar_type, - c10::optional device, + std::optional scalar_type, + std::optional device, const SymbolicShape& sizes, const VaryingShape& strides, - c10::optional requires_grad, - c10::optional undefined) { + std::optional requires_grad, + std::optional undefined) { auto pt = TensorTypePtr(new TensorType( scalar_type, device, sizes, strides, requires_grad, undefined)); return pt; } TensorTypePtr TensorType::create( - c10::optional scalar_type, - c10::optional device, - c10::optional dim, - c10::optional requires_grad) { + std::optional scalar_type, + std::optional device, + std::optional dim, + std::optional requires_grad) { return 
TensorType::create( scalar_type, device, @@ -349,7 +349,7 @@ VaryingShape TensorType::sizes() const { fmap(*sizes_.sizes(), [](ShapeSymbol ss) { // we turn symbolic shapes into unknowns return ss.is_static() - ? c10::optional(ss.static_size()) + ? std::optional(ss.static_size()) : c10::nullopt; })); } @@ -371,7 +371,7 @@ TensorTypePtr TensorType::merge(const TensorType& other, bool merge_sizes) const } template -bool is_null_or_equal(c10::optional a, c10::IntArrayRef b) { +bool is_null_or_equal(std::optional a, c10::IntArrayRef b) { return !a.has_value() || a.value() == b; } @@ -417,7 +417,7 @@ VaryingShape TensorType::strides() const { if (!strides_.size().has_value()) { return VaryingShape(); } - std::vector> ss(*strides_.size()); + std::vector> ss(*strides_.size()); for (size_t i = 0; i < *strides_.size(); i++) { if (!strides_[i].has_value()) { continue; @@ -431,12 +431,12 @@ VaryingShape TensorType::strides() const { } TensorType::TensorType( - c10::optional scalar_type, - c10::optional device, + std::optional scalar_type, + std::optional device, SymbolicShape sizes, VaryingShape strides, - c10::optional requires_grad, - c10::optional undefined) + std::optional requires_grad, + std::optional undefined) : SharedType(TypeKind::TensorType), scalar_type_(scalar_type), device_(device), diff --git a/aten/src/ATen/core/type.cpp b/aten/src/ATen/core/type.cpp index f7d67ca84861a..572b15a118b36 100644 --- a/aten/src/ATen/core/type.cpp +++ b/aten/src/ATen/core/type.cpp @@ -364,7 +364,7 @@ SymBoolTypePtr SymBoolType::get() { return value; } -static c10::optional unifyTypesImpl(const TypePtr& t1, const TypePtr& t2, bool default_to_union=false, const TypePtr& type_hint=nullptr) { +static std::optional unifyTypesImpl(const TypePtr& t1, const TypePtr& t2, bool default_to_union=false, const TypePtr& type_hint=nullptr) { // check direct subtyping relation if (t1->isSubtypeOf(*t2)) { return t2; @@ -446,7 +446,7 @@ static c10::optional unifyTypesImpl(const TypePtr& t1, const TypePtr& t return c10::nullopt; } -c10::optional unifyTypes(const TypePtr& t1, const TypePtr& t2, bool default_to_union, const TypePtr& type_hint) { +std::optional unifyTypes(const TypePtr& t1, const TypePtr& t2, bool default_to_union, const TypePtr& type_hint) { auto unified = unifyTypesImpl(t1, t2, default_to_union, type_hint); if (default_to_union && !unified) { @@ -456,7 +456,7 @@ c10::optional unifyTypes(const TypePtr& t1, const TypePtr& t2, bool def return unified; } -c10::optional unifyTypeList( +std::optional unifyTypeList( at::ArrayRef elements, std::ostream& why_not, bool default_to_union, @@ -468,7 +468,7 @@ c10::optional unifyTypeList( TypePtr ret_type = elements.at(0); for (size_t i = 1; i < elements.size() && ret_type; ++i) { - c10::optional maybe_unified = unifyTypes(ret_type, elements.at(i), default_to_union, type_hint); + std::optional maybe_unified = unifyTypes(ret_type, elements.at(i), default_to_union, type_hint); if (!maybe_unified) { why_not << "Could not unify type list since element " << i << " of type " << elements.at(i)->repr_str() @@ -719,7 +719,7 @@ bool Type::is_module() const { } TupleTypePtr TupleType::createNamed( - const c10::optional& qualName, + const std::optional& qualName, const std::vector& field_names, const std::vector& field_types) { std::vector empty_defaults; @@ -727,7 +727,7 @@ TupleTypePtr TupleType::createNamed( } TupleTypePtr TupleType::createNamed( - const c10::optional& qualName, + const std::optional& qualName, const std::vector& field_names, const std::vector& field_types) { 
std::vector empty_defaults; @@ -735,7 +735,7 @@ TupleTypePtr TupleType::createNamed( } TupleTypePtr TupleType::createNamed( - const c10::optional& qualName, + const std::optional& qualName, const std::vector& field_names, const std::vector& field_types, std::vector& field_defaults) { @@ -743,7 +743,7 @@ TupleTypePtr TupleType::createNamed( } template -TupleTypePtr TupleType::createWithSpec(const c10::optional& qualName, +TupleTypePtr TupleType::createWithSpec(const std::optional& qualName, const std::vector& field_names, const std::vector& field_types, std::vector& field_defaults) { @@ -784,7 +784,7 @@ TupleTypePtr TupleType::createWithSpec(const c10::optional& field_types, qualName, std::move(schema))); // NOLINT(modernize-make-shared) } -c10::optional> TupleType::names() const { +std::optional> TupleType::names() const { if (!schema_) { return {}; } @@ -820,7 +820,7 @@ bool NumberType::isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const { TupleType::TupleType( std::vector elements, - c10::optional name, + std::optional name, std::shared_ptr schema) : NamedType(TypeKind::TupleType, std::move(name)), elements_(std::move(elements)), diff --git a/aten/src/ATen/core/union_type.cpp b/aten/src/ATen/core/union_type.cpp index 2acc4c497ba56..4039e2a4418f9 100644 --- a/aten/src/ATen/core/union_type.cpp +++ b/aten/src/ATen/core/union_type.cpp @@ -29,7 +29,7 @@ ListTypePtr ListType::ofOptionalTensors() { namespace { -c10::optional subtractTypeSetFrom(std::vector& to_subtract, ArrayRef from) { +std::optional subtractTypeSetFrom(std::vector& to_subtract, ArrayRef from) { std::vector types; // Given a TypePtr `lhs`, this function says whether or not `lhs` (or @@ -93,7 +93,7 @@ void filterDuplicateSubtypes(std::vector* types) { if (types->empty()) { return; } - auto get_supertype = [](const TypePtr& t1, const TypePtr& t2) -> c10::optional { + auto get_supertype = [](const TypePtr& t1, const TypePtr& t2) -> std::optional { // We don't want nested Optionals. 
Also, prematurely unifying to // `Optional` could prevent us from coalescing other types if ((t1->isSubtypeOf(*NoneType::get()) && !t2->isSubtypeOf(*NoneType::get())) @@ -114,7 +114,7 @@ void filterDuplicateSubtypes(std::vector* types) { size_t end_idx = types->size()-1; for (size_t i = types->size()-1; i > 0; --i) { for (size_t j = std::min(i-1, end_idx); ; --j) { - c10::optional unified; + std::optional unified; unified = get_supertype((*types)[i], (*types)[j]); if (unified) { (*types)[j] = *unified; @@ -272,11 +272,11 @@ UnionTypePtr UnionType::create(std::vector reference) { return union_type; } -c10::optional UnionType::subtractTypeSet(std::vector& to_subtract) const { +std::optional UnionType::subtractTypeSet(std::vector& to_subtract) const { return subtractTypeSetFrom(to_subtract, containedTypes()); } -c10::optional UnionType::toOptional() const { +std::optional UnionType::toOptional() const { if (!canHoldType(*NoneType::get())) { return c10::nullopt; } @@ -432,7 +432,7 @@ bool UnionType::canHoldType(const Type& type) const { bool OptionalType::equals(const Type& rhs) const { if (auto union_rhs = rhs.cast()) { auto optional_rhs = union_rhs->toOptional(); - // `**optional_rhs` = `*` to get value of `c10::optional`, + // `**optional_rhs` = `*` to get value of `std::optional`, // then `*` to dereference the pointer return optional_rhs && *this == **optional_rhs; } else if (auto optional_rhs = rhs.cast()) { diff --git a/aten/src/ATen/cuda/CachingHostAllocator.cpp b/aten/src/ATen/cuda/CachingHostAllocator.cpp index f4f22711d61a3..9ae49113dc8a2 100644 --- a/aten/src/ATen/cuda/CachingHostAllocator.cpp +++ b/aten/src/ATen/cuda/CachingHostAllocator.cpp @@ -105,7 +105,7 @@ struct CUDACachingHostAllocatorImpl } void record_stream( - c10::optional>& events, + std::optional>& events, CUDAStream stream) override { auto event = create_event_internal(stream.device_index()); event->record(stream); diff --git a/aten/src/ATen/cuda/EmptyTensor.cpp b/aten/src/ATen/cuda/EmptyTensor.cpp index a3cd55f4b2b7b..269b4a3ecfc11 100644 --- a/aten/src/ATen/cuda/EmptyTensor.cpp +++ b/aten/src/ATen/cuda/EmptyTensor.cpp @@ -8,8 +8,8 @@ namespace at::detail { TensorBase empty_cuda( IntArrayRef size, ScalarType dtype, - c10::optional device_opt, - c10::optional memory_format_opt) { + std::optional device_opt, + std::optional memory_format_opt) { at::globalContext().lazyInitCUDA(); const auto device = device_or_default(device_opt); TORCH_INTERNAL_ASSERT(device.is_cuda()); @@ -22,11 +22,11 @@ TensorBase empty_cuda( TensorBase empty_cuda( IntArrayRef size, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt, - c10::optional memory_format_opt) { + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt, + std::optional memory_format_opt) { TORCH_CHECK(!pin_memory_opt.has_value() || !*pin_memory_opt, "Only dense CPU tensors can be pinned"); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout_or_default(layout_opt) == Layout::Strided); @@ -49,7 +49,7 @@ TensorBase empty_strided_cuda( IntArrayRef size, IntArrayRef stride, ScalarType dtype, - c10::optional device_opt) { + std::optional device_opt) { at::globalContext().lazyInitCUDA(); const auto device = device_or_default(device_opt); TORCH_INTERNAL_ASSERT(device.is_cuda()); @@ -63,10 +63,10 @@ TensorBase empty_strided_cuda( TensorBase empty_strided_cuda( IntArrayRef size, IntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional 
device_opt, - c10::optional pin_memory_opt) { + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt) { TORCH_CHECK(!pin_memory_opt.has_value() || !*pin_memory_opt, "Only dense CPU tensors can be pinned"); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout_or_default(layout_opt) == Layout::Strided); diff --git a/aten/src/ATen/cuda/EmptyTensor.h b/aten/src/ATen/cuda/EmptyTensor.h index 18733f0beb30b..2fd88a94b75d2 100644 --- a/aten/src/ATen/cuda/EmptyTensor.h +++ b/aten/src/ATen/cuda/EmptyTensor.h @@ -6,16 +6,16 @@ namespace at::detail { TORCH_CUDA_CPP_API TensorBase empty_cuda( IntArrayRef size, ScalarType dtype, - c10::optional device_opt, - c10::optional memory_format_opt); + std::optional device_opt, + std::optional memory_format_opt); TORCH_CUDA_CPP_API TensorBase empty_cuda( IntArrayRef size, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt, - c10::optional memory_format_opt); + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt, + std::optional memory_format_opt); TORCH_CUDA_CPP_API TensorBase empty_cuda( IntArrayRef size, @@ -25,15 +25,15 @@ TORCH_CUDA_CPP_API TensorBase empty_strided_cuda( IntArrayRef size, IntArrayRef stride, ScalarType dtype, - c10::optional device_opt); + std::optional device_opt); TORCH_CUDA_CPP_API TensorBase empty_strided_cuda( IntArrayRef size, IntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt); + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt); TORCH_CUDA_CPP_API TensorBase empty_strided_cuda( IntArrayRef size, diff --git a/aten/src/ATen/cuda/PinnedMemoryAllocator.cpp b/aten/src/ATen/cuda/PinnedMemoryAllocator.cpp index 973027cd87f61..0c3e37825640d 100644 --- a/aten/src/ATen/cuda/PinnedMemoryAllocator.cpp +++ b/aten/src/ATen/cuda/PinnedMemoryAllocator.cpp @@ -8,13 +8,13 @@ namespace at::native { -bool is_pinned_cuda(const Tensor& self, c10::optional device) { +bool is_pinned_cuda(const Tensor& self, std::optional device) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!device.has_value() || device->is_cuda()); // TODO: unhook this return detail::getCUDAHooks().isPinnedPtr(self.storage().data()); } -Tensor _pin_memory_cuda(const Tensor& self, c10::optional device) { +Tensor _pin_memory_cuda(const Tensor& self, std::optional device) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!device.has_value() || device->is_cuda()); auto* allocator = at::cuda::getPinnedMemoryAllocator(); auto storage = Storage( diff --git a/aten/src/ATen/cudnn/AutocastRNN.cpp b/aten/src/ATen/cudnn/AutocastRNN.cpp index 083d435975c7c..2677e52df0929 100644 --- a/aten/src/ATen/cudnn/AutocastRNN.cpp +++ b/aten/src/ATen/cudnn/AutocastRNN.cpp @@ -22,9 +22,9 @@ std::tuple _cudnn_rnn_cast_reflatten(const Tensor & input, TensorList weight, int64_t weight_stride0, - const c10::optional& weight_buf_opt, + const std::optional& weight_buf_opt, const Tensor& hx, - const c10::optional& cx, + const std::optional& cx, int64_t mode, int64_t hidden_size, int64_t proj_size, @@ -34,7 +34,7 @@ _cudnn_rnn_cast_reflatten(const Tensor & input, bool train, bool bidirectional, IntArrayRef batch_sizes, - const c10::optional& dropout_state) { + const std::optional& dropout_state) { #if AT_CUDNN_ENABLED() c10::impl::ExcludeDispatchKeyGuard no_autocast(DispatchKey::Autocast); diff --git 
a/aten/src/ATen/functorch/BatchRulesBinaryOps.cpp b/aten/src/ATen/functorch/BatchRulesBinaryOps.cpp index 44ca2802bf3a2..e7a914c1e0f69 100644 --- a/aten/src/ATen/functorch/BatchRulesBinaryOps.cpp +++ b/aten/src/ATen/functorch/BatchRulesBinaryOps.cpp @@ -303,7 +303,7 @@ static std::tuple> log_sigmoid_backward_batch_rule( return std::make_tuple(at::log_sigmoid_backward(out_grad, out_self, out_buffer), 0); } -static Tensor binomial_wrapper(const Tensor& count, const Tensor& prob, c10::optional gen) { +static Tensor binomial_wrapper(const Tensor& count, const Tensor& prob, std::optional gen) { return at::binomial(count, prob.contiguous(), std::move(gen)); // Bug in PyTorch, prob shouldn't need to be contiguous } @@ -457,7 +457,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { using TensorScalarInplaceT = Tensor& (Tensor::*)(const Tensor&, const Scalar&) const; using ScalarScalarInplaceT = Tensor& (Tensor::*)(const Scalar&, const Scalar&) const; using TensorInplaceT = Tensor& (Tensor::*)(const Tensor&) const; - using TensorInplaceModeT = Tensor& (Tensor::*)(const Tensor&, c10::optional) const; + using TensorInplaceModeT = Tensor& (Tensor::*)(const Tensor&, std::optional) const; using ScalarInplaceT = Tensor& (Tensor::*)(const Scalar&) const; using CopyT = Tensor& (Tensor::*)(const Tensor&, bool) const; @@ -471,7 +471,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { VMAP_SUPPORT2(mul_, Tensor, SINGLE_ARG(binary_pointwise_inplace_batch_rule)); VMAP_SUPPORT2(mul_, Scalar, SINGLE_ARG(unary_inplace_batch_rule)); VMAP_SUPPORT2(div_, Tensor, SINGLE_ARG(binary_pointwise_inplace_batch_rule)); - VMAP_SUPPORT2(div_, Tensor_mode, SINGLE_ARG(binary_pointwise_inplace_batch_rule>)); + VMAP_SUPPORT2(div_, Tensor_mode, SINGLE_ARG(binary_pointwise_inplace_batch_rule>)); VMAP_SUPPORT2(div_, Scalar, SINGLE_ARG(unary_inplace_batch_rule)); VMAP_SUPPORT2(clamp_min_, Tensor, SINGLE_ARG(binary_pointwise_inplace_batch_rule)); VMAP_SUPPORT2(clamp_max_, Tensor, SINGLE_ARG(binary_pointwise_inplace_batch_rule)); diff --git a/aten/src/ATen/functorch/BatchRulesConvolution.cpp b/aten/src/ATen/functorch/BatchRulesConvolution.cpp index ca4eda19a36fb..dd24207e7e778 100644 --- a/aten/src/ATen/functorch/BatchRulesConvolution.cpp +++ b/aten/src/ATen/functorch/BatchRulesConvolution.cpp @@ -124,7 +124,7 @@ convolution_batch_rule(const Tensor& lhs, optional lhs_bdim, const Tens } static Tensor _convolution_decomp( - const Tensor& input_r, const Tensor& weight_r, const c10::optional& bias_r_opt, + const Tensor& input_r, const Tensor& weight_r, const std::optional& bias_r_opt, IntArrayRef stride_, IntArrayRef padding_, IntArrayRef dilation_, bool transposed_, IntArrayRef output_padding_, int64_t groups_, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) { diff --git a/aten/src/ATen/functorch/BatchRulesFactory.cpp b/aten/src/ATen/functorch/BatchRulesFactory.cpp index f317fee6af6c7..1edce4f52e271 100644 --- a/aten/src/ATen/functorch/BatchRulesFactory.cpp +++ b/aten/src/ATen/functorch/BatchRulesFactory.cpp @@ -107,11 +107,11 @@ static std::tuple> linspace_logspace_batch_rule_helper( const at::Tensor& start, optional start_bdim, const at::Tensor& end, optional end_bdim, int64_t steps, - c10::optional base, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) + std::optional base, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { auto batch_size = get_bdim_size2(start, start_bdim, end, end_bdim); auto start_ = 
ensure_has_bdim(start, start_bdim.has_value(), batch_size); @@ -145,10 +145,10 @@ static std::tuple> linspace_Tensor_Tensor_batch_rule( const at::Tensor& start, optional start_bdim, const at::Tensor& end, optional end_bdim, int64_t steps, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory){ + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory){ return linspace_logspace_batch_rule_helper(start, start_bdim, end, end_bdim, steps, c10::nullopt, dtype, layout, device, pin_memory); } @@ -156,10 +156,10 @@ static std::tuple> linspace_Tensor_Scalar_batch_rule( const at::Tensor& start, optional start_bdim, const at::Scalar& end, int64_t steps, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory){ + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory){ auto end_t = at::native::wrapped_scalar_tensor(end, start.device()); return linspace_logspace_batch_rule_helper(start, start_bdim, end_t, c10::nullopt, steps, c10::nullopt, dtype, layout, device, pin_memory); @@ -169,10 +169,10 @@ static std::tuple> linspace_Scalar_Tensor_batch_rule( const at::Scalar& start, const at::Tensor& end, optional end_bdim, int64_t steps, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory){ + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory){ auto start_t = at::native::wrapped_scalar_tensor(start, end.device()); return linspace_logspace_batch_rule_helper(start_t, c10::nullopt, end, end_bdim, steps, c10::nullopt, dtype, layout, device, pin_memory); @@ -183,10 +183,10 @@ static std::tuple> logspace_Tensor_Tensor_batch_rule( const at::Tensor& end, optional end_bdim, int64_t steps, double base, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory){ + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory){ return linspace_logspace_batch_rule_helper(start, start_bdim, end, end_bdim, steps, c10::make_optional(base), dtype, layout, device, pin_memory); } @@ -195,10 +195,10 @@ static std::tuple> logspace_Tensor_Scalar_batch_rule( const at::Scalar& end, int64_t steps, double base, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory){ + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory){ auto end_t = at::native::wrapped_scalar_tensor(end, start.device()); return linspace_logspace_batch_rule_helper(start, start_bdim, end_t, c10::nullopt, steps, c10::make_optional(base), dtype, layout, device, pin_memory); @@ -209,10 +209,10 @@ static std::tuple> logspace_Scalar_Tensor_batch_rule( const at::Tensor& end, optional end_bdim, int64_t steps, double base, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory){ + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory){ auto start_t = at::native::wrapped_scalar_tensor(start, end.device()); return linspace_logspace_batch_rule_helper(start_t, c10::nullopt, end, end_bdim, steps, c10::make_optional(base), dtype, layout, device, pin_memory); diff --git a/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp b/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp index 6a17adb4e268c..511a0a6d45450 100644 --- a/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp +++ 
b/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp @@ -157,9 +157,9 @@ void _linalg_check_errors_batch_rule(const Tensor& info, optional info_ at::_linalg_check_errors(info_, api_name, false); } -std::tuple> -householder_product_batch_rule(const Tensor &input, c10::optional input_bdim, - const Tensor &tau, c10::optional tau_bdim) +std::tuple> +householder_product_batch_rule(const Tensor &input, std::optional input_bdim, + const Tensor &tau, std::optional tau_bdim) { auto input_ = moveBatchDimToFront(input, input_bdim); auto tau_ = moveBatchDimToFront(tau, tau_bdim); @@ -330,8 +330,8 @@ oneOutput linalg_lu_solve_batch_rule( } oneOutput cholesky_solve_batch_rule( - const Tensor& self, c10::optional self_bdim, - const Tensor& A, c10::optional A_bdim, + const Tensor& self, std::optional self_bdim, + const Tensor& A, std::optional A_bdim, bool upper) { TORCH_CHECK(rankWithoutBatchDim(self, self_bdim) >= 2, "b should have at least 2 dimensions, but has ", self.dim(), " dimensions instead"); @@ -345,14 +345,14 @@ oneOutput cholesky_solve_batch_rule( } threeOutputs linalg_lu_factor_ex_batch_rule( - const Tensor& A, c10::optional A_bdim, bool pivot, bool check_errors) { + const Tensor& A, std::optional A_bdim, bool pivot, bool check_errors) { TORCH_CHECK(rankWithoutBatchDim(A, A_bdim) >= 2, "torch.lu_factor_ex: Expected tensor with 2 or more dimensions. Got size: ", A.sizes(), " instead"); const auto A_ = moveBatchDimToFront(A, A_bdim); const auto res = at::linalg_lu_factor_ex(A_, pivot, check_errors); return std::make_tuple(std::get<0>(res), 0, std::get<1>(res), 0, std::get<2>(res), 0); } -oneOutput matrix_exp_batch_rule(const Tensor& self, c10::optional self_bdim) { +oneOutput matrix_exp_batch_rule(const Tensor& self, std::optional self_bdim) { TORCH_CHECK(rankWithoutBatchDim(self, self_bdim) >= 2, "torch.matrix_exp: The input tensor A must have at least 2 dimensions."); const auto self_ = moveBatchDimToFront(self, self_bdim).contiguous(); // seems to be a bug return std::make_tuple(at::matrix_exp(self_), 0); @@ -400,8 +400,8 @@ fourOutputs solve_ex_batch_rule( return std::make_tuple(std::get<0>(res), 0, std::get<1>(res), 0, std::get<2>(res), 0, std::get<3>(res), 0); } -oneOutput cross_batch_rule(const Tensor& self, c10::optional self_bdim, - const Tensor& other, c10::optional other_bdim, const int64_t dim) { +oneOutput cross_batch_rule(const Tensor& self, std::optional self_bdim, + const Tensor& other, std::optional other_bdim, const int64_t dim) { // match cross dimension checks TORCH_CHECK(rankWithoutBatchDim(self, self_bdim) == rankWithoutBatchDim(other, other_bdim), "linalg.cross: inputs must have the same number of dimensions." 
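Throughout the batch rules in this diff, every tensor argument travels with an optional batch-dimension index, and the change is purely a respelling of that optional from c10::optional to std::optional. A minimal, self-contained sketch of the convention, using a hypothetical FakeTensor and rank_without_batch_dim helper in place of the real at::Tensor machinery, might look like this:

#include <cstdint>
#include <iostream>
#include <optional>
#include <string>

// Stand-in for a real tensor; only the logical rank matters for this sketch.
struct FakeTensor {
  std::string name;
  int64_t rank = 2;  // logical rank including any batch dim
};

// A batched argument carries an optional batch-dim index; std::nullopt means
// "not batched at the current vmap level".
static int64_t rank_without_batch_dim(const FakeTensor& t,
                                      std::optional<int64_t> bdim) {
  return bdim.has_value() ? t.rank - 1 : t.rank;
}

int main() {
  FakeTensor a{"a", 3};
  std::cout << rank_without_batch_dim(a, std::optional<int64_t>(0)) << '\n';  // 2
  std::cout << rank_without_batch_dim(a, std::nullopt) << '\n';               // 3
}

An empty optional plays the same role c10::nullopt did before the rename: the argument contributes no batch dimension, so rank checks like the TORCH_CHECKs above operate on the unbatched rank.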
@@ -418,16 +418,16 @@ oneOutput cross_batch_rule(const Tensor& self, c10::optional self_bdim, return std::make_tuple(linalg_cross(self_, other_, dim_), 0); } -c10::optional batch_dim_if_not_empty(const Tensor& t) { +std::optional batch_dim_if_not_empty(const Tensor& t) { if (t.dim() == 1 && t.size(0) == 0) { - return c10::optional(); + return std::optional(); } - return c10::optional(0); + return std::optional(0); } fourOutputs linalg_lstsq_batch_rule( - const Tensor& self, c10::optional self_bdim, const Tensor& b, c10::optional b_bdim, - c10::optional rcond, c10::optional driver) { + const Tensor& self, std::optional self_bdim, const Tensor& b, c10::optional b_bdim, + std::optional rcond, c10::optional driver) { TORCH_CHECK(rankWithoutBatchDim(self, self_bdim) >= 2, "torch.linalg.lstsq: input must have at least 2 dimensions."); TORCH_CHECK(rankWithoutBatchDim(b, b_bdim) >= 1, "torch.linalg.lstsq: other must have at least 1 dimension."); @@ -449,7 +449,7 @@ fourOutputs linalg_lstsq_batch_rule( } template -std::tuple> +std::tuple> atol_rtol_tensor_batch_rule( F Func, const Tensor& input, optional input_bdim, const optional& atol, const optional atol_bdim, @@ -478,11 +478,11 @@ atol_rtol_tensor_batch_rule( return std::make_tuple(Func(input_, atol_, rtol_, hermitian), 0); } -static std::tuple> +static std::tuple> pinv_batch_rule( - const Tensor& input, c10::optional input_bdim, const optional& atol, - const c10::optional atol_bdim, const optional& rtol, - const c10::optional rtol_bdim, bool hermitian) { + const Tensor& input, std::optional input_bdim, const optional& atol, + const std::optional atol_bdim, const optional& rtol, + const std::optional rtol_bdim, bool hermitian) { return atol_rtol_tensor_batch_rule(ATEN_FN2(linalg_pinv, atol_rtol_tensor), input, input_bdim, atol, atol_bdim, rtol, rtol_bdim, hermitian, "linalg.pinv"); } } diff --git a/aten/src/ATen/functorch/BatchRulesLoss.cpp b/aten/src/ATen/functorch/BatchRulesLoss.cpp index 22f3adff95a01..cd5ef41d4069f 100644 --- a/aten/src/ATen/functorch/BatchRulesLoss.cpp +++ b/aten/src/ATen/functorch/BatchRulesLoss.cpp @@ -123,7 +123,7 @@ static Tensor binary_cross_entropy_plumbing( static Tensor binary_cross_entropy_backward_plumbing( const Tensor& grad, const Tensor& input, const Tensor& target, - const c10::optional& weight_opt, int64_t reduction) { + const std::optional& weight_opt, int64_t reduction) { auto maybe_layer = maybeCurrentDynamicLayer(); vmap_check_escaped(maybe_layer, "binary_cross_entropy_backward_plumbing"); int64_t cur_level = maybe_layer->layerId(); diff --git a/aten/src/ATen/functorch/BatchRulesNorm.cpp b/aten/src/ATen/functorch/BatchRulesNorm.cpp index faf39d8e374a3..89a23fe0298d7 100644 --- a/aten/src/ATen/functorch/BatchRulesNorm.cpp +++ b/aten/src/ATen/functorch/BatchRulesNorm.cpp @@ -45,10 +45,10 @@ template std::tuple,Tensor,optional,Tensor,optional> batch_norm_batch_rule( const Tensor& input, optional input_bdim, - const c10::optional& weight_opt, optional weight_bdim, - const c10::optional& bias_opt, optional bias_bdim, - const c10::optional& running_mean_opt, optional running_mean_bdim, - const c10::optional& running_var_opt, optional running_var_bdim, + const std::optional& weight_opt, optional weight_bdim, + const std::optional& bias_opt, optional bias_bdim, + const std::optional& running_mean_opt, optional running_mean_bdim, + const std::optional& running_var_opt, optional running_var_bdim, bool training, double momentum, double eps) { c10::MaybeOwned weight_maybe_owned = 
at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; @@ -63,7 +63,7 @@ batch_norm_batch_rule( "were not batched.\nIf you are using a module and do not need eval mode, please set `track_running_stats` to be False.", "If you are using a prebuilt module and do not need eval mode, please see the functorch website for resources on ", "how to patch your module to work with vmap"); - c10::optional bdim_size; + std::optional bdim_size; Tensor result0; Tensor mean; Tensor rstd; @@ -80,8 +80,8 @@ batch_norm_batch_rule( input_ = ensure_has_bdim(input_, input_bdim.has_value(), bdim_size.value()); input_ = reshape_dim_into(0, /*channels dim*/1, input_); - c10::optional running_mean_; - c10::optional running_var_; + std::optional running_mean_; + std::optional running_var_; if (running_mean.defined()) { running_mean_ = moveBatchDimToFront(running_mean, running_mean_bdim); running_mean_ = ensure_has_bdim(*running_mean_, running_mean_bdim.has_value(), bdim_size.value()); @@ -127,8 +127,8 @@ template std::tuple> batch_norm_backward_no_weight_bias_batch_rule( const at::Tensor & grad_out, optional grad_out_bdim, const at::Tensor & input, optional input_bdim, - const c10::optional & running_mean_opt, optional running_mean_bdim, - const c10::optional & running_var_opt, optional running_var_bdim, + const std::optional & running_mean_opt, optional running_mean_bdim, + const std::optional & running_var_opt, optional running_var_bdim, const at::Tensor & mean, optional mean_bdim, const at::Tensor & rstd, optional rstd_bdim, bool training, double eps) { @@ -199,11 +199,11 @@ template std::tuple batch_norm_backward_plumbing( const at::Tensor & grad_out, const at::Tensor & input, - const c10::optional & weight_opt, - const c10::optional & running_mean_opt, - const c10::optional & running_var_opt, - const c10::optional & save_mean_opt, - const c10::optional & save_rstd_opt, + const std::optional & weight_opt, + const std::optional & running_mean_opt, + const std::optional & running_var_opt, + const std::optional & save_mean_opt, + const std::optional & save_rstd_opt, bool training, double eps, std::array output_mask) { @@ -284,8 +284,8 @@ std::tuple batch_norm_backward_plumbing( } static std::tuple native_group_norm_plumbing( - const Tensor & input, const c10::optional & weight_opt, - const c10::optional & bias_opt, int64_t N, int64_t C, + const Tensor & input, const std::optional & weight_opt, + const std::optional & bias_opt, int64_t N, int64_t C, int64_t HxW, int64_t group, double eps) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); @@ -372,7 +372,7 @@ static std::tuple> group_norm_backward_no_weight_bi static std::tuple native_group_norm_backward_plumbing( const Tensor & grad_out, const Tensor & input, const Tensor & mean, - const Tensor & rstd, const c10::optional & weight_opt, + const Tensor & rstd, const std::optional & weight_opt, int64_t N, int64_t C, int64_t HxW, int64_t group, std::array output_mask ) { // See [Note: hacky wrapper removal for optional tensor] @@ -488,8 +488,8 @@ static std::tuple,Tensor,optional,Tensor,optio native_layer_norm_batch_rule( const Tensor& input, optional input_bdim, c10::SymIntArrayRef normalized_shape, - const c10::optional& weight_opt, optional weight_bdim, - const c10::optional& bias_opt, optional bias_bdim, + const std::optional& weight_opt, optional weight_bdim, + const std::optional& bias_opt, optional bias_bdim, double eps) { auto input_ = 
moveBatchDimToFront(input, input_bdim); if (!weight_bdim && !bias_bdim) { @@ -573,8 +573,8 @@ static std::tuple native_layer_norm_backward_p at::IntArrayRef normalized_shape, const at::Tensor & mean, const at::Tensor & rstd, - const c10::optional & weight_opt, - const c10::optional & bias_opt, + const std::optional & weight_opt, + const std::optional & bias_opt, std::array output_mask) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); @@ -653,10 +653,10 @@ template struct NativeBatchNormBatchRuleHelper { static std::tuple,Tensor,optional,Tensor,optional> apply( const Tensor& input, optional input_bdim, - const c10::optional& weight_opt, optional weight_bdim, - const c10::optional& bias_opt, optional bias_bdim, - const c10::optional& running_mean_opt, optional running_mean_bdim, - const c10::optional& running_var_opt, optional running_var_bdim, + const std::optional& weight_opt, optional weight_bdim, + const std::optional& bias_opt, optional bias_bdim, + const std::optional& running_mean_opt, optional running_mean_bdim, + const std::optional& running_var_opt, optional running_var_bdim, bool training, double momentum, double eps) { return batch_norm_batch_rule( input, input_bdim, weight_opt, weight_bdim, bias_opt, bias_bdim, @@ -669,9 +669,9 @@ struct CudnnBatchNormBatchRuleHelper { static std::tuple,Tensor,optional,Tensor,optional,Tensor,optional> apply( const Tensor& input, optional input_bdim, const Tensor& weight_opt, optional weight_bdim, - const c10::optional& bias_opt, optional bias_bdim, - const c10::optional& running_mean_opt, optional running_mean_bdim, - const c10::optional& running_var_opt, optional running_var_bdim, + const std::optional& bias_opt, optional bias_bdim, + const std::optional& running_mean_opt, optional running_mean_bdim, + const std::optional& running_var_opt, optional running_var_bdim, bool training, double momentum, double eps) { auto reserve = at::empty({0}, input.options().dtype(kByte)); // in experiments, reserve was never set to anything other than empty by cuda auto res = batch_norm_batch_rule( @@ -686,9 +686,9 @@ struct MiopenBatchNormBatchRuleHelper { static std::tuple,Tensor,optional,Tensor,optional> apply( const Tensor& input, optional input_bdim, const Tensor& weight_opt, optional weight_bdim, - const c10::optional& bias_opt, optional bias_bdim, - const c10::optional& running_mean_opt, optional running_mean_bdim, - const c10::optional& running_var_opt, optional running_var_bdim, + const std::optional& bias_opt, optional bias_bdim, + const std::optional& running_mean_opt, optional running_mean_bdim, + const std::optional& running_var_opt, optional running_var_bdim, bool training, double momentum, double eps) { return batch_norm_batch_rule( input, input_bdim, weight_opt, weight_bdim, bias_opt, bias_bdim, @@ -716,11 +716,11 @@ struct NativeBatchNormBackwardBatchRuleHelper { static std::tuple apply( const at::Tensor & grad_out, const at::Tensor & input, - const c10::optional & weight_opt, - const c10::optional & running_mean_opt, - const c10::optional & running_var_opt, - const c10::optional & save_mean_opt, - const c10::optional & save_rstd_opt, + const std::optional & weight_opt, + const std::optional & running_mean_opt, + const std::optional & running_var_opt, + const std::optional & save_mean_opt, + const std::optional & save_rstd_opt, bool training, double eps, std::array output_mask) { @@ -748,10 +748,10 @@ struct CudnnBatchNormBackwardBatchRuleHelper { const 
at::Tensor & input, const at::Tensor & grad_out, const at::Tensor & weight, - const c10::optional & running_mean_opt, - const c10::optional & running_var_opt, - const c10::optional & save_mean_opt, - const c10::optional & save_rstd_opt, + const std::optional & running_mean_opt, + const std::optional & running_var_opt, + const std::optional & save_mean_opt, + const std::optional & save_rstd_opt, double eps, const at::Tensor & reserve) { @@ -777,10 +777,10 @@ struct MiopenBatchNormBackwardBatchRuleHelper { const at::Tensor & input, const at::Tensor & grad_out, const at::Tensor & weight, - const c10::optional & running_mean_opt, - const c10::optional & running_var_opt, - const c10::optional & save_mean_opt, - const c10::optional & save_rstd_opt, + const std::optional & running_mean_opt, + const std::optional & running_var_opt, + const std::optional & save_mean_opt, + const std::optional & save_rstd_opt, double eps) { auto maybe_layer = maybeCurrentDynamicLayer(); @@ -818,10 +818,10 @@ static std::tuple cudnn_batch_norm_backward_wr const at::Tensor & grad_out, const at::Tensor & input, const at::Tensor& weight_opt, - const c10::optional & running_mean_opt, - const c10::optional & running_var_opt, - const c10::optional & save_mean_opt, - const c10::optional & save_rstd_opt, + const std::optional & running_mean_opt, + const std::optional & running_var_opt, + const std::optional & save_mean_opt, + const std::optional & save_rstd_opt, bool training, double eps, std::array output_mask) { @@ -834,10 +834,10 @@ static std::tuple miopen_batch_norm_backward_w const at::Tensor & grad_out, const at::Tensor & input, const at::Tensor& weight_opt, - const c10::optional & running_mean_opt, - const c10::optional & running_var_opt, - const c10::optional & save_mean_opt, - const c10::optional & save_rstd_opt, + const std::optional & running_mean_opt, + const std::optional & running_var_opt, + const std::optional & save_mean_opt, + const std::optional & save_rstd_opt, bool training, double eps, std::array output_mask) { @@ -850,13 +850,13 @@ static std::tuple miopen_batch_norm_backward_w // work with dynamo anyway so we gain some buffer room to do wrong things here. The (reasonable) hope is that we will // make native_batch_norm composite implicit within a few weeks and we can fix this before vmap works with dynamo. static std::tuple _native_batch_norm_legit_batch( - const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, + const Tensor& self, const std::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, bool train, double momentum, double eps) { return at::native_batch_norm(self, weight_opt, bias_opt, running_mean, running_var, train, momentum, eps); } static std::tuple _native_batch_norm_legit_no_stats_batch( - const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, + const Tensor& self, const std::optional& weight_opt, const c10::optional& bias_opt, bool train, double momentum, double eps) { return at::native_batch_norm(self, weight_opt, bias_opt, Tensor(), Tensor(), train, momentum, eps); } diff --git a/aten/src/ATen/functorch/BatchRulesRandomness.cpp b/aten/src/ATen/functorch/BatchRulesRandomness.cpp index 79572f22ea3f6..fe2e790331fa0 100644 --- a/aten/src/ATen/functorch/BatchRulesRandomness.cpp +++ b/aten/src/ATen/functorch/BatchRulesRandomness.cpp @@ -58,7 +58,7 @@ Tensor& random_inplace_batching_rule(Tensor& self, ExtraArgs... 
extra_args) { } } -static Tensor& bernoulli_inplace_Tensor_batching_rule(Tensor& self, const Tensor& p_, c10::optional gen) { +static Tensor& bernoulli_inplace_Tensor_batching_rule(Tensor& self, const Tensor& p_, std::optional gen) { c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchVmapMode); auto maybe_layer = maybeCurrentDynamicLayer(); auto cur_level = maybe_layer->layerId(); @@ -173,7 +173,7 @@ Tensor tensor_like_random_batch_rule(const Tensor& self, ExtraArgs... extra_args return (randomness == RandomnessType::Same) ? res : makeBatched(res, 0, cur_level); } -static std::tuple native_dropout_batching_rule(const Tensor& tensor, double p, c10::optional train) { +static std::tuple native_dropout_batching_rule(const Tensor& tensor, double p, std::optional train) { c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchVmapMode); auto maybe_layer = maybeCurrentDynamicLayer(); const auto cur_level = maybe_layer->layerId(); @@ -213,7 +213,7 @@ static std::tuple native_dropout_batching_rule(const Tensor& tens return std::make_tuple(output, mask); } -static Tensor multinomial_batching_rule(const Tensor& self, const int64_t num_samples, const bool replacement, const c10::optional generator) { +static Tensor multinomial_batching_rule(const Tensor& self, const int64_t num_samples, const bool replacement, const std::optional generator) { c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchVmapMode); auto maybe_layer = maybeCurrentDynamicLayer(); const auto cur_level = maybe_layer->layerId(); diff --git a/aten/src/ATen/functorch/BatchRulesReduceOps.cpp b/aten/src/ATen/functorch/BatchRulesReduceOps.cpp index cb6d6ac519dd8..90371c0eb9ce8 100644 --- a/aten/src/ATen/functorch/BatchRulesReduceOps.cpp +++ b/aten/src/ATen/functorch/BatchRulesReduceOps.cpp @@ -169,7 +169,7 @@ void boxed_reduction_batch_rule(const c10::OperatorHandle& op, torch::jit::Stack new_dims.push_back(getPhysicalDim(self, self_bdim.has_value(), dim)); } bool is_scalar_case = logical_dim == 0 && dims.size() == 1 && is_allowed_dim_on_scalar_tensor(dims[0]); - c10::optional maybe_keepdim; + std::optional maybe_keepdim; if (is_scalar_case) { // NOTE: [boxed_reduction_batch_rule scalar tensor handling] // Reduction operations in PyTorch have an edge case where they allow @@ -321,9 +321,9 @@ static std::tuple> searchsorted_batch_rule( optional self_bdim, bool out_int32, bool right, - c10::optional side, - const c10::optional& sorter, - c10::optional sorter_bdim) { + std::optional side, + const std::optional& sorter, + std::optional sorter_bdim) { auto buckets_logical_rank = rankWithoutBatchDim(sorted_sequence, sorted_sequence_bdim); auto self_logical_rank = rankWithoutBatchDim(self, self_bdim); diff --git a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp index 0a1475497b03d..839e0ee405abb 100644 --- a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp +++ b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp @@ -375,7 +375,7 @@ namespace { // Code is mostly duplicated from // https://github.com/pytorch/pytorch/blob/fb0e27d38a8fdab4e1c14d6378c9e41cb30fd6a3 // /aten/src/ATen/native/TensorAdvancedIndexing.cpp#L379-L405 - VmapDimVector get_indexed_shape(Tensor self, const torch::List> &orig) + VmapDimVector get_indexed_shape(Tensor self, const torch::List> &orig) { at::native::checkIndexTensorTypes(orig); // first expand BoolTensor (masks) or ByteTensor (masks) into 1 or more LongTensors @@ -869,8 +869,8 @@ Tensor index_copy_decomp( // through a decomposition: 
slice_scatter's output needs to have the same // size, size, strides and storage_offset as the input. Tensor slice_scatter_decomp(const Tensor &self, const Tensor &src, - int64_t dim, c10::optional start, - c10::optional end, int64_t step) + int64_t dim, std::optional start, + std::optional end, int64_t step) { auto idx = at::arange(start.value_or(0), end.value_or(self.size(dim)), step, self.options().dtype(kLong)); idx = get_expanded_index(idx, self.sizes(), dim); @@ -889,8 +889,8 @@ Tensor select_scatter_decomp( } std::tuple> diagonal_scatter_batch_rule( - const Tensor &self, c10::optional self_bdim, - const Tensor &src, c10::optional src_bdim, + const Tensor &self, std::optional self_bdim, + const Tensor &src, std::optional src_bdim, int64_t offset, int64_t dim1, int64_t dim2) { auto self_ = moveBatchDimToFront(self, self_bdim); diff --git a/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp b/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp index f44000674db8a..d8213a1b9e0dd 100644 --- a/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp +++ b/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp @@ -63,7 +63,7 @@ std::tuple> to_other_batch_rule(const Tensor& self, optional self_bdim, const Tensor& other, optional other_bdim, bool non_blocking, - bool copy, c10::optional memory_format) { + bool copy, std::optional memory_format) { return std::make_tuple(self.to(other, non_blocking, copy, memory_format), self_bdim); } } diff --git a/aten/src/ATen/functorch/BatchRulesViews.cpp b/aten/src/ATen/functorch/BatchRulesViews.cpp index 81e9d5b9aa21c..18f5d4f38f3cc 100644 --- a/aten/src/ATen/functorch/BatchRulesViews.cpp +++ b/aten/src/ATen/functorch/BatchRulesViews.cpp @@ -149,7 +149,7 @@ std::tuple> flip_batch_rule(const Tensor& self, optiona const Tensor& resize__plumbing( const Tensor& self, IntArrayRef size, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { TORCH_CHECK( !optional_memory_format.has_value() || optional_memory_format == c10::MemoryFormat::Contiguous, @@ -217,7 +217,7 @@ std::tuple> squeeze_batch_rule(const Tensor& self, opt } auto result = self.view_symint(squeezed_sizes); - return std::make_tuple(std::move(result), c10::optional(new_batch_idx)); + return std::make_tuple(std::move(result), std::optional(new_batch_idx)); } std::tuple> squeeze_dims_batch_rule( @@ -335,8 +335,8 @@ std::tuple> slice_batch_rule( const Tensor& self, optional self_bdim, int64_t dim, - c10::optional start, - c10::optional end, + std::optional start, + std::optional end, c10::SymInt step) { auto self_ = moveBatchDimToFront(self, self_bdim); dim = getPhysicalDim(self, self_bdim.has_value(), dim); diff --git a/aten/src/ATen/functorch/DynamicLayer.cpp b/aten/src/ATen/functorch/DynamicLayer.cpp index 45976fa855f32..35f2439c982db 100644 --- a/aten/src/ATen/functorch/DynamicLayer.cpp +++ b/aten/src/ATen/functorch/DynamicLayer.cpp @@ -387,7 +387,7 @@ bool isInplaceOp(const FunctionSchema& schema) { return return_alias_info && return_alias_info->isWrite(); } -c10::optional findAliasedOutput(const FunctionSchema& schema, const int64_t immutable_input_idx) { +std::optional findAliasedOutput(const FunctionSchema& schema, const int64_t immutable_input_idx) { for (size_t res_idx = 0; res_idx != schema.returns().size(); ++res_idx) { if (schema.may_contain_alias(SchemaArgument(SchemaArgType::input, immutable_input_idx), SchemaArgument(SchemaArgType::output, res_idx))) { return res_idx; // for everything currently in native_functions, each input aliases at most one output (tensor list counts as one 
output) diff --git a/aten/src/ATen/functorch/DynamicLayer.h b/aten/src/ATen/functorch/DynamicLayer.h index 9311503f3538d..554e6678d09a1 100644 --- a/aten/src/ATen/functorch/DynamicLayer.h +++ b/aten/src/ATen/functorch/DynamicLayer.h @@ -71,7 +71,7 @@ TORCH_API int64_t initAndPushDynamicLayer( optional prev_fwd_grad_mode = nullopt, optional functionalize_add_back_views = nullopt); TORCH_API DynamicLayer popDynamicLayerAndDeleteMetadata(); -TORCH_API c10::optional maybeCurrentDynamicLayer(); +TORCH_API std::optional maybeCurrentDynamicLayer(); TORCH_API const std::vector& getDynamicLayerStack(); TORCH_API void setDynamicLayerStack(const std::vector& stack); TORCH_API void setDynamicLayerFrontBackKeysIncluded(bool included); @@ -95,7 +95,7 @@ TORCH_API const std::shared_ptr& getLifeHandleForLevel(int64_t level); TORCH_API bool isInplaceOp(const c10::FunctionSchema& schema); // Given the indices of unwrapped inputs and the schema, this returns the indices of any outputs that should remain unwrapped -TORCH_API c10::optional findAliasedOutput(const FunctionSchema& schema, const int64_t immutable_input); +TORCH_API std::optional findAliasedOutput(const FunctionSchema& schema, const int64_t immutable_input); TORCH_API Tensor unwrapIfDead(const Tensor& tensor); TORCH_API bool isDeadTensorWrapper(const Tensor& tensor); diff --git a/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp b/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp index b7a131766ec86..760035d8e46ec 100644 --- a/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp +++ b/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp @@ -536,7 +536,7 @@ Tensor cat_batching_rule(const ITensorListRef& tensors, int64_t dim) { // we'll just slice the tensor to get a Tensor of shape [0] to pass to at::cat. std::vector tensors_to_cat; tensors_to_cat.reserve(tensors.size()); - c10::optional bdim_size = c10::nullopt; + std::optional bdim_size = c10::nullopt; // find the bdim size. Might not exist if all BatchedTensors should be skipped // by cat's special case. @@ -573,7 +573,7 @@ Tensor cat_batching_rule(const ITensorListRef& tensors, int64_t dim) { } auto new_dim = bdim_size.has_value() ? dim + 1 : dim; - c10::optional new_bdim = bdim_size.has_value() ? c10::make_optional((int64_t)0) : nullopt; + std::optional new_bdim = bdim_size.has_value() ? 
c10::make_optional((int64_t)0) : nullopt; auto result = at::cat(tensors_to_cat, new_dim); return makeBatched(result, new_bdim, get_current_level()); } diff --git a/aten/src/ATen/functorch/PlumbingHelper.cpp b/aten/src/ATen/functorch/PlumbingHelper.cpp index 76982fd1b6480..e2a3a9582cf49 100644 --- a/aten/src/ATen/functorch/PlumbingHelper.cpp +++ b/aten/src/ATen/functorch/PlumbingHelper.cpp @@ -40,7 +40,7 @@ std::vector makeBatchedVector(const std::vector& tensors, option return res; } -std::tuple> unwrapTensorAtLevel(const Tensor& tensor, int64_t level) { +std::tuple> unwrapTensorAtLevel(const Tensor& tensor, int64_t level) { auto* batched = maybeGetBatchedImpl(tensor); if (!batched) { return std::make_tuple(tensor, nullopt); @@ -56,7 +56,7 @@ bool isBatchedAtLevel(const Tensor& tensor, int64_t level) { return std::get<1>(result).has_value(); } -bool isBatchedAtLevel(const c10::optional& maybe_tensor, int64_t level) { +bool isBatchedAtLevel(const std::optional& maybe_tensor, int64_t level) { if (!maybe_tensor.has_value()) { return false; } @@ -72,7 +72,7 @@ bool isBatchedAtLevel(ITensorListRef tensors, int64_t level) { return false; } -bool isBatchedAtLevel(const c10::List>& maybe_tensors, int64_t level) { +bool isBatchedAtLevel(const c10::List>& maybe_tensors, int64_t level) { for (const auto idx : c10::irange(0, maybe_tensors.size())) { const auto& maybe_tensor = maybe_tensors.get(idx); if (isBatchedAtLevel(maybe_tensor, level)) { diff --git a/aten/src/ATen/functorch/PlumbingHelper.h b/aten/src/ATen/functorch/PlumbingHelper.h index 552a618b144c8..c2c16c67bcd91 100644 --- a/aten/src/ATen/functorch/PlumbingHelper.h +++ b/aten/src/ATen/functorch/PlumbingHelper.h @@ -35,16 +35,16 @@ TORCH_API Tensor makeBatched(const Tensor& tensor, optional bdim, int64 // If `tensor` is not a BatchedTensor, or is a BatchedTensor but the level // doesn't match, then this returns (tensor, nullopt). // Otherwise, it returns (unwrap(tensor), bdim). -TORCH_API std::tuple> unwrapTensorAtLevel(const Tensor& tensor, int64_t level); +TORCH_API std::tuple> unwrapTensorAtLevel(const Tensor& tensor, int64_t level); // Creates a vector of BatchedTensor TORCH_API std::vector makeBatchedVector(const std::vector& tensors, optional bdim, int64_t level); // Returns True if ANY tensor in tensors is batched at level TORCH_API bool isBatchedAtLevel(ITensorListRef tensors, int64_t level); -TORCH_API bool isBatchedAtLevel(const c10::List>& maybe_tensors, int64_t level); +TORCH_API bool isBatchedAtLevel(const c10::List>& maybe_tensors, int64_t level); TORCH_API bool isBatchedAtLevel(const Tensor& tensor, int64_t level); -TORCH_API bool isBatchedAtLevel(const c10::optional& maybe_tensor, int64_t level); +TORCH_API bool isBatchedAtLevel(const std::optional& maybe_tensor, int64_t level); // Convenience helper. Returns true if any tensor is batched at level TORCH_API bool areAnyBatchedAtLevel(ArrayRef> maybe_tensors, int64_t level); diff --git a/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp b/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp index 355ac5965da51..ce3f20ef97efc 100644 --- a/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp +++ b/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp @@ -73,7 +73,7 @@ static bool can_perform_inplace(const Tensor& a, const Tensor& b) { // TODO: linear is pretty important for performance, but I'm not sure how to work // around the in-place. 
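The plumbing helpers being migrated here answer "is this batched at a given vmap level?" for a plain tensor, an optional tensor, or a list of optional tensors, returning false as soon as an optional is empty. A rough standalone sketch of that shape, with a hypothetical FakeBatched struct standing in for a batched tensor wrapper and std::vector replacing c10::List, could be:

#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

// Stand-in for a batched tensor wrapper; only the vmap level is modeled.
struct FakeBatched {
  int64_t level;
};

static bool is_batched_at_level(const std::optional<FakeBatched>& maybe,
                                int64_t level) {
  // Mirrors the early return on an empty optional before touching the value.
  if (!maybe.has_value()) {
    return false;
  }
  return maybe->level == level;
}

static bool any_batched_at_level(const std::vector<std::optional<FakeBatched>>& xs,
                                 int64_t level) {
  for (const auto& maybe : xs) {
    if (is_batched_at_level(maybe, level)) {
      return true;
    }
  }
  return false;
}

int main() {
  std::vector<std::optional<FakeBatched>> args{std::nullopt, FakeBatched{2}};
  std::cout << std::boolalpha << any_batched_at_level(args, 2) << '\n';  // true
  std::cout << std::boolalpha << any_batched_at_level(args, 3) << '\n';  // false
}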
-Tensor linear_hack(const Tensor& input, const Tensor& weight, const c10::optional& bias_opt) { +Tensor linear_hack(const Tensor& input, const Tensor& weight, const std::optional& bias_opt) { // See [Note: hacky wrapper removal for optional tensor] auto bias = bias_opt.has_value() ? c10::MaybeOwned::borrowed(*bias_opt) @@ -123,8 +123,8 @@ static inline at::Tensor apply_loss_reduction(const at::Tensor& unreduced, int64 Tensor binary_cross_entropy_with_logits_hack( const Tensor& input, const Tensor& target, - const c10::optional& weight_opt, - const c10::optional& pos_weight_opt, + const std::optional& weight_opt, + const std::optional& pos_weight_opt, int64_t reduction) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); diff --git a/aten/src/ATen/miopen/AutocastRNN.cpp b/aten/src/ATen/miopen/AutocastRNN.cpp index 271d80ea03cd4..a23eb4a1a19b8 100644 --- a/aten/src/ATen/miopen/AutocastRNN.cpp +++ b/aten/src/ATen/miopen/AutocastRNN.cpp @@ -14,7 +14,7 @@ miopen_rnn(const Tensor & input_r, TensorList weight, int64_t weight_stride0, const Tensor & hx, - const c10::optional& cx_opt, + const std::optional& cx_opt, int64_t fn_mode, int64_t fn_hidden_size, int64_t fn_num_layers, @@ -23,7 +23,7 @@ miopen_rnn(const Tensor & input_r, bool fn_train, bool fn_bidirectional, IntArrayRef fn_batch_sizes, - const c10::optional& fn_dropout_state_opt) { + const std::optional& fn_dropout_state_opt) { #if AT_ROCM_ENABLED() diff --git a/aten/src/ATen/mps/EmptyTensor.cpp b/aten/src/ATen/mps/EmptyTensor.cpp index f7918ac18993c..baa91eabb3898 100644 --- a/aten/src/ATen/mps/EmptyTensor.cpp +++ b/aten/src/ATen/mps/EmptyTensor.cpp @@ -20,11 +20,11 @@ namespace at::detail { TensorBase empty_mps( IntArrayRef size, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt, - c10::optional memory_format_opt) { + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt, + std::optional memory_format_opt) { #if defined(__APPLE__) #if __is_target_os(macOS) if (at::hasMPS()) { @@ -95,7 +95,7 @@ TensorBase empty_strided_mps( IntArrayRef size, IntArrayRef stride, ScalarType dtype, - c10::optional device_opt) { + std::optional device_opt) { #if defined(__APPLE__) #if __is_target_os(macOS) if (at::hasMPS()) { diff --git a/aten/src/ATen/mps/EmptyTensor.h b/aten/src/ATen/mps/EmptyTensor.h index 88a29547406cd..39b206cb3031d 100644 --- a/aten/src/ATen/mps/EmptyTensor.h +++ b/aten/src/ATen/mps/EmptyTensor.h @@ -7,11 +7,11 @@ namespace at::detail { C10_EXPORT TensorBase empty_mps( IntArrayRef size, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt, - c10::optional memory_format_opt); + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt, + std::optional memory_format_opt); C10_EXPORT TensorBase empty_mps( IntArrayRef size, const TensorOptions &options); @@ -19,7 +19,7 @@ C10_EXPORT TensorBase empty_strided_mps( IntArrayRef size, IntArrayRef stride, ScalarType dtype, - c10::optional device_opt); + std::optional device_opt); C10_EXPORT TensorBase empty_strided_mps( IntArrayRef size, diff --git a/aten/src/ATen/mps/MPSGuardImpl.h b/aten/src/ATen/mps/MPSGuardImpl.h index fe43fcf40fd34..1b57d2966767a 100644 --- a/aten/src/ATen/mps/MPSGuardImpl.h +++ b/aten/src/ATen/mps/MPSGuardImpl.h @@ -52,7 +52,7 @@ struct TORCH_API 
MPSGuardImpl final : public c10::impl::DeviceGuardImplInterface return Device(c10::DeviceType::MPS, 0); } - c10::optional uncheckedGetDevice() const noexcept { + std::optional uncheckedGetDevice() const noexcept { return Device(c10::DeviceType::MPS, 0); } @@ -112,12 +112,12 @@ struct TORCH_API MPSGuardImpl final : public c10::impl::DeviceGuardImplInterface struct OptionalMPSGuard { explicit OptionalMPSGuard() : guard_() {} - explicit OptionalMPSGuard(c10::optional device_opt) + explicit OptionalMPSGuard(std::optional device_opt) : guard_(device_opt) {} /// Set the current MPS device to the passed device index, if it is not /// nullopt - explicit OptionalMPSGuard(c10::optional device_index_opt) + explicit OptionalMPSGuard(std::optional device_index_opt) : guard_(device_index_opt) {} // Copy is not allowed @@ -147,14 +147,14 @@ struct OptionalMPSGuard { /// Returns the device that was set immediately prior to initialization of the /// guard, or nullopt if the guard is uninitialized. - c10::optional original_device() const { + std::optional original_device() const { return guard_.original_device(); } /// Returns the most recent device that was set using this device guard, /// either from construction, or via set_device, if the guard is initialized, /// or nullopt if the guard is uninitialized. - c10::optional current_device() const { + std::optional current_device() const { return guard_.current_device(); } diff --git a/aten/src/ATen/native/Activation.cpp b/aten/src/ATen/native/Activation.cpp index 533bc32216365..a0141f974923e 100644 --- a/aten/src/ATen/native/Activation.cpp +++ b/aten/src/ATen/native/Activation.cpp @@ -572,7 +572,7 @@ inline void _rrelu_with_noise_train( const Tensor& noise, const Scalar& lower_, const Scalar& upper_, - c10::optional generator) { + std::optional generator) { using opmath_t = at::opmath_type; opmath_t lower = lower_.to(); opmath_t upper = upper_.to(); @@ -603,8 +603,9 @@ Tensor& rrelu_with_noise_out_cpu(const Tensor& self, const Scalar& lower, const Scalar& upper, bool training, - c10::optional generator, + std::optional generator, Tensor& output) { + TORCH_CHECK(self.sym_sizes() == noise.sym_sizes(), "noise tensor shape must match self tensor shape. 
Got self.shape = ", self.sym_sizes(), " noise.shape = ", noise.sym_sizes()); if (training) { AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, self.scalar_type(), "rrelu_with_noise_out_cpu", [&] { _rrelu_with_noise_train(output, self.contiguous(), noise, lower, upper, generator); @@ -625,7 +626,7 @@ Tensor rrelu_with_noise_cpu( const Scalar& lower, const Scalar& upper, bool training, - c10::optional generator) { + std::optional generator) { auto output = at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT); return at::native::rrelu_with_noise_out_cpu( self, noise, lower, upper, training, std::move(generator), output); @@ -637,7 +638,7 @@ Tensor& rrelu_with_noise_cpu_( const Scalar& lower, const Scalar& upper, bool training, - c10::optional generator) { + std::optional generator) { return at::native::rrelu_with_noise_out_cpu( self, noise, lower, upper, training, std::move(generator), self); } @@ -660,12 +661,12 @@ Tensor rrelu_with_noise_backward( } } -Tensor rrelu(const Tensor & self, const Scalar& lower, const Scalar& upper, bool training, c10::optional generator) { +Tensor rrelu(const Tensor & self, const Scalar& lower, const Scalar& upper, bool training, std::optional generator) { TORCH_CHECK(lower.to() <= upper.to(), "Lower bound should be less than or equal to the upper bound") return at::rrelu_with_noise(self, at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT), lower, upper, training, std::move(generator)); } -Tensor & rrelu_(Tensor & self, const Scalar& lower, const Scalar& upper, bool training, c10::optional generator) { +Tensor & rrelu_(Tensor & self, const Scalar& lower, const Scalar& upper, bool training, std::optional generator) { TORCH_CHECK(lower.to() <= upper.to(), "Lower bound should be less than or equal to the upper bound") return at::rrelu_with_noise_(self, at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT), lower, upper, training, std::move(generator)); } diff --git a/aten/src/ATen/native/AveragePool2d.cpp b/aten/src/ATen/native/AveragePool2d.cpp index 854b4585db10a..368dc02c2832f 100644 --- a/aten/src/ATen/native/AveragePool2d.cpp +++ b/aten/src/ATen/native/AveragePool2d.cpp @@ -21,7 +21,7 @@ TORCH_PRECOMPUTE_META_FUNC(avg_pool2d) IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { // #20866, #22032: Guarantee this for the official C++ API? TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 2, "avg_pool2d: kernel_size must either be a single int, or a tuple of two ints"); @@ -101,7 +101,7 @@ TORCH_META_FUNC(avg_pool2d_backward) ( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override + std::optional divisor_override ) { // #20866, #22032: Guarantee this for the official C++ API? 
TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 2, @@ -159,7 +159,7 @@ TORCH_IMPL_FUNC(avg_pool2d_out_cpu) int64_t padW, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override, + std::optional divisor_override, const Tensor& output) { avg_pool2d_kernel( kCPU, @@ -183,7 +183,7 @@ TORCH_IMPL_FUNC(avg_pool2d_backward_out_cpu) ( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override, + std::optional divisor_override, const Tensor& gradInput ) { const int kH = safe_downcast(kernel_size[0]); diff --git a/aten/src/ATen/native/AveragePool3d.cpp b/aten/src/ATen/native/AveragePool3d.cpp index c2d7b44a5076c..701ad09bfd512 100644 --- a/aten/src/ATen/native/AveragePool3d.cpp +++ b/aten/src/ATen/native/AveragePool3d.cpp @@ -25,7 +25,7 @@ TORCH_META_FUNC(avg_pool3d) ( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override + std::optional divisor_override ) { // #20866, #22032: Guarantee this for the official C++ API? TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 3, @@ -94,7 +94,7 @@ TORCH_META_FUNC(avg_pool3d_backward) ( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override + std::optional divisor_override ) { // #20866, #22032: Guarantee this for the official C++ API? TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 3, @@ -174,7 +174,7 @@ static void avg_pool3d_out_frame( int padW, int padH, bool count_include_pad, - c10::optional divisor_override) + std::optional divisor_override) { at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) { for (const auto k : c10::irange(start, end)) { @@ -261,7 +261,7 @@ TORCH_IMPL_FUNC(avg_pool3d_out_cpu) ( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override, + std::optional divisor_override, const Tensor& output ) { const int kT = safe_downcast(kernel_size[0]); @@ -362,7 +362,7 @@ static void avg_pool3d_backward_out_frame( int padW, int padH, bool count_include_pad, - c10::optional divisor_override) + std::optional divisor_override) { at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) { for (const auto k : c10::irange(start, end)) { @@ -441,7 +441,7 @@ TORCH_IMPL_FUNC(avg_pool3d_backward_out_cpu) ( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override, + std::optional divisor_override, const Tensor& gradInput ) { const int kT = safe_downcast(kernel_size[0]); diff --git a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp index 40e6b34dc9725..ce4b4d15b7968 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp @@ -656,7 +656,7 @@ TORCH_META_FUNC(linalg_qr)(const Tensor& A, TORCH_META_FUNC(_linalg_svd)(const Tensor& A, bool full_matrices, bool compute_uv, - c10::optional driver) { + std::optional driver) { at::native::checkIsMatrix(A, "linalg.svd"); at::native::checkFloatingOrComplex(A, "linalg.svd"); @@ -3128,7 +3128,7 @@ DEFINE_DISPATCH(svd_stub); TORCH_IMPL_FUNC(_linalg_svd_out)(const Tensor& A, const bool full_matrices, const bool compute_uv, - c10::optional driver, + std::optional driver, const Tensor & U, const Tensor & S, const Tensor & Vh) { @@ -3177,7 +3177,7 @@ TORCH_IMPL_FUNC(_linalg_svd_out)(const Tensor& A, std::tuple linalg_svd_out(const Tensor& A, bool full_matrices, - c10::optional driver, + std::optional driver, Tensor & U, Tensor & S, Tensor & Vh) { @@ -3196,12 +3196,12 @@ 
linalg_svd_out(const Tensor& A, } std::tuple linalg_svd(const Tensor& A, bool full_matrices, - c10::optional driver) { + std::optional driver) { return at::_linalg_svd(A, full_matrices, /*compute_uv=*/true, driver); } // See note in linalg_svd for why this function does not have an _ex variant -Tensor& linalg_svdvals_out(const Tensor& A, c10::optional driver, Tensor & S) { +Tensor& linalg_svdvals_out(const Tensor& A, std::optional driver, Tensor & S) { // Dummies auto U = at::empty({0}, A.options()); auto Vh = at::empty({0}, A.options()); @@ -3209,7 +3209,7 @@ Tensor& linalg_svdvals_out(const Tensor& A, c10::optional driv return S; } -Tensor linalg_svdvals(const Tensor& A, c10::optional driver) { +Tensor linalg_svdvals(const Tensor& A, std::optional driver) { return std::get<1>(at::_linalg_svd(A, /*full_matrices=*/false, /*compute_uv=*/_may_require_fw_or_bw_grad(A), /*driver=*/driver)); @@ -3469,7 +3469,7 @@ static void linalg_lstsq_out_info( } } -static std::string get_default_lstsq_driver(c10::optional driver, const Tensor& input) { +static std::string get_default_lstsq_driver(std::optional driver, const Tensor& input) { // if `driver` is empty, we set driver_str to "gels" if working with CUDA tensors, // otherwise to "gelsy" driver. std::string driver_str; @@ -3505,8 +3505,8 @@ static std::string get_default_lstsq_driver(c10::optional driv std::tuple linalg_lstsq_out( const Tensor& input, const Tensor& other, - c10::optional rcond, - c10::optional driver, + std::optional rcond, + std::optional driver, Tensor& solution, Tensor& residuals, Tensor& rank, @@ -3668,8 +3668,8 @@ std::tuple linalg_lstsq_out( std::tuple linalg_lstsq( const Tensor& input, const Tensor& other, - c10::optional rcond, - c10::optional driver) { + std::optional rcond, + std::optional driver) { Tensor solution = at::empty({0}, input.options()); Tensor residuals = at::empty({0}, input.options().dtype(toRealValueType(input.scalar_type()))); Tensor rank = at::empty({0}, input.options().dtype(at::kLong)); @@ -4003,7 +4003,7 @@ Tensor linalg_solve_triangular( Tensor linalg_vander_symint( const Tensor& x, - c10::optional N) { + std::optional N) { auto t = x.scalar_type(); TORCH_CHECK(t == ScalarType::Float || t == ScalarType::Double || diff --git a/aten/src/ATen/native/BatchLinearAlgebra.h b/aten/src/ATen/native/BatchLinearAlgebra.h index efbe7ce1b9d1c..c8402640aa08a 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.h +++ b/aten/src/ATen/native/BatchLinearAlgebra.h @@ -304,7 +304,7 @@ using svd_fn = void (*)( const Tensor& /*A*/, const bool /*full_matrices*/, const bool /*compute_uv*/, - const c10::optional& /*driver*/, + const std::optional& /*driver*/, const Tensor& /*U*/, const Tensor& /*S*/, const Tensor& /*Vh*/, diff --git a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp index f29970afe2b44..79e7b8b049381 100644 --- a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp @@ -1087,7 +1087,7 @@ static void apply_svd(const Tensor& A, void svd_kernel(const Tensor& A, const bool full_matrices, const bool compute_uv, - const c10::optional& driver, + const std::optional& driver, const Tensor& U, const Tensor& S, const Tensor& Vh, diff --git a/aten/src/ATen/native/BinaryOps.cpp b/aten/src/ATen/native/BinaryOps.cpp index 78f57470a922d..19c70672fb93c 100644 --- a/aten/src/ATen/native/BinaryOps.cpp +++ b/aten/src/ATen/native/BinaryOps.cpp @@ -173,7 +173,7 @@ TORCH_META_FUNC2(div, Tensor) (const Tensor& self, const Tensor& 
other) { build_borrowing_binary_float_op(maybe_get_output(), self, other); } -TORCH_META_FUNC2(div, Tensor_mode) (const Tensor& self, const Tensor& other, c10::optional rounding_mode) { +TORCH_META_FUNC2(div, Tensor_mode) (const Tensor& self, const Tensor& other, std::optional rounding_mode) { if (!rounding_mode.has_value()) { build_borrowing_binary_float_op(maybe_get_output(), self, other); // NOLINTNEXTLINE(bugprone-branch-clone) @@ -303,7 +303,7 @@ TORCH_META_FUNC2(xlogy, Tensor) (const Tensor& self, const Tensor& other) { build_borrowing_binary_float_op(maybe_get_output(), self, other); } -TORCH_META_FUNC(logit_backward) (const Tensor& grad_output, const Tensor& input, c10::optional eps) { +TORCH_META_FUNC(logit_backward) (const Tensor& grad_output, const Tensor& input, std::optional eps) { build_borrowing_binary_op(maybe_get_output(), grad_output, input); } @@ -448,7 +448,7 @@ TORCH_IMPL_FUNC(div_out) (const Tensor& self, const Tensor& other, const Tensor& } TORCH_IMPL_FUNC(div_out_mode) ( - const Tensor& self, const Tensor& other, c10::optional rounding_mode, const Tensor& result + const Tensor& self, const Tensor& other, std::optional rounding_mode, const Tensor& result ) { if (!rounding_mode.has_value()) { div_true_stub(device_type(), *this); @@ -459,7 +459,7 @@ TORCH_IMPL_FUNC(div_out_mode) ( } } -TORCH_IMPL_FUNC(logit_backward_out) (const Tensor& grad_output, const Tensor& input, c10::optional eps, const Tensor& result) { +TORCH_IMPL_FUNC(logit_backward_out) (const Tensor& grad_output, const Tensor& input, std::optional eps, const Tensor& result) { logit_backward_stub(device_type(), *this, Scalar(eps ? eps.value() : -1.0)); } @@ -896,11 +896,11 @@ Tensor& div_(Tensor& self, const Scalar& other) { return self.div_(wrapped_scalar_tensor(other)); // redispatch! } -Tensor div(const Tensor& self, const Scalar& other, c10::optional rounding_mode) { +Tensor div(const Tensor& self, const Scalar& other, std::optional rounding_mode) { return self.div(wrapped_scalar_tensor(other), std::move(rounding_mode)); // redispatch! } -Tensor& div_(Tensor& self, const Scalar& other, c10::optional rounding_mode) { +Tensor& div_(Tensor& self, const Scalar& other, std::optional rounding_mode) { return self.div_(wrapped_scalar_tensor(other), std::move(rounding_mode)); // redispatch! 
} @@ -925,23 +925,23 @@ Tensor& divide_(Tensor& self, const Scalar& other) { return self.div_(other); } -Tensor& divide_out(const Tensor& self, const Tensor& other, c10::optional rounding_mode, Tensor& result) { +Tensor& divide_out(const Tensor& self, const Tensor& other, std::optional rounding_mode, Tensor& result) { return at::div_out(result, self, other, std::move(rounding_mode)); } -Tensor divide(const Tensor& self, const Tensor& other, c10::optional rounding_mode) { +Tensor divide(const Tensor& self, const Tensor& other, std::optional rounding_mode) { return self.div(other, std::move(rounding_mode)); } -Tensor& divide_(Tensor& self, const Tensor& other, c10::optional rounding_mode) { +Tensor& divide_(Tensor& self, const Tensor& other, std::optional rounding_mode) { return self.div_(other, std::move(rounding_mode)); } -Tensor divide(const Tensor& self, const Scalar& other, c10::optional rounding_mode) { +Tensor divide(const Tensor& self, const Scalar& other, std::optional rounding_mode) { return self.div(other, std::move(rounding_mode)); } -Tensor& divide_(Tensor& self, const Scalar& other, c10::optional rounding_mode) { +Tensor& divide_(Tensor& self, const Scalar& other, std::optional rounding_mode) { return self.div_(other, std::move(rounding_mode)); } diff --git a/aten/src/ATen/native/Bucketization.cpp b/aten/src/ATen/native/Bucketization.cpp index 736273a40cb09..98e37af91b316 100644 --- a/aten/src/ATen/native/Bucketization.cpp +++ b/aten/src/ATen/native/Bucketization.cpp @@ -146,8 +146,8 @@ Tensor& searchsorted_out_cpu( const Tensor& self, bool out_int32, bool right, - const c10::optional side_opt, - const c10::optional& sorter_opt, + const std::optional side_opt, + const std::optional& sorter_opt, Tensor& result) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned sorter_maybe_owned = at::borrow_from_optional_tensor(sorter_opt); @@ -193,8 +193,8 @@ Tensor& searchsorted_out_cpu( const Scalar& self, bool out_int32, bool right, - const c10::optional side_opt, - const c10::optional& sorter_opt, + const std::optional side_opt, + const std::optional& sorter_opt, Tensor& result) { const Tensor& scalar_tensor = searchsorted_scalar_tensor(self, sorted_sequence.device()); return searchsorted_out_cpu(sorted_sequence, scalar_tensor, out_int32, right, side_opt, sorter_opt, result); @@ -205,8 +205,8 @@ Tensor searchsorted_cpu( const Tensor& self, bool out_int32, bool right, - const c10::optional side_opt, - const c10::optional& sorter_opt) { + const std::optional side_opt, + const std::optional& sorter_opt) { ScalarType scalar_type = out_int32 ? 
ScalarType::Int : ScalarType::Long; c10::TensorOptions options = TensorOptions().device(self.options().device()).dtype(scalar_type); Tensor result = at::empty({0}, options, MemoryFormat::Contiguous); @@ -219,8 +219,8 @@ Tensor searchsorted_cpu( const Scalar& self, bool out_int32, bool right, - const c10::optional side_opt, - const c10::optional& sorter_opt) { + const std::optional side_opt, + const std::optional& sorter_opt) { const Tensor& scalar_tensor = searchsorted_scalar_tensor(self, sorted_sequence.device()); return searchsorted_cpu(sorted_sequence, scalar_tensor, out_int32, right, side_opt, sorter_opt); } diff --git a/aten/src/ATen/native/BucketizationUtils.h b/aten/src/ATen/native/BucketizationUtils.h index 59d459bd9c29e..90747c264b156 100644 --- a/aten/src/ATen/native/BucketizationUtils.h +++ b/aten/src/ATen/native/BucketizationUtils.h @@ -107,7 +107,7 @@ inline void searchsorted_pre_check( const Tensor& output, const bool out_int32, const bool right, - const c10::optional side_opt, + const std::optional side_opt, const Tensor& sorter) { if (side_opt) { const c10::string_view side = *side_opt; diff --git a/aten/src/ATen/native/CPUFallback.cpp b/aten/src/ATen/native/CPUFallback.cpp index 502c61e4d144c..1d0930cf3a5ea 100644 --- a/aten/src/ATen/native/CPUFallback.cpp +++ b/aten/src/ATen/native/CPUFallback.cpp @@ -48,7 +48,7 @@ static std::vector to_cpu(const at::TensorList& tensors) { return cpu_tensors; } -static c10::optional compute_target_device(std::vector& t_args, std::vector> tlist_args) { +static std::optional compute_target_device(std::vector& t_args, std::vector> tlist_args) { // Decide what device to move the output tensor(s) to. // The current convention is that we use the first tensor arg to pick the device // Barring that, we take the first tensor from a TensorList arg. @@ -89,7 +89,7 @@ void cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool std::vector> tensorlist_args; std::vector tensorlist_args_indices; - c10::optional tgt_device = c10::nullopt; + std::optional tgt_device = c10::nullopt; // save converted cpu tensor for TensorList std::vector tensorlist_cpu_args; diff --git a/aten/src/ATen/native/ComparisonUtils.cpp b/aten/src/ATen/native/ComparisonUtils.cpp index 5a1138d041b1c..57f00ec86137f 100644 --- a/aten/src/ATen/native/ComparisonUtils.cpp +++ b/aten/src/ATen/native/ComparisonUtils.cpp @@ -25,7 +25,7 @@ void _assert_match(const O& original, const C& compared, const std::string& name } } -void _assert_tensor_metadata(at::Tensor const& tensor, at::OptionalIntArrayRef sizes, at::OptionalIntArrayRef strides, c10::optional dtype) { +void _assert_tensor_metadata(at::Tensor const& tensor, at::OptionalIntArrayRef sizes, at::OptionalIntArrayRef strides, std::optional dtype) { _assert_match(tensor.sizes(), sizes, "sizes"); _assert_match(tensor.strides(), strides, "strides"); _assert_match(tensor.dtype(), dtype, "dtype"); diff --git a/aten/src/ATen/native/Constraints.cpp b/aten/src/ATen/native/Constraints.cpp index 8f3f8c11e696c..21a64537af283 100644 --- a/aten/src/ATen/native/Constraints.cpp +++ b/aten/src/ATen/native/Constraints.cpp @@ -24,8 +24,8 @@ namespace at::native { void sym_constrain_range( const Scalar& size, - c10::optional min, - c10::optional max) { + std::optional min, + std::optional max) { int64_t min_val = min.has_value() ? min.value() : std::numeric_limits::min(); int64_t max_val = max.has_value() ? 
max.value() : std::numeric_limits::max(); @@ -53,14 +53,14 @@ void sym_constrain_range( Tensor _functional_sym_constrain_range( const Scalar& size, - c10::optional min, - c10::optional max, + std::optional min, + std::optional max, const Tensor& dep_token) { sym_constrain_range(size, min, max); return dep_token.clone(); } -void sym_constrain_range_for_size(const Scalar& size, c10::optional min, c10::optional max) { +void sym_constrain_range_for_size(const Scalar& size, std::optional min, c10::optional max) { int64_t min_val = min.has_value() ? min.value() : 0; if (max.has_value() && max.value() <= 2) { TORCH_CHECK(false, "Max value to constrain_range_for_size must be greater than 2. got: ", max.value()); @@ -70,19 +70,19 @@ void sym_constrain_range_for_size(const Scalar& size, c10::optional min Tensor _functional_sym_constrain_range_for_size( const Scalar& size, - c10::optional min, - c10::optional max, + std::optional min, + std::optional max, const Tensor& dep_token) { sym_constrain_range_for_size(size, min, max); return dep_token.clone(); } Tensor _make_dep_token_cpu( - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt, - c10::optional memory_format_opt) { + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt, + std::optional memory_format_opt) { return at::empty( {}, dtype_opt, layout_opt, device_opt, pin_memory_opt, memory_format_opt); } diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h index 4b814f3e442cb..d504d088a8620 100644 --- a/aten/src/ATen/native/ConvUtils.h +++ b/aten/src/ATen/native/ConvUtils.h @@ -44,7 +44,7 @@ using mkldnn_convolution_backward_fn = std::tuple); DECLARE_DISPATCH(mkldnn_convolution_backward_fn, mkldnn_convolution_backward_stub); -using mkldnn_convolution_transpose_fn = Tensor(*)(const Tensor&, const Tensor&, const c10::optional&, +using mkldnn_convolution_transpose_fn = Tensor(*)(const Tensor&, const Tensor&, const std::optional&, IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef, int64_t); DECLARE_DISPATCH(mkldnn_convolution_transpose_fn, mkldnn_convolution_transpose_stub); using mkldnn_convolution_transpose_backward_fn = std::tuple(*)( @@ -117,7 +117,7 @@ enum class ConvBackend { // Overload for selecting the convolution backend from the full set of convolution inputs. // This overload is exposed to python for testing, etc. 
TORCH_API ConvBackend select_conv_backend( - const Tensor& input, const Tensor& weight, const c10::optional& bias_opt, + const Tensor& input, const Tensor& weight, const std::optional& bias_opt, SymIntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef dilation, bool transposed, SymIntArrayRef output_padding, c10::SymInt groups, const at::OptionalSymIntArrayRef bias_sizes_opt); @@ -360,7 +360,7 @@ static inline bool miopen_conv_use_channels_last(const at::Tensor& input, const bool can_use_miopen_channels_last_2d = false; // TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC once ROCm officially supports NHWC in MIOpen // See #64427 - static c10::optional PYTORCH_MIOPEN_SUGGEST_NHWC = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC"); + static std::optional PYTORCH_MIOPEN_SUGGEST_NHWC = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC"); auto input_memory_format = input.suggest_memory_format(); auto weight_memory_format = weight.suggest_memory_format(); diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index 717280a6cdcab..ecedc73579d66 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -368,7 +368,7 @@ struct ConvParams { } } - bool use_cpu_depthwise3x3_winograd(const at::Tensor& input, const at::Tensor& weight, const c10::optional& bias) const { + bool use_cpu_depthwise3x3_winograd(const at::Tensor& input, const at::Tensor& weight, const std::optional& bias) const { #if defined(__ARM_NEON__) // Currently only 3x3 depthwise convolutions on tensors of float are supported. return (input.ndimension() == 4) && @@ -878,7 +878,7 @@ at::Tensor complex_convolution( at::Tensor complex_convolution_mode( const at::Tensor& input, const at::Tensor& weight, - const c10::optional& bias_opt, + const std::optional& bias_opt, c10::SymIntArrayRef stride, c10::string_view padding, c10::SymIntArrayRef dilation, @@ -908,7 +908,7 @@ at::Tensor complex_convolution_mode( } // namespace at::Tensor conv1d_symint( - const Tensor& input_, const Tensor& weight, const c10::optional& bias_opt, + const Tensor& input_, const Tensor& weight, const std::optional& bias_opt, SymIntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef dilation, c10::SymInt groups) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); @@ -933,7 +933,7 @@ at::Tensor conv1d_symint( } at::Tensor conv2d_symint( - const Tensor& input_, const Tensor& weight, const c10::optional& bias_opt, + const Tensor& input_, const Tensor& weight, const std::optional& bias_opt, SymIntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef dilation, c10::SymInt groups) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); @@ -958,7 +958,7 @@ at::Tensor conv2d_symint( } at::Tensor conv3d_symint( - const Tensor& input_, const Tensor& weight, const c10::optional& bias_opt, + const Tensor& input_, const Tensor& weight, const std::optional& bias_opt, SymIntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef dilation, c10::SymInt groups) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); @@ -1049,7 +1049,7 @@ static Tensor convolution_same( } Tensor _convolution_mode_symint( - const Tensor& input, const Tensor& weight, const c10::optional& bias_opt, + const Tensor& input, const Tensor& weight, const std::optional& bias_opt, 
SymIntArrayRef stride, c10::string_view padding, SymIntArrayRef dilation, c10::SymInt groups) { // See [Note: hacky wrapper removal for optional tensor] @@ -1067,7 +1067,7 @@ Tensor _convolution_mode_symint( } at::Tensor conv1d_padding_symint( - const Tensor& input_, const Tensor& weight, const c10::optional& bias, + const Tensor& input_, const Tensor& weight, const std::optional& bias, c10::SymIntArrayRef stride, c10::string_view padding, c10::SymIntArrayRef dilation, c10::SymInt groups) { auto [input, is_batched] = batchify(input_, /*num_spatial_dims=*/ 1, "conv1d"); @@ -1081,7 +1081,7 @@ at::Tensor conv1d_padding_symint( } at::Tensor conv2d_padding_symint( - const Tensor& input_, const Tensor& weight, const c10::optional& bias, + const Tensor& input_, const Tensor& weight, const std::optional& bias, c10::SymIntArrayRef stride, c10::string_view padding, c10::SymIntArrayRef dilation, c10::SymInt groups) { auto [input, is_batched] = batchify(input_, /*num_spatial_dims=*/ 2, "conv2d"); @@ -1095,7 +1095,7 @@ at::Tensor conv2d_padding_symint( } at::Tensor conv3d_padding_symint( - const Tensor& input_, const Tensor& weight, const c10::optional& bias, + const Tensor& input_, const Tensor& weight, const std::optional& bias, c10::SymIntArrayRef stride, c10::string_view padding, c10::SymIntArrayRef dilation, c10::SymInt groups) { auto [input, is_batched] = batchify(input_, /*num_spatial_dims=*/ 3, "conv3d"); @@ -1109,7 +1109,7 @@ at::Tensor conv3d_padding_symint( } at::Tensor conv_transpose1d_symint( - const Tensor& input_, const Tensor& weight, const c10::optional& bias_opt, + const Tensor& input_, const Tensor& weight, const std::optional& bias_opt, SymIntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef output_padding, c10::SymInt groups, SymIntArrayRef dilation) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); @@ -1128,7 +1128,7 @@ at::Tensor conv_transpose1d_symint( } at::Tensor conv_transpose2d_symint( - const Tensor& input_, const Tensor& weight, const c10::optional& bias_opt, + const Tensor& input_, const Tensor& weight, const std::optional& bias_opt, SymIntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef output_padding, c10::SymInt groups, SymIntArrayRef dilation) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); @@ -1147,7 +1147,7 @@ at::Tensor conv_transpose2d_symint( } at::Tensor conv_transpose3d_symint( - const Tensor& input_, const Tensor& weight, const c10::optional& bias_opt, + const Tensor& input_, const Tensor& weight, const std::optional& bias_opt, SymIntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef output_padding, c10::SymInt groups, SymIntArrayRef dilation) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); @@ -1166,7 +1166,7 @@ at::Tensor conv_transpose3d_symint( } at::Tensor convolution( - const Tensor& input, const Tensor& weight, const c10::optional& bias_opt, + const Tensor& input, const Tensor& weight, const std::optional& bias_opt, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool transposed, IntArrayRef output_padding, int64_t groups) { // See [Note: hacky wrapper removal for optional tensor] @@ -1182,7 +1182,7 @@ at::Tensor convolution( } at::Tensor convolution_overrideable( - const Tensor& input, const Tensor& weight, const c10::optional& bias_opt, + const Tensor& input, 
const Tensor& weight, const std::optional& bias_opt, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool transposed, IntArrayRef output_padding, int64_t groups) { TORCH_CHECK_NOT_IMPLEMENTED(false, "convolution_overrideable not implemented. You are likely triggering this with tensor backend other than CPU/CUDA/MKLDNN, if this is intended, please use TORCH_LIBRARY_IMPL to override this function "); @@ -1197,7 +1197,7 @@ template ConvBackend _select_conv_backend( const Tensor& input, const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const at::OptionalArrayRef bias_sizes_opt, const bool need_backward, const ConvParams& params) { @@ -1304,7 +1304,7 @@ ConvBackend _select_conv_backend( // Selects a backend for convolution based on the inputs and params. ConvBackend select_conv_backend( - const Tensor& input_r, const Tensor& weight_r, const c10::optional& bias_opt, + const Tensor& input_r, const Tensor& weight_r, const std::optional& bias_opt, SymIntArrayRef stride_, SymIntArrayRef padding_, SymIntArrayRef dilation_, bool transposed_, SymIntArrayRef output_padding_, c10::SymInt groups_, const at::OptionalSymIntArrayRef bias_sizes_opt) { c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); @@ -1339,7 +1339,7 @@ ConvBackend select_conv_backend( weight = view4d(weight); } - auto bias_sizes = bias.defined() ? c10::optional(bias.sym_sizes()) : bias_sizes_opt; + auto bias_sizes = bias.defined() ? std::optional(bias.sym_sizes()) : bias_sizes_opt; bool need_backward = GradMode::is_enabled() && (input.requires_grad() || weight.requires_grad() || (bias.defined() && bias.requires_grad())); return _select_conv_backend(input, weight, bias, bias_sizes, need_backward, params); @@ -1461,7 +1461,7 @@ at::MemoryFormat _determine_backend_memory_format( } at::Tensor _convolution( - const Tensor& input_r, const Tensor& weight_r, const c10::optional& bias_r_opt, + const Tensor& input_r, const Tensor& weight_r, const std::optional& bias_r_opt, IntArrayRef stride_, IntArrayRef padding_, IntArrayRef dilation_, bool transposed_, IntArrayRef output_padding_, int64_t groups_, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) { @@ -1504,7 +1504,7 @@ at::Tensor _convolution( } // Select appropriate backend to use. - auto bias_sizes_opt = bias.defined() ? c10::optional(bias.sizes()) : c10::nullopt; + auto bias_sizes_opt = bias.defined() ? 
std::optional(bias.sizes()) : c10::nullopt; bool need_backward = GradMode::is_enabled() && (input.requires_grad() || weight.requires_grad() || (bias.defined() && bias.requires_grad())); ConvBackend backend = _select_conv_backend(input, weight, bias, c10::OptionalIntArrayRef(bias_sizes_opt), need_backward, params); @@ -1701,7 +1701,7 @@ at::Tensor _convolution( } at::Tensor _convolution( - const Tensor& input_r, const Tensor& weight_r, const c10::optional& bias_r_opt, + const Tensor& input_r, const Tensor& weight_r, const std::optional& bias_r_opt, IntArrayRef stride_, IntArrayRef padding_, IntArrayRef dilation_, bool transposed_, IntArrayRef output_padding_, int64_t groups_, bool benchmark, bool deterministic, bool cudnn_enabled) @@ -1730,7 +1730,7 @@ static Tensor subvariable(const Tensor& var, int dim, int groups, int g) { return result; } -std::tuple _convolution_double_backward( const c10::optional& ggI_opt, const c10::optional& ggW_r_opt, const c10::optional& ggb_opt, +std::tuple _convolution_double_backward( const std::optional& ggI_opt, const c10::optional& ggW_r_opt, const c10::optional& ggb_opt, const Tensor& gO_r, const Tensor& weight_r, const Tensor& input, IntArrayRef stride_, IntArrayRef padding_, IntArrayRef dilation_, bool transposed_, IntArrayRef output_padding_, int64_t groups_, diff --git a/aten/src/ATen/native/ConvolutionMM2d.cpp b/aten/src/ATen/native/ConvolutionMM2d.cpp index 6f8a3477c239c..686948584c728 100644 --- a/aten/src/ATen/native/ConvolutionMM2d.cpp +++ b/aten/src/ATen/native/ConvolutionMM2d.cpp @@ -538,7 +538,7 @@ static void slow_conv2d_backward_weight_out_cpu_template( Tensor& slow_conv2d_forward_out_cpu( const Tensor& self, const Tensor& weight_, - IntArrayRef kernel_size, const c10::optional& bias_opt, + IntArrayRef kernel_size, const std::optional& bias_opt, IntArrayRef stride, IntArrayRef padding, Tensor& output) { @@ -627,7 +627,7 @@ Tensor& slow_conv2d_forward_out_cpu( Tensor slow_conv2d_forward_cpu( const Tensor& self, const Tensor& weight, - IntArrayRef kernel_size, const c10::optional& bias_opt, + IntArrayRef kernel_size, const std::optional& bias_opt, IntArrayRef stride, IntArrayRef padding) { // See [Note: hacky wrapper removal for optional tensor] @@ -726,7 +726,7 @@ std::tuple slow_conv2d_backward_cpu( return std::make_tuple(grad_input, grad_weight, grad_bias); } -Tensor & thnn_conv2d_out(const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const c10::optional& bias_opt, IntArrayRef stride, IntArrayRef padding, Tensor & output) { +Tensor & thnn_conv2d_out(const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const std::optional& bias_opt, IntArrayRef stride, IntArrayRef padding, Tensor & output) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); const Tensor& bias = *bias_maybe_owned; @@ -734,7 +734,7 @@ Tensor & thnn_conv2d_out(const Tensor & self, const Tensor & weight, IntArrayRef return at::_slow_conv2d_forward_out(output, self, weight, kernel_size, bias, stride, padding); } -Tensor thnn_conv2d(const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const c10::optional& bias_opt, IntArrayRef stride, IntArrayRef padding) { +Tensor thnn_conv2d(const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const std::optional& bias_opt, IntArrayRef stride, IntArrayRef padding) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); 
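// Illustrative sketch, not part of the diff: the borrow/deref idiom behind the
// "[Note: hacky wrapper removal for optional tensor]" comments in these signatures.
// borrow_from_optional_tensor() borrows the Tensor without a refcount bump when the
// optional is populated and yields an undefined Tensor otherwise, so the body only
// needs bias.defined(). Assumes a build where c10::optional is std::optional (the
// point of this change); add_bias_if_present is a made-up helper.
#include <ATen/ATen.h>
#include <optional>

at::Tensor add_bias_if_present(const at::Tensor& x,
                               const std::optional<at::Tensor>& bias_opt) {
  c10::MaybeOwned<at::Tensor> bias_maybe_owned =
      at::borrow_from_optional_tensor(bias_opt);
  const at::Tensor& bias = *bias_maybe_owned;  // undefined Tensor when bias_opt is nullopt
  return bias.defined() ? x + bias : x;
}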
const Tensor& bias = *bias_maybe_owned; diff --git a/aten/src/ATen/native/ConvolutionMM3d.cpp b/aten/src/ATen/native/ConvolutionMM3d.cpp index 1d5e7a8333def..f361b3a819129 100644 --- a/aten/src/ATen/native/ConvolutionMM3d.cpp +++ b/aten/src/ATen/native/ConvolutionMM3d.cpp @@ -553,7 +553,7 @@ static void slow_conv3d_backward_parameters_out_cpu_template( Tensor& slow_conv3d_forward_out_cpu(const Tensor& self, const Tensor& weight, - IntArrayRef kernel_size, const c10::optional& bias_opt, + IntArrayRef kernel_size, const std::optional& bias_opt, IntArrayRef stride, IntArrayRef padding, Tensor& output) { @@ -668,7 +668,7 @@ Tensor& slow_conv3d_forward_out_cpu(const Tensor& self, Tensor slow_conv3d_forward_cpu( const Tensor& self, const Tensor& weight, - IntArrayRef kernel_size, const c10::optional& bias_opt, + IntArrayRef kernel_size, const std::optional& bias_opt, IntArrayRef stride, IntArrayRef padding) { // See [Note: hacky wrapper removal for optional tensor] @@ -771,7 +771,7 @@ std::tuple slow_conv3d_backward_cpu( Tensor& slow_conv3d_out(const Tensor& self, const Tensor& weight, - IntArrayRef kernel_size, const c10::optional& bias_opt, + IntArrayRef kernel_size, const std::optional& bias_opt, IntArrayRef stride, IntArrayRef padding, Tensor& output) { @@ -792,7 +792,7 @@ Tensor& slow_conv3d_out(const Tensor& self, Tensor slow_conv3d( const Tensor& self, const Tensor& weight, - IntArrayRef kernel_size, const c10::optional& bias_opt, + IntArrayRef kernel_size, const std::optional& bias_opt, IntArrayRef stride, IntArrayRef padding) { // See [Note: hacky wrapper removal for optional tensor] diff --git a/aten/src/ATen/native/Copy.cpp b/aten/src/ATen/native/Copy.cpp index e6aa8493905d3..c5f81e98906dd 100644 --- a/aten/src/ATen/native/Copy.cpp +++ b/aten/src/ATen/native/Copy.cpp @@ -325,7 +325,7 @@ Tensor copy(const Tensor& self, const Tensor& src, bool non_blocking) { // Instead, generate an empty tensor with the right sizes/strides, since we should be able to assume // that copy_() will fully overwrite all data with that of src if (self_storage->nbytes() == 0) { - r = at::empty_strided(self.sizes(), self.strides()); + r = at::empty_strided(self.sizes(), self.strides(), self.options()); } else { r = clone_preserve_strides(self); } diff --git a/aten/src/ATen/native/Correlation.cpp b/aten/src/ATen/native/Correlation.cpp index 95384684961a4..5482a8e0a597a 100644 --- a/aten/src/ATen/native/Correlation.cpp +++ b/aten/src/ATen/native/Correlation.cpp @@ -24,8 +24,8 @@ namespace at::native { Tensor cov( const Tensor& self, int64_t correction, - const c10::optional& fweights, - const c10::optional& aweights) { + const std::optional& fweights, + const std::optional& aweights) { constexpr int64_t OBSERVATIONS_DIM = 1; TORCH_CHECK( diff --git a/aten/src/ATen/native/Cross.cpp b/aten/src/ATen/native/Cross.cpp index 99f0760fcc0f4..7297aaed80d38 100644 --- a/aten/src/ATen/native/Cross.cpp +++ b/aten/src/ATen/native/Cross.cpp @@ -40,7 +40,7 @@ namespace at::native { DEFINE_DISPATCH(cross_stub); -static int64_t _default_cross_dim(const c10::optional &dimension, SymIntArrayRef sizes) { +static int64_t _default_cross_dim(const std::optional &dimension, SymIntArrayRef sizes) { // If dimension is not given, it defaults to the first dimension found with the size 3. // Note that this behaviour might be unexpected. 
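// Illustrative sketch, not part of the diff: what that implicit default means in
// practice. For shape {2, 3, 3} the legacy at::cross picks dim=1 (the first size-3
// dimension), which is why callers are nudged toward linalg_cross with an explicit
// dim. Assumes ATen headers; cross_dim_example is a made-up name.
#include <ATen/ATen.h>

void cross_dim_example() {
  auto a = at::randn({2, 3, 3});
  auto b = at::randn({2, 3, 3});
  auto legacy  = at::cross(a, b);                     // warns once; uses dim=1
  auto precise = at::linalg_cross(a, b, /*dim=*/-1);  // explicit and unambiguous
}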
// _default_cross_dim is called internally inside the cross implementation to calculate @@ -57,7 +57,7 @@ static int64_t _default_cross_dim(const c10::optional &dimension, SymIn TORCH_CHECK(false, "no dimension of size 3 in input"); } -Tensor cross(const Tensor & input, const Tensor & other, const c10::optional dimension) { +Tensor cross(const Tensor & input, const Tensor & other, const std::optional dimension) { if (!dimension) { TORCH_WARN_ONCE( "Using torch.cross without specifying the dim arg is deprecated.\n", @@ -69,7 +69,7 @@ Tensor cross(const Tensor & input, const Tensor & other, const c10::optional dimension, Tensor & out) { +Tensor & cross_out(const Tensor & input, const Tensor & other, const std::optional dimension, Tensor & out) { auto dim = _default_cross_dim(dimension, input.sym_sizes()); return at::linalg_cross_out(out, input, other, dim); } diff --git a/aten/src/ATen/native/Distance.cpp b/aten/src/ATen/native/Distance.cpp index 5af87802a1246..942461c7612c1 100644 --- a/aten/src/ATen/native/Distance.cpp +++ b/aten/src/ATen/native/Distance.cpp @@ -78,7 +78,7 @@ Tensor _euclidean_dist(const Tensor& x1, const Tensor& x2) { return result; } -static Tensor cdist_impl(const Tensor& x1, const Tensor& x2, const double p, c10::optional compute_mode) { +static Tensor cdist_impl(const Tensor& x1, const Tensor& x2, const double p, std::optional compute_mode) { TORCH_CHECK(at::isFloatingType(x1.scalar_type()), "cdist only supports floating-point dtypes, X1 got: ", x1.scalar_type()); auto device1 = x1.device().type(); TORCH_CHECK(at::isFloatingType(x2.scalar_type()), "cdist only supports floating-point dtypes, X2 got: ", x2.scalar_type()); @@ -147,7 +147,7 @@ static Tensor cdist_impl(const Tensor& x1, const Tensor& x2, const double p, c10 return result; } -Tensor cdist(const Tensor& x1, const Tensor& x2, const double p, c10::optional compute_mode) { +Tensor cdist(const Tensor& x1, const Tensor& x2, const double p, std::optional compute_mode) { TORCH_CHECK(x1.dim() >= 2, "cdist only supports at least 2D tensors, X1 got: ", x1.dim(), "D"); TORCH_CHECK(x2.dim() >= 2, "cdist only supports at least 2D tensors, X2 got: ", x2.dim(), "D"); TORCH_CHECK(x1.sym_size(-1) == x2.sym_size(-1), "X1 and X2 must have the same number of columns. X1: ", x1.sym_size(-1), " X2: ", x2.sym_size(-1)); @@ -175,7 +175,7 @@ Tensor cdist(const Tensor& x1, const Tensor& x2, const double p, c10::optional compute_mode) { +Tensor _cdist_forward(const Tensor& x1, const Tensor& x2, const double p, std::optional compute_mode) { TORCH_CHECK(x1.dim() >= 2, "cdist only supports at least 2D tensors, X1 got: ", x1.dim(), "D"); TORCH_CHECK(x2.dim() >= 2, "cdist only supports at least 2D tensors, X2 got: ", x2.dim(), "D"); TORCH_CHECK(x1.size(-1) == x2.size(-1), "X1 and X2 must have the same number of columns. 
X1: ", x1.size(-1), " X2: ", x2.size(-1)); diff --git a/aten/src/ATen/native/DistributionTemplates.h b/aten/src/ATen/native/DistributionTemplates.h index a5ed9526c270d..ba72f0df11a0a 100644 --- a/aten/src/ATen/native/DistributionTemplates.h +++ b/aten/src/ATen/native/DistributionTemplates.h @@ -81,7 +81,7 @@ int64_t update_to(int64_t to) { } template class random_kernel, typename RNG> -at::Tensor& random_impl(at::Tensor& self, c10::optional generator) { +at::Tensor& random_impl(at::Tensor& self, std::optional generator) { CHECK_EMPTY_AND_RETURN(self); auto iter = at::TensorIterator::borrowing_nullary_op(self); random_kernel()(iter, generator); @@ -132,7 +132,7 @@ static void check_from_to_in_range(int64_t from, int64_t to_inc, caffe2::TypeMet } template class random_from_to_kernel, typename RNG> -at::Tensor& random_from_to_impl(at::Tensor& self, int64_t from, c10::optional to_opt, c10::optional generator) { +at::Tensor& random_from_to_impl(at::Tensor& self, int64_t from, std::optional to_opt, c10::optional generator) { uint64_t range = 0; auto iter = at::TensorIterator::borrowing_nullary_op(self); if (to_opt.has_value()) { @@ -200,7 +200,7 @@ at::Tensor& random_from_to_impl(at::Tensor& self, int64_t from, c10::optional= 0.0, "normal expects std >= 0.0, but found std ", std); template class normal_kernel, typename RNG> -Tensor& normal_impl_(Tensor& self, double mean, double std, c10::optional gen) { +Tensor& normal_impl_(Tensor& self, double mean, double std, std::optional gen) { CHECK_NORMAL_STD(std); CHECK_EMPTY_AND_RETURN(self); @@ -216,7 +216,7 @@ Tensor& normal_impl_(Tensor& self, double mean, double std, c10::optional class normal_kernel, typename RNG> -Tensor& normal_out_impl(Tensor& output, const Tensor& mean, double std, c10::optional gen) { +Tensor& normal_out_impl(Tensor& output, const Tensor& mean, double std, std::optional gen) { CHECK_NORMAL_STD(std); auto std_tensor = at::empty_like(output, MemoryFormat::Contiguous); auto shape = at::infer_size(mean.sizes(), std_tensor.sizes()); @@ -227,7 +227,7 @@ Tensor& normal_out_impl(Tensor& output, const Tensor& mean, double std, c10::opt } template class normal_kernel, typename RNG> -Tensor& normal_out_impl(Tensor& output, double mean, const Tensor& std, c10::optional gen) { +Tensor& normal_out_impl(Tensor& output, double mean, const Tensor& std, std::optional gen) { CHECK_NORMAL_TENSOR_STD(std); auto mean_tensor = at::full({}, mean, output.options()); auto shape = at::infer_size(mean_tensor.sizes(), std.sizes()); @@ -242,7 +242,7 @@ Tensor& normal_out_impl(Tensor& output, double mean, const Tensor& std, c10::opt } template class normal_kernel, typename RNG> -Tensor& normal_out_impl(Tensor& output, const Tensor& mean, const Tensor& std, c10::optional gen) { +Tensor& normal_out_impl(Tensor& output, const Tensor& mean, const Tensor& std, std::optional gen) { CHECK_NORMAL_TENSOR_STD(std); auto shape = at::infer_size(mean.sizes(), std.sizes()); at::native::resize_output(output, shape); @@ -256,7 +256,7 @@ Tensor& normal_out_impl(Tensor& output, const Tensor& mean, const Tensor& std, c } template class normal_kernel, typename RNG> -Tensor normal_impl(const Tensor& mean, double std, c10::optional gen) { +Tensor normal_impl(const Tensor& mean, double std, std::optional gen) { CHECK_NORMAL_STD(std); Tensor ret = at::empty_like(mean, MemoryFormat::Contiguous); normal_out_impl(ret, mean, std, gen); @@ -264,7 +264,7 @@ Tensor normal_impl(const Tensor& mean, double std, c10::optional gen) } template class normal_kernel, typename RNG> -Tensor 
normal_impl(double mean, const Tensor& std, c10::optional gen) { +Tensor normal_impl(double mean, const Tensor& std, std::optional gen) { CHECK_NORMAL_TENSOR_STD(std); Tensor ret = at::empty_like(std, MemoryFormat::Contiguous); normal_out_impl(ret, mean, std, gen); @@ -272,7 +272,7 @@ Tensor normal_impl(double mean, const Tensor& std, c10::optional gen) } template class normal_kernel, typename RNG> -Tensor normal_impl(const Tensor& mean, const Tensor& std, c10::optional gen) { +Tensor normal_impl(const Tensor& mean, const Tensor& std, std::optional gen) { CHECK_NORMAL_TENSOR_STD(std); auto shape = at::infer_size(mean.sizes(), std.sizes()); Tensor ret = at::empty(shape, mean.options(), MemoryFormat::Contiguous); @@ -283,7 +283,7 @@ Tensor normal_impl(const Tensor& mean, const Tensor& std, c10::optional class uniform_kernel, typename RNG> -at::Tensor& uniform_impl_(at::Tensor& self, double from, double to, c10::optional generator) { +at::Tensor& uniform_impl_(at::Tensor& self, double from, double to, std::optional generator) { if (self.is_complex()) { CHECK_EMPTY_AND_RETURN(self); auto float_tensor = at::view_as_real(self); @@ -313,7 +313,7 @@ at::Tensor& uniform_impl_(at::Tensor& self, double from, double to, c10::optiona // ================================================== LogNormal ======================================================= template class log_normal_kernel, typename RNG> -at::Tensor& log_normal_impl_(at::Tensor& self, double mean, double std, c10::optional gen) { +at::Tensor& log_normal_impl_(at::Tensor& self, double mean, double std, std::optional gen) { TORCH_CHECK(std > 0.0, "log_normal_ expects std > 0.0, but found std=", std); CHECK_EMPTY_AND_RETURN(self); auto iter = TensorIterator::borrowing_nullary_op(self); @@ -324,7 +324,7 @@ at::Tensor& log_normal_impl_(at::Tensor& self, double mean, double std, c10::opt // =================================================== Geometric ====================================================== template class geometric_kernel, typename RNG> -Tensor& geometric_impl_(Tensor& self, double p, c10::optional gen) { +Tensor& geometric_impl_(Tensor& self, double p, std::optional gen) { TORCH_CHECK(0 < p && p < 1, "geometric_ expects p to be in (0, 1), but got p=", p); CHECK_EMPTY_AND_RETURN(self); auto iter = TensorIterator::borrowing_nullary_op(self); @@ -335,7 +335,7 @@ Tensor& geometric_impl_(Tensor& self, double p, c10::optional gen) { // ================================================== Exponential ===================================================== template class exponential_kernel, typename RNG> -Tensor& exponential_impl_(Tensor& self, double lambda, c10::optional gen) { +Tensor& exponential_impl_(Tensor& self, double lambda, std::optional gen) { TORCH_CHECK(lambda > 0.0, "exponential_ expects lambda > 0.0, but found lambda=", lambda); CHECK_EMPTY_AND_RETURN(self); auto iter = TensorIterator::borrowing_nullary_op(self); @@ -346,7 +346,7 @@ Tensor& exponential_impl_(Tensor& self, double lambda, c10::optional // ==================================================== Cauchy ======================================================== template class cauchy_kernel, typename RNG> -Tensor& cauchy_impl_(Tensor& self, double median, double sigma, c10::optional gen) { +Tensor& cauchy_impl_(Tensor& self, double median, double sigma, std::optional gen) { // TODO: instead of variable name 'sigma', use 'gamma' or 'scale' // the variance, squared sigma, is undefined for cauchy distribution TORCH_CHECK(sigma > 0.0, "cauchy_ expects sigma > 0.0, but found 
sigma=", sigma); @@ -360,7 +360,7 @@ Tensor& cauchy_impl_(Tensor& self, double median, double sigma, c10::optional class bernoulli_tensor_kernel, typename RNG> -Tensor& bernoulli_impl_(Tensor& self, const Tensor& p_, c10::optional gen) { +Tensor& bernoulli_impl_(Tensor& self, const Tensor& p_, std::optional gen) { CHECK_EMPTY_AND_RETURN(self); NoNamesGuard guard; at::assert_no_internal_overlap(self); @@ -369,7 +369,7 @@ Tensor& bernoulli_impl_(Tensor& self, const Tensor& p_, c10::optional } template class bernoulli_scalar_kernel, typename RNG> -Tensor& bernoulli_impl_(Tensor& self, double p, c10::optional gen) { +Tensor& bernoulli_impl_(Tensor& self, double p, std::optional gen) { TORCH_CHECK(0 <= p && p <= 1, "bernoulli_ expects p to be in [0, 1], but got p=", p); CHECK_EMPTY_AND_RETURN(self); at::assert_no_internal_overlap(self); @@ -378,7 +378,7 @@ Tensor& bernoulli_impl_(Tensor& self, double p, c10::optional gen) { } template class bernoulli_tensor_kernel, typename RNG> -Tensor& bernoulli_out_impl(Tensor& result, const Tensor& self, c10::optional gen) { +Tensor& bernoulli_out_impl(Tensor& result, const Tensor& self, std::optional gen) { // result.resize_as_(self) requires self to have same dtype as result, so we // use resize_ instead. // TODO: Fix resize_as_. See pytorch/pytorch#11665. diff --git a/aten/src/ATen/native/Distributions.cpp b/aten/src/ATen/native/Distributions.cpp index 4d4eb2efaf401..7ecb8ebb9ffc8 100644 --- a/aten/src/ATen/native/Distributions.cpp +++ b/aten/src/ATen/native/Distributions.cpp @@ -160,36 +160,36 @@ DEFINE_DISPATCH(random_full_64_bits_range_stub); template struct BernoulliStub { - void operator()(Tensor& self, const Tensor& p_, c10::optional gen) { + void operator()(Tensor& self, const Tensor& p_, std::optional gen) { bernoulli_tensor_stub(self.device().type(), self, p_, gen); } - void operator()(Tensor& self, double p, c10::optional gen) { + void operator()(Tensor& self, double p, std::optional gen) { bernoulli_scalar_stub(self.device().type(), self, p, gen); } }; -Tensor bernoulli(const Tensor& self, c10::optional gen) { +Tensor bernoulli(const Tensor& self, std::optional gen) { Tensor result = at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT); result.bernoulli_(self, std::move(gen)); return result; } -Tensor bernoulli(const Tensor& self, double p, c10::optional gen) { +Tensor bernoulli(const Tensor& self, double p, std::optional gen) { Tensor result = at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT); result.bernoulli_(p, std::move(gen)); return result; } -Tensor& bernoulli_out(const Tensor& self, c10::optional gen, Tensor& result) { +Tensor& bernoulli_out(const Tensor& self, std::optional gen, Tensor& result) { return at::native::templates::bernoulli_out_impl(result, self, std::move(gen)); } -Tensor& bernoulli_(Tensor& self, const Tensor& p_, c10::optional gen) { +Tensor& bernoulli_(Tensor& self, const Tensor& p_, std::optional gen) { return at::native::templates::bernoulli_impl_(self, p_, std::move(gen)); } -Tensor& bernoulli_(Tensor& self, double p, c10::optional gen) { +Tensor& bernoulli_(Tensor& self, double p, std::optional gen) { return at::native::templates::bernoulli_impl_(self, p, std::move(gen)); } @@ -197,12 +197,12 @@ Tensor& bernoulli_(Tensor& self, double p, c10::optional gen) { template struct LogNormalStub { - void operator()(TensorIteratorBase& iter, double mean, double std, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double mean, double std, std::optional gen) { log_normal_stub(iter.device_type(), iter, 
mean, std, gen); } }; -Tensor& log_normal_(Tensor& self, double mean, double std, c10::optional gen) { +Tensor& log_normal_(Tensor& self, double mean, double std, std::optional gen) { return at::native::templates::log_normal_impl_(self, mean, std, std::move(gen)); } @@ -210,12 +210,12 @@ Tensor& log_normal_(Tensor& self, double mean, double std, c10::optional struct CauchyStub { - void operator()(TensorIteratorBase& iter, double median, double sigma, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double median, double sigma, std::optional gen) { cauchy_stub(iter.device_type(), iter, median, sigma, gen); } }; -Tensor& cauchy_(Tensor& self, double median, double sigma, c10::optional gen) { +Tensor& cauchy_(Tensor& self, double median, double sigma, std::optional gen) { return at::native::templates::cauchy_impl_(self, median, sigma, std::move(gen)); } @@ -223,12 +223,12 @@ Tensor& cauchy_(Tensor& self, double median, double sigma, c10::optional struct ExponentialStub { - void operator()(TensorIteratorBase& iter, double lambda, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double lambda, std::optional gen) { exponential_stub(iter.device_type(), iter, lambda, gen); } }; -Tensor& exponential_(Tensor& self, double lambda, c10::optional gen) { +Tensor& exponential_(Tensor& self, double lambda, std::optional gen) { return at::native::templates::exponential_impl_(self, lambda, std::move(gen)); } @@ -236,12 +236,12 @@ Tensor& exponential_(Tensor& self, double lambda, c10::optional gen) template struct GeometricStub { - void operator()(TensorIteratorBase& iter, double p, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double p, std::optional gen) { geometric_stub(iter.device_type(), iter, p, gen); } }; -Tensor& geometric_(Tensor& self, double p, c10::optional gen) { +Tensor& geometric_(Tensor& self, double p, std::optional gen) { return at::native::templates::geometric_impl_(self, p, std::move(gen)); } @@ -249,7 +249,7 @@ Tensor& geometric_(Tensor& self, double p, c10::optional gen) { template struct UniformStub { - void operator()(TensorIteratorBase& iter, double from, double to, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double from, double to, std::optional gen) { uniform_stub(iter.device_type(), iter, from, to, gen); } }; @@ -257,15 +257,15 @@ struct UniformStub { template struct UniformMeta { // No-op! - void operator()(TensorIteratorBase& iter, double from, double to, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double from, double to, std::optional gen) { } }; -Tensor& uniform_(Tensor& self, double from, double to, c10::optional gen) { +Tensor& uniform_(Tensor& self, double from, double to, std::optional gen) { return at::native::templates::uniform_impl_(self, from, to, std::move(gen)); } -Tensor& uniform_meta_(Tensor& self, double from, double to, c10::optional gen) { +Tensor& uniform_meta_(Tensor& self, double from, double to, std::optional gen) { return at::native::templates::uniform_impl_(self, from, to, std::move(gen)); } @@ -273,7 +273,7 @@ Tensor& uniform_meta_(Tensor& self, double from, double to, c10::optional struct NormalStub { - void operator()(Tensor& self, double mean, double std, c10::optional gen) { + void operator()(Tensor& self, double mean, double std, std::optional gen) { normal_stub(self.device().type(), self, mean, std, gen); } }; @@ -281,76 +281,76 @@ struct NormalStub { template struct NormalMeta { // No-op! 
- void operator()(Tensor& self, double mean, double std, c10::optional gen) { + void operator()(Tensor& self, double mean, double std, std::optional gen) { } }; // inplace -Tensor& normal_(Tensor& self, double mean, double std, c10::optional gen) { +Tensor& normal_(Tensor& self, double mean, double std, std::optional gen) { return at::native::templates::normal_impl_(self, mean, std, std::move(gen)); } -Tensor& normal_meta_(Tensor& self, double mean, double std, c10::optional gen) { +Tensor& normal_meta_(Tensor& self, double mean, double std, std::optional gen) { return at::native::templates::normal_impl_(self, mean, std, std::move(gen)); } // out tensor float -Tensor& normal_out(const Tensor& mean, double std, c10::optional gen, Tensor& output) { +Tensor& normal_out(const Tensor& mean, double std, std::optional gen, Tensor& output) { return at::native::templates::normal_out_impl(output, mean, std, std::move(gen)); } -Tensor& normal_out_meta(const Tensor& mean, double std, c10::optional gen, Tensor& output) { +Tensor& normal_out_meta(const Tensor& mean, double std, std::optional gen, Tensor& output) { return at::native::templates::normal_out_impl(output, mean, std, std::move(gen)); } // out float tensor -Tensor& normal_out(double mean, const Tensor& std, c10::optional gen, Tensor& output) { +Tensor& normal_out(double mean, const Tensor& std, std::optional gen, Tensor& output) { return at::native::templates::normal_out_impl(output, mean, std, std::move(gen)); } -Tensor& normal_out_meta(double mean, const Tensor& std, c10::optional gen, Tensor& output) { +Tensor& normal_out_meta(double mean, const Tensor& std, std::optional gen, Tensor& output) { return at::native::templates::normal_out_impl(output, mean, std, std::move(gen)); } // out tensor tensor -Tensor& normal_out(const Tensor& mean, const Tensor& std, c10::optional gen, Tensor& output) { +Tensor& normal_out(const Tensor& mean, const Tensor& std, std::optional gen, Tensor& output) { return at::native::templates::normal_out_impl(output, mean, std, std::move(gen)); } -Tensor& normal_out_meta(const Tensor& mean, const Tensor& std, c10::optional gen, Tensor& output) { +Tensor& normal_out_meta(const Tensor& mean, const Tensor& std, std::optional gen, Tensor& output) { return at::native::templates::normal_out_impl(output, mean, std, std::move(gen)); } // functional tensor float -Tensor normal(const Tensor& mean, double std, c10::optional gen) { +Tensor normal(const Tensor& mean, double std, std::optional gen) { return at::native::templates::normal_impl(mean, std, std::move(gen)); } -Tensor normal_meta(const Tensor& mean, double std, c10::optional gen) { +Tensor normal_meta(const Tensor& mean, double std, std::optional gen) { return at::native::templates::normal_impl(mean, std, std::move(gen)); } // functional float tensor -Tensor normal(double mean, const Tensor& std, c10::optional gen) { +Tensor normal(double mean, const Tensor& std, std::optional gen) { return at::native::templates::normal_impl(mean, std, std::move(gen)); } -Tensor normal_meta(double mean, const Tensor& std, c10::optional gen) { +Tensor normal_meta(double mean, const Tensor& std, std::optional gen) { return at::native::templates::normal_impl(mean, std, std::move(gen)); } // functional tensor tensor -Tensor normal(const Tensor& mean, const Tensor& std, c10::optional gen) { +Tensor normal(const Tensor& mean, const Tensor& std, std::optional gen) { return at::native::templates::normal_impl(mean, std, std::move(gen)); } -Tensor normal_meta(const Tensor& mean, const Tensor& std, 
c10::optional gen) { +Tensor normal_meta(const Tensor& mean, const Tensor& std, std::optional gen) { return at::native::templates::normal_impl(mean, std, std::move(gen)); } // functional variant, only used by the functionalization pass. -Tensor normal_functional(const Tensor& self, double mean, double std, c10::optional generator) { +Tensor normal_functional(const Tensor& self, double mean, double std, std::optional generator) { return self.clone().normal_(mean, std, std::move(generator)); } @@ -358,44 +358,44 @@ Tensor normal_functional(const Tensor& self, double mean, double std, c10::optio template struct RandomStub { - void operator()(TensorIteratorBase& iter, c10::optional gen) { + void operator()(TensorIteratorBase& iter, std::optional gen) { random_stub(iter.device_type(), iter, gen); } }; -Tensor& random_(Tensor& self, c10::optional gen) { +Tensor& random_(Tensor& self, std::optional gen) { return at::native::templates::random_impl(self, std::move(gen)); } template struct RandomFromToStub { - void operator()(TensorIteratorBase& iter, uint64_t range, int64_t from, c10::optional gen) { + void operator()(TensorIteratorBase& iter, uint64_t range, int64_t from, std::optional gen) { random_from_to_stub(iter.device_type(), iter, range, from, gen); } - void operator()(TensorIteratorBase& iter, c10::optional gen) { + void operator()(TensorIteratorBase& iter, std::optional gen) { random_full_64_bits_range_stub(iter.device_type(), iter, gen); } }; -Tensor& random_(Tensor& self, int64_t from, optional to, c10::optional gen) { +Tensor& random_(Tensor& self, int64_t from, optional to, std::optional gen) { return at::native::templates::random_from_to_impl(self, from, to, std::move(gen)); } -Tensor& random_(Tensor& self, int64_t to, c10::optional gen) { +Tensor& random_(Tensor& self, int64_t to, std::optional gen) { return random_(self, 0, to, std::move(gen)); } -Tensor& random_meta_(Tensor& self, c10::optional gen) { +Tensor& random_meta_(Tensor& self, std::optional gen) { // No error checking yay return self; } -Tensor& random_meta_(Tensor& self, int64_t from, optional to, c10::optional gen) { +Tensor& random_meta_(Tensor& self, int64_t from, optional to, std::optional gen) { // No error checking yay return self; } -Tensor& random_meta_(Tensor& self, int64_t to, c10::optional gen) { +Tensor& random_meta_(Tensor& self, int64_t to, std::optional gen) { // No error checking yay return self; } @@ -437,7 +437,7 @@ Tensor _dirichlet_grad_cpu(const Tensor& x, const Tensor& alpha, const Tensor& t * This section is a counterpart to Distributions.cu */ -Tensor _s_binomial_cpu(const Tensor& count, const Tensor& prob, c10::optional gen) { +Tensor _s_binomial_cpu(const Tensor& count, const Tensor& prob, std::optional gen) { Tensor ret = at::zeros(count.sizes(), count.options()); auto iter = TensorIteratorConfig() .add_output(ret) @@ -462,7 +462,7 @@ Tensor _s_binomial_cpu(const Tensor& count, const Tensor& prob, c10::optional gen) { +Tensor _s_poisson_cpu(const Tensor& lambda, std::optional gen) { Tensor ret = at::zeros(lambda.sizes(), lambda.options()); auto iter = TensorIteratorConfig() .add_output(ret) @@ -479,7 +479,7 @@ Tensor _s_poisson_cpu(const Tensor& lambda, c10::optional gen) { return ret; } -Tensor _s_gamma_cpu(const Tensor& alpha, c10::optional gen) { +Tensor _s_gamma_cpu(const Tensor& alpha, std::optional gen) { Tensor ret = at::zeros(alpha.sizes(), alpha.options()); auto iter = TensorIteratorConfig() .add_output(ret) @@ -509,7 +509,7 @@ Tensor _s_gamma_cpu(const Tensor& alpha, c10::optional 
gen) { return ret; } -Tensor _s_dirichlet_cpu(const Tensor& alpha, c10::optional gen) { +Tensor _s_dirichlet_cpu(const Tensor& alpha, std::optional gen) { Tensor ret = at::zeros(alpha.sizes(), alpha.options()); AT_DISPATCH_FLOATING_TYPES(ret.scalar_type(), "dirichlet", [&] { Tensor gamma = at::zeros(alpha.sizes(), alpha.options().dtype(ScalarType::Double)); @@ -562,7 +562,7 @@ constexpr int64_t FLOAT32_MAX_CONSECUTIVE_INT = 1 << (FLT_MANT_DIG); Tensor& multinomial_out(const Tensor& self, int64_t n_sample, bool with_replacement, - c10::optional gen, + std::optional gen, Tensor& result) { TORCH_CHECK( result.device() == self.device(), @@ -647,7 +647,7 @@ Tensor multinomial( const Tensor& self, int64_t n_sample, bool with_replacement, - c10::optional gen) { + std::optional gen) { Tensor result = at::empty({0}, self.options().dtype(kLong)); native::multinomial_out(self, n_sample, with_replacement, std::move(gen), result); return result; diff --git a/aten/src/ATen/native/Dropout.cpp b/aten/src/ATen/native/Dropout.cpp index 7014ec65d1f5a..8a5d4a702a0ca 100644 --- a/aten/src/ATen/native/Dropout.cpp +++ b/aten/src/ATen/native/Dropout.cpp @@ -102,7 +102,7 @@ ALIAS_SPECIALIZATION(_feature_alpha_dropout, true, true ) } // anonymous namespace std::tuple -native_dropout_cpu(const Tensor& input, double p, c10::optional train) { +native_dropout_cpu(const Tensor& input, double p, std::optional train) { if (input.numel() == 0) { return std::make_tuple(input, at::empty_like(input, input.options())); } diff --git a/aten/src/ATen/native/Embedding.cpp b/aten/src/ATen/native/Embedding.cpp index 705b08ab39f06..b0c4644e579c2 100644 --- a/aten/src/ATen/native/Embedding.cpp +++ b/aten/src/ATen/native/Embedding.cpp @@ -88,7 +88,7 @@ Tensor embedding_sparse_backward( Tensor indices = indices_; Tensor grad = grad_; if (padding_idx != -1) { - c10::List> c({indices != padding_idx}); + c10::List> c({indices != padding_idx}); indices = indices.index(c); grad = grad.index(c); } diff --git a/aten/src/ATen/native/EmbeddingBag.cpp b/aten/src/ATen/native/EmbeddingBag.cpp index 8b6c90dae2375..216fad05dc07f 100644 --- a/aten/src/ATen/native/EmbeddingBag.cpp +++ b/aten/src/ATen/native/EmbeddingBag.cpp @@ -103,7 +103,7 @@ bool is_fast_path_index_select_scale(const Tensor& src, const Tensor& scale, Ten } template -bool is_fast_path(const Tensor& src, const c10::optional& scale, Tensor& output, index_t padding_idx) { +bool is_fast_path(const Tensor& src, const std::optional& scale, Tensor& output, index_t padding_idx) { return (scale.has_value() && scale.value().defined()) ? is_fast_path_index_select_scale(src, scale.value(), output, padding_idx) : is_fast_path_index_select(src, output, padding_idx); @@ -891,7 +891,7 @@ void check_arguments( const Tensor& indices, const Tensor& offsets, const int64_t mode, - const c10::optional& per_sample_weights, + const std::optional& per_sample_weights, bool include_last_offset) { auto indices_arg = TensorArg(indices, "indices", 1); checkScalarTypes("embedding_bag", indices_arg, {kLong, kInt}); @@ -985,7 +985,7 @@ void make_offset2bag_out( const Tensor& indices, const Tensor& offsets, const int64_t mode, - const c10::optional& per_sample_weights, + const std::optional& per_sample_weights, const int64_t padding_idx) { // To save compute, if we are going to go down the fast path case for the 'sum' // mode, we skip calculating offset2bag, since it is not going to be used. 
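// Illustrative sketch, not part of the diff: the convention behind the
// std::optional<Tensor> per_sample_weights arguments in the embedding_bag paths
// above. An argument may be absent (std::nullopt) or present but undefined, and
// both mean "no weights", which is why call sites test has_value() and defined()
// together. has_per_sample_weights is a made-up helper for illustration.
#include <ATen/ATen.h>
#include <optional>

bool has_per_sample_weights(const std::optional<at::Tensor>& per_sample_weights) {
  return per_sample_weights.has_value() && per_sample_weights->defined();
}

void usage() {
  bool none      = has_per_sample_weights(std::nullopt);   // false
  bool undefined = has_per_sample_weights(at::Tensor());   // false: present but undefined
  bool weighted  = has_per_sample_weights(at::ones({4}));  // true
}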
@@ -1040,7 +1040,7 @@ static Tensor make_offset2bag( const Tensor& indices, const Tensor& offsets, const int64_t mode, - const c10::optional& per_sample_weights, + const std::optional& per_sample_weights, const int64_t padding_idx) { Tensor offset2bag = at::empty({0}, offsets.options()); make_offset2bag_out(offset2bag, output, weight, indices, offsets, mode, per_sample_weights, padding_idx); @@ -1144,7 +1144,7 @@ void _embedding_bag_cpu_impl_out(Tensor& output, Tensor& offset2bag, Tensor& bag_size, Tensor* max_indices, const Tensor &weight, const Tensor &indices, const Tensor &offsets, const int64_t mode, - const c10::optional& per_sample_weights, + const std::optional& per_sample_weights, bool include_last_offset, int64_t padding_idx, _EmbeddingBagKernelCache* fbgemm_kernel_cache) { if (mode == MODE_MEAN || mode == MODE_SUM) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, weight.scalar_type(), "embedding_bag_no_grad_cpu_out", @@ -1241,8 +1241,8 @@ static std::tuple _embedding_bag_cpu_impl( std::tuple embedding_bag(const Tensor &weight, const Tensor &indices, const Tensor &offsets, const bool scale_grad_by_freq, - const int64_t mode, bool sparse, const c10::optional& per_sample_weights_opt, - bool include_last_offset, c10::optional padding_idx_opt) { + const int64_t mode, bool sparse, const std::optional& per_sample_weights_opt, + bool include_last_offset, std::optional padding_idx_opt) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned per_sample_weights_maybe_owned = at::borrow_from_optional_tensor(per_sample_weights_opt); const Tensor& per_sample_weights = *per_sample_weights_maybe_owned; @@ -1273,7 +1273,7 @@ embedding_bag(const Tensor &weight, const Tensor &indices, std::tuple embedding_bag(const Tensor &weight, const Tensor &indices, const Tensor &offsets, const bool scale_grad_by_freq, - const int64_t mode, bool sparse, const c10::optional& per_sample_weights_opt, + const int64_t mode, bool sparse, const std::optional& per_sample_weights_opt, bool include_last_offset) { return at::native::embedding_bag(weight, indices, offsets, scale_grad_by_freq, mode, sparse, per_sample_weights_opt, include_last_offset, c10::nullopt); @@ -1284,7 +1284,7 @@ embedding_bag(const Tensor &weight, const Tensor &indices, std::tuple _embedding_bag_forward_only_cpu(const Tensor &weight, const Tensor &indices, const Tensor &offsets, const bool scale_grad_by_freq, - const int64_t mode, bool sparse, const c10::optional& per_sample_weights_opt, bool include_last_offset, + const int64_t mode, bool sparse, const std::optional& per_sample_weights_opt, bool include_last_offset, int64_t padding_idx) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned per_sample_weights_maybe_owned = at::borrow_from_optional_tensor(per_sample_weights_opt); @@ -1307,7 +1307,7 @@ _embedding_bag_forward_only_cpu(const Tensor &weight, const Tensor &indices, std::tuple _embedding_bag_cpu(const Tensor &weight, const Tensor &indices, const Tensor &offsets, const bool scale_grad_by_freq, - const int64_t mode, bool sparse, const c10::optional& per_sample_weights_opt, bool include_last_offset, + const int64_t mode, bool sparse, const std::optional& per_sample_weights_opt, bool include_last_offset, int64_t padding_idx) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned per_sample_weights_maybe_owned = at::borrow_from_optional_tensor(per_sample_weights_opt); @@ -1337,9 +1337,9 @@ void _embedding_bag_cpu_out( const bool /* scale_grad_by_freq 
*/, const int64_t mode, const bool /* sparse */, - const c10::optional& per_sample_weights, + const std::optional& per_sample_weights, const bool include_last_offset, - const c10::optional& padding_idx, + const std::optional& padding_idx, _EmbeddingBagKernelCache* fbgemm_kernel_cache) { auto [indicesMaybeOwned, offsetsMaybeOwned] = promoteIndicesAndOffsets(indices_, offsets_); const auto& indices = *indicesMaybeOwned; @@ -1393,7 +1393,7 @@ Tensor _embedding_bag_backward(const Tensor &grad, const Tensor &indices_, const Tensor &max_indices_, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, - bool sparse, const c10::optional& per_sample_weights_opt, + bool sparse, const std::optional& per_sample_weights_opt, int64_t padding_idx) { return at::native::_embedding_bag_backward_symint( grad, indices_, offsets_, offset2bag, bag_size_, max_indices_, num_weights, scale_grad_by_freq, mode, sparse, per_sample_weights_opt, padding_idx); @@ -1408,7 +1408,7 @@ Tensor _embedding_bag_backward_symint(const Tensor &grad, const Tensor &indices_ const Tensor &max_indices_, c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, - bool sparse, const c10::optional& per_sample_weights_opt, + bool sparse, const std::optional& per_sample_weights_opt, int64_t padding_idx) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned per_sample_weights_maybe_owned = at::borrow_from_optional_tensor(per_sample_weights_opt); @@ -1610,7 +1610,7 @@ Tensor _embedding_bag_dense_backward_cpu(const Tensor &grad_, const Tensor &indi const Tensor &offset2bag__, const Tensor &bag_size_, const Tensor& max_indices_, int64_t num_weights, - bool scale_grad_by_freq, int64_t mode, const c10::optional& per_sample_weights__opt, + bool scale_grad_by_freq, int64_t mode, const std::optional& per_sample_weights__opt, int64_t padding_idx) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned per_sample_weights__maybe_owned = at::borrow_from_optional_tensor(per_sample_weights__opt); @@ -1765,7 +1765,7 @@ Tensor _embedding_bag_per_sample_weights_backward_cpu( Tensor _embedding_bag_sparse_backward_symint( const Tensor &grad_, const Tensor &indices, const Tensor &offsets, const Tensor &offset2bag, const Tensor &bag_size_, SymInt num_weights, - bool scale_grad_by_freq, int64_t mode, const c10::optional& per_sample_weights_opt, + bool scale_grad_by_freq, int64_t mode, const std::optional& per_sample_weights_opt, int64_t padding_idx) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned per_sample_weights_maybe_owned = at::borrow_from_optional_tensor(per_sample_weights_opt); diff --git a/aten/src/ATen/native/EmbeddingBag.h b/aten/src/ATen/native/EmbeddingBag.h index c2e61f280bf59..796127f0441ee 100644 --- a/aten/src/ATen/native/EmbeddingBag.h +++ b/aten/src/ATen/native/EmbeddingBag.h @@ -13,7 +13,7 @@ void check_arguments( const Tensor& indices, const Tensor& offsets, const int64_t mode, - const c10::optional& per_sample_weights, + const std::optional& per_sample_weights, bool include_last_offset); void make_bag_size_out( @@ -40,7 +40,7 @@ void make_offset2bag_out( const Tensor& indices, const Tensor& offsets, const int64_t mode, - const c10::optional& per_sample_weights, + const std::optional& per_sample_weights, const int64_t padding_idx = -1); #ifdef USE_FBGEMM @@ -64,7 +64,7 @@ struct _CallbackAndBlockSize { _CallbackAndBlockSize() = default; - explicit _CallbackAndBlockSize(c10::optional maybe_block_size) + explicit _CallbackAndBlockSize(std::optional maybe_block_size) : 
blockSize(maybe_block_size.value_or(-1)) , callback(maybe_block_size.has_value() ? generateCallback(maybe_block_size.value()) : nullptr) {} @@ -75,7 +75,7 @@ struct _EmbeddingBagKernelCacheImpl : private StorageMixins... { _EmbeddingBagKernelCacheImpl() = default; // use each of the mixins to store corresponding kernel and block size - explicit _EmbeddingBagKernelCacheImpl(c10::optional maybe_block_size) + explicit _EmbeddingBagKernelCacheImpl(std::optional maybe_block_size) : StorageMixins(maybe_block_size)... {} @@ -107,7 +107,7 @@ using _EmbeddingBagKernelCache = _EmbeddingBagKernelCacheImpl< _CallbackAndBlockSize>; #else struct _EmbeddingBagKernelCache { - explicit _EmbeddingBagKernelCache(c10::optional /* maybe_block_size */) {} + explicit _EmbeddingBagKernelCache(std::optional /* maybe_block_size */) {} }; #endif @@ -115,7 +115,7 @@ void _embedding_bag_cpu_impl_out(Tensor& output, Tensor& offset2bag, Tensor& bag_size, Tensor* max_indices, const Tensor &weight, const Tensor &indices, const Tensor &offsets, const int64_t mode = 0, - const c10::optional& per_sample_weights = c10::nullopt, + const std::optional& per_sample_weights = c10::nullopt, bool include_last_offset = false, int64_t padding_idx = -1, _EmbeddingBagKernelCache* fbgemm_kernel_cache = nullptr); @@ -131,9 +131,9 @@ void _embedding_bag_cpu_out( const bool scale_grad_by_freq, const int64_t mode, const bool sparse, - const c10::optional& per_sample_weights, + const std::optional& per_sample_weights, const bool include_last_offset, - const c10::optional& padding_idx, + const std::optional& padding_idx, _EmbeddingBagKernelCache* fbgemm_kernel_cache = nullptr); } // namespace at::native diff --git a/aten/src/ATen/native/ForeachUtils.h b/aten/src/ATen/native/ForeachUtils.h index f44ae1179de8f..0839dd9a1560c 100644 --- a/aten/src/ATen/native/ForeachUtils.h +++ b/aten/src/ATen/native/ForeachUtils.h @@ -258,7 +258,7 @@ inline bool can_use_fast_route( using DeviceDtypeKey = std::pair; using IndicesT = std::vector; using nested_optional_tensorvec_t = - std::vector>>; + std::vector>>; using TensorsAndIndicesT = std::pair; using FlatMap = std::unordered_map< DeviceDtypeKey, @@ -339,7 +339,7 @@ inline FlatMap _group_tensors_by_first_tensors_device_and_dtype( nested_optional_tensorvec_t nested_tensorvec; nested_tensorvec.reserve(num_lists); for (const auto& i : c10::irange(num_lists)) { - std::vector> tensors; + std::vector> tensors; if (!nested_tensorlist[i].empty()) { // NB: num_tensors is the max possible length for any of // the inner lists of tensor references. Reserving the max diff --git a/aten/src/ATen/native/FusedAdagrad.cpp b/aten/src/ATen/native/FusedAdagrad.cpp deleted file mode 100644 index 1c5f553e6854c..0000000000000 --- a/aten/src/ATen/native/FusedAdagrad.cpp +++ /dev/null @@ -1,59 +0,0 @@ -#define TORCH_ASSERT_ONLY_METHOD_OPERATORS -#include -#include -#include - -#ifndef AT_PER_OPERATOR_HEADERS -#include -#include -#else -#include -#include -#endif -namespace at { - -namespace native { - -void _fused_adagrad_kernel_cpu_( - at::TensorList params, - at::TensorList grads, - at::TensorList state_sums, - at::TensorList state_steps, - const double lr, - const double lr_decay, - const double weight_decay, - const double eps, - const bool maximize, - const c10::optional& grad_scale, - const c10::optional& found_inf) { - const float* grad_scale_ptr = - grad_scale.has_value() ? grad_scale->data_ptr() : nullptr; - const float* found_inf_ptr = - found_inf.has_value() ? 
found_inf->data_ptr() : nullptr; - if (found_inf_ptr && *found_inf_ptr == 1.0) { - return; - } - size_t n_tensors = params.size(); - TORCH_CHECK(grads.size() == n_tensors); - TORCH_CHECK(state_sums.size() == n_tensors); - TORCH_CHECK(state_steps.size() == n_tensors); - for (size_t i = 0; i < n_tensors; i++){ - fused_adagrad_stub( - kCPU, - params[i], - grads[i], - state_sums[i], - state_steps[i], - lr, - lr_decay, - weight_decay, - eps, - maximize, - grad_scale_ptr); - } -} - -DEFINE_DISPATCH(fused_adagrad_stub); - -} -} diff --git a/aten/src/ATen/native/FusedAdagrad.h b/aten/src/ATen/native/FusedAdagrad.h deleted file mode 100644 index 395cbdd43aa81..0000000000000 --- a/aten/src/ATen/native/FusedAdagrad.h +++ /dev/null @@ -1,23 +0,0 @@ -#include -#include - -namespace at { - -namespace native { - -using fused_adagrad_fn = void (*)( - const at::Tensor& param, - const at::Tensor& grad, - const at::Tensor& state_sum, - const at::Tensor& state_step, - const double lr, - const double lr_decay, - const double weight_decay, - const double eps, - const bool maximize, - const float* grad_scale_ptr); - -DECLARE_DISPATCH(fused_adagrad_fn, fused_adagrad_stub); - -} -} diff --git a/aten/src/ATen/native/FusedAdam.cpp b/aten/src/ATen/native/FusedAdam.cpp index b3be769b24f18..41ef04b02d548 100644 --- a/aten/src/ATen/native/FusedAdam.cpp +++ b/aten/src/ATen/native/FusedAdam.cpp @@ -30,8 +30,8 @@ void _fused_adam_kernel_cpu_( const double eps, const bool amsgrad, const bool maximize, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { const float* grad_scale_ptr = grad_scale.has_value() ? grad_scale->data_ptr() : nullptr; const float* found_inf_ptr = @@ -87,8 +87,8 @@ void _fused_adam_kernel_cpu_( const double eps, const bool amsgrad, const bool maximize, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { _fused_adam_kernel_cpu_(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, lr.item(), beta1, beta2, weight_decay, eps, amsgrad, maximize, grad_scale, found_inf); } @@ -106,8 +106,8 @@ void _fused_adamw_kernel_cpu_( const double eps, const bool amsgrad, const bool maximize, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { const float* grad_scale_ptr = grad_scale.has_value() ? grad_scale->data_ptr() : nullptr; const float* found_inf_ptr = @@ -163,8 +163,8 @@ void _fused_adamw_kernel_cpu_( const double eps, const bool amsgrad, const bool maximize, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { _fused_adamw_kernel_cpu_(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, lr.item(), beta1, beta2, weight_decay, eps, amsgrad, maximize, grad_scale, found_inf); } diff --git a/aten/src/ATen/native/FusedSGD.cpp b/aten/src/ATen/native/FusedSGD.cpp index 56e2e91759113..2fb1f5af9e02f 100644 --- a/aten/src/ATen/native/FusedSGD.cpp +++ b/aten/src/ATen/native/FusedSGD.cpp @@ -26,8 +26,8 @@ void _fused_sgd_kernel_cpu_( const bool nesterov, const bool maximize, const bool is_first_step, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { const float* grad_scale_ptr = grad_scale.has_value() ? 
grad_scale->data_ptr() : nullptr; const float* found_inf_ptr = @@ -71,8 +71,8 @@ void _fused_sgd_kernel_cpu_( const bool nesterov, const bool maximize, const bool is_first_step, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { _fused_sgd_kernel_cpu_( params, grads, momentum_buffer_list, weight_decay, momentum, lr.item(), dampening, nesterov, diff --git a/aten/src/ATen/native/Histogram.cpp b/aten/src/ATen/native/Histogram.cpp index d5258866f8a34..9954edef94607 100644 --- a/aten/src/ATen/native/Histogram.cpp +++ b/aten/src/ATen/native/Histogram.cpp @@ -71,7 +71,7 @@ namespace { /* Checks properties of input tensors input, bins, and weight. */ -void histogramdd_check_inputs(const Tensor& input, const TensorList& bins, const c10::optional& weight) { +void histogramdd_check_inputs(const Tensor& input, const TensorList& bins, const std::optional& weight) { TORCH_CHECK(input.dim() >= 2, "torch.histogramdd: input tensor should have at least 2 dimensions, but got ", input.dim()); @@ -158,7 +158,7 @@ void histogramdd_prepare_out(const Tensor& input, TensorList bins, * assumes that input has already been reshaped to (M, N). */ std::pair, std::vector> -select_outer_bin_edges(const Tensor& input, c10::optional> range) { +select_outer_bin_edges(const Tensor& input, std::optional> range) { TORCH_INTERNAL_ASSERT(input.dim() == 2, "expected input to have shape (M, N)"); const int64_t N = input.size(-1); @@ -244,7 +244,7 @@ static std::vector allocate_bin_edges_tensors(const Tensor& self) { /* Versions of histogramdd in which bins is a Tensor[] defining the sequences of bin edges. */ static Tensor& histogramdd_out(const Tensor& self, TensorList bins, - const c10::optional& weight, bool density, + const std::optional& weight, bool density, Tensor& hist, TensorList& bin_edges) { histogramdd_check_inputs(self, bins, weight); histogramdd_prepare_out(self, bins, hist, bin_edges); @@ -258,7 +258,7 @@ static Tensor& histogramdd_out(const Tensor& self, TensorList bins, } Tensor _histogramdd(const Tensor& self, TensorList bins, - const c10::optional& weight, bool density) { + const std::optional& weight, bool density) { Tensor hist = at::empty({0}, self.options(), MemoryFormat::Contiguous); std::vector bin_edges_out = allocate_bin_edges_tensors(self); TensorList bin_edges_out_tl(bin_edges_out); @@ -271,8 +271,8 @@ Tensor _histogramdd(const Tensor& self, TensorList bins, * defining the number of bins in each dimension. 
*/ static std::vector& histogramdd_bin_edges_out(const Tensor& self, IntArrayRef bin_ct, - c10::optional> range, - const c10::optional& weight, bool density, + std::optional> range, + const std::optional& weight, bool density, std::vector& bin_edges_out) { TensorList bin_edges_out_tl(bin_edges_out); @@ -296,15 +296,15 @@ static std::vector& histogramdd_bin_edges_out(const Tensor& self, IntArr } std::vector histogramdd_bin_edges(const Tensor& self, IntArrayRef bin_ct, - c10::optional> range, - const c10::optional& weight, bool density) { + std::optional> range, + const std::optional& weight, bool density) { std::vector bin_edges_out = allocate_bin_edges_tensors(self); return histogramdd_bin_edges_out(self, bin_ct, range, weight, density, bin_edges_out); } static Tensor& histogramdd_out(const Tensor& self, IntArrayRef bin_ct, - c10::optional> range, - const c10::optional& weight, bool density, + std::optional> range, + const std::optional& weight, bool density, Tensor& hist, TensorList& bin_edges) { std::vector bins = histogramdd_bin_edges(self, bin_ct, range, weight, density); @@ -320,8 +320,8 @@ static Tensor& histogramdd_out(const Tensor& self, IntArrayRef bin_ct, } Tensor _histogramdd(const Tensor& self, IntArrayRef bin_ct, - c10::optional> range, - const c10::optional& weight, bool density) { + std::optional> range, + const std::optional& weight, bool density) { Tensor hist = at::empty({0}, self.options(), MemoryFormat::Contiguous); std::vector bin_edges_out = allocate_bin_edges_tensors(self); TensorList bin_edges_out_tl(bin_edges_out); @@ -334,10 +334,10 @@ Tensor _histogramdd(const Tensor& self, IntArrayRef bin_ct, */ std::tuple histogram_out(const Tensor& self, const Tensor& bins, - const c10::optional& weight, bool density, + const std::optional& weight, bool density, Tensor& hist, Tensor& bin_edges) { Tensor reshaped_self = self.reshape({ self.numel(), 1 }); - c10::optional reshaped_weight = weight.has_value() + std::optional reshaped_weight = weight.has_value() ? weight.value().reshape({ weight.value().numel() }) : weight; TensorList bins_in = bins; TensorList bins_out = bin_edges; @@ -349,7 +349,7 @@ histogram_out(const Tensor& self, const Tensor& bins, std::tuple histogram(const Tensor& self, const Tensor& bins, - const c10::optional& weight, bool density) { + const std::optional& weight, bool density) { Tensor hist = at::empty({0}, self.options(), MemoryFormat::Contiguous); Tensor bin_edges = at::empty({0}, bins.options(), MemoryFormat::Contiguous); return histogram_out(self, bins, weight, density, hist, bin_edges); @@ -358,11 +358,11 @@ histogram(const Tensor& self, const Tensor& bins, /* Versions of histogram in which bins is an integer specifying the number of equal-width bins. */ std::tuple -histogram_out(const Tensor& self, int64_t bin_ct, c10::optional> range, - const c10::optional& weight, bool density, +histogram_out(const Tensor& self, int64_t bin_ct, std::optional> range, + const std::optional& weight, bool density, Tensor& hist, Tensor& bin_edges) { Tensor reshaped_self = self.reshape({ self.numel(), 1 }); - c10::optional reshaped_weight = weight.has_value() + std::optional reshaped_weight = weight.has_value() ? 
weight.value().reshape({ weight.value().numel() }) : weight; TensorList bins_in = bin_edges; TensorList bins_out = bin_edges; @@ -378,8 +378,8 @@ histogram_out(const Tensor& self, int64_t bin_ct, c10::optional -histogram(const Tensor& self, int64_t bin_ct, c10::optional> range, - const c10::optional& weight, bool density) { +histogram(const Tensor& self, int64_t bin_ct, std::optional> range, + const std::optional& weight, bool density) { Tensor hist = at::empty({0}, self.options(), MemoryFormat::Contiguous); Tensor bin_edges_out = at::empty({0}, self.options()); return histogram_out(self, bin_ct, range, weight, density, hist, bin_edges_out); @@ -403,7 +403,7 @@ Tensor& histogram_histc_out(const Tensor& self, int64_t bin_ct, histogramdd_check_inputs(reshaped, bins_in, {}); histogramdd_linear_stub(reshaped.device().type(), reshaped, - c10::optional(), false, hist, bin_edges, false); + std::optional(), false, hist, bin_edges, false); return hist; } @@ -414,16 +414,16 @@ Tensor histogram_histc(const Tensor& self, int64_t bin_ct, } std::tuple> histogramdd( - const Tensor &self, TensorList bins, c10::optional> /*range*/, - const c10::optional &weight, bool density) { + const Tensor &self, TensorList bins, std::optional> /*range*/, + const std::optional &weight, bool density) { auto hist = at::_histogramdd_from_bin_tensors(self, bins, weight, density); return std::tuple>{ std::move(hist), bins.vec()}; } std::tuple> histogramdd( - const Tensor &self, IntArrayRef bins, c10::optional> range, - const c10::optional &weight, bool density) { + const Tensor &self, IntArrayRef bins, std::optional> range, + const std::optional &weight, bool density) { auto bin_edges = at::_histogramdd_bin_edges(self, bins, range, weight, density); auto hist = at::_histogramdd_from_bin_cts(self, bins, range, weight, density); return std::tuple>{ @@ -431,8 +431,8 @@ std::tuple> histogramdd( } std::tuple> histogramdd( - const Tensor &self, int64_t bins, c10::optional> range, - const c10::optional &weight, bool density) { + const Tensor &self, int64_t bins, std::optional> range, + const std::optional &weight, bool density) { DimVector bins_v(self.size(-1), bins); return at::native::histogramdd(self, bins_v, range, weight, density); } diff --git a/aten/src/ATen/native/Histogram.h b/aten/src/ATen/native/Histogram.h index cd19fa4691ad0..fee7e06b87258 100644 --- a/aten/src/ATen/native/Histogram.h +++ b/aten/src/ATen/native/Histogram.h @@ -5,8 +5,8 @@ namespace at::native { -using histogramdd_fn = void(*)(const Tensor&, const c10::optional&, bool, Tensor&, const TensorList&); -using histogramdd_linear_fn = void(*)(const Tensor&, const c10::optional&, bool, Tensor&, const TensorList&, bool); +using histogramdd_fn = void(*)(const Tensor&, const std::optional&, bool, Tensor&, const TensorList&); +using histogramdd_linear_fn = void(*)(const Tensor&, const std::optional&, bool, Tensor&, const TensorList&, bool); using histogram_select_outer_bin_edges_fn = void(*)(const Tensor& input, const int64_t N, std::vector &leftmost_edges, std::vector &rightmost_edges); DECLARE_DISPATCH(histogramdd_fn, histogramdd_stub); diff --git a/aten/src/ATen/native/IndexingUtils.h b/aten/src/ATen/native/IndexingUtils.h index 72b39eb326a0c..fb382ccbc6f0a 100644 --- a/aten/src/ATen/native/IndexingUtils.h +++ b/aten/src/ATen/native/IndexingUtils.h @@ -65,8 +65,8 @@ static C10_UNUSED void checkIndexTensorTypes(IOptTensorListRef indices, bool all } } -inline torch::List> toListOfOptionalTensors(ArrayRef list) { - torch::List> result; +inline torch::List> 
toListOfOptionalTensors(ArrayRef list) { + torch::List> result; result.reserve(list.size()); for (const Tensor& a : list) { result.push_back(a); @@ -74,11 +74,11 @@ inline torch::List> toListOfOptionalTensors(ArrayRef> toListOfOptionalTensors(ArrayRef list) { - torch::List> result; +inline torch::List> toListOfOptionalTensors(ArrayRef list) { + torch::List> result; result.reserve(list.size()); for (const IValue& a : list) { - result.push_back(a.isTensor() ? c10::optional(a.toTensor()) : c10::optional()); + result.push_back(a.isTensor() ? std::optional(a.toTensor()) : c10::optional()); } return result; } diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index 9322776b03f5a..8a835410458ea 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -70,7 +70,7 @@ static inline Tensor _flatten_nd_linear(const Tensor& input, const Tensor& weigh } -Tensor linear(const Tensor& input, const Tensor& weight, const c10::optional& bias_opt) { +Tensor linear(const Tensor& input, const Tensor& weight, const std::optional& bias_opt) { // _matmul_impl checks this again later, but _flatten_nd_linear does not work on scalars inputs, // so let's try to catch this here already const auto input_dim = input.dim(); @@ -121,7 +121,7 @@ Tensor linear(const Tensor& input, const Tensor& weight, const c10::optional& bias_opt, Tensor& output) { +Tensor& linear_out(const Tensor& input, const Tensor& weight, const std::optional& bias_opt, Tensor& output) { TORCH_CHECK(!input.is_mkldnn(), "linear doesn't support out for MKLDNN tensors"); // See [Note: hacky wrapper removal for optional tensor] auto bias = bias_opt.has_value() @@ -707,7 +707,7 @@ Tensor _trilinear(const Tensor& i1_, const Tensor& i2_, const Tensor& i3_, return output; } -Tensor bilinear(const Tensor& input1, const Tensor& input2, const Tensor& weight, const c10::optional& bias_opt) { +Tensor bilinear(const Tensor& input1, const Tensor& input2, const Tensor& weight, const std::optional& bias_opt) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); const Tensor& bias = *bias_maybe_owned; diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 81f461f6c95b8..3389033ac9851 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -280,7 +280,7 @@ TORCH_META_FUNC(_linalg_slogdet)(const Tensor& A) { } template -void common_checks_baddbmm_bmm(Meta& meta, const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, bool is_bmm, const c10::optional& self_baddbmm = nullopt) { +void common_checks_baddbmm_bmm(Meta& meta, const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, bool is_bmm, const std::optional& self_baddbmm = nullopt) { TORCH_CHECK(batch1.dim() == 3, "batch1 must be a 3D tensor"); TORCH_CHECK(batch2.dim() == 3, "batch2 must be a 3D tensor"); @@ -635,7 +635,7 @@ namespace { Tensor linalg_matrix_power_impl( const Tensor& self, int64_t n, - c10::optional _out) { + std::optional _out) { NoTF32Guard disable_tf32; auto out = _out.value_or(Tensor()); @@ -929,7 +929,7 @@ Tensor matrix_chain_multiplication( } // Implements torch.linalg.multi_dot -Tensor multi_dot_impl(TensorList _tensors, c10::optional _out) { +Tensor multi_dot_impl(TensorList _tensors, std::optional _out) { const size_t n = _tensors.size(); TORCH_CHECK(n >= 2, "multi_dot(): expected at least 2 tensors but got ", n); diff 
--git a/aten/src/ATen/native/Loss.cpp b/aten/src/ATen/native/Loss.cpp index e21d9f6008e8e..a170e4a868aa7 100644 --- a/aten/src/ATen/native/Loss.cpp +++ b/aten/src/ATen/native/Loss.cpp @@ -250,7 +250,7 @@ Tensor kl_div(const Tensor& input, const Tensor& target, int64_t reduction, bool return apply_loss_reduction(output, reduction); } -Tensor binary_cross_entropy_cpu(const Tensor& input, const Tensor& target, const c10::optional& weight_opt, int64_t reduction) { +Tensor binary_cross_entropy_cpu(const Tensor& input, const Tensor& target, const std::optional& weight_opt, int64_t reduction) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; @@ -260,7 +260,7 @@ Tensor binary_cross_entropy_cpu(const Tensor& input, const Tensor& target, const input, target, weight, reduction, loss); } -Tensor& binary_cross_entropy_out_cpu(const Tensor& input, const Tensor& target, const c10::optional& weight_opt, int64_t reduction, Tensor& loss) { +Tensor& binary_cross_entropy_out_cpu(const Tensor& input, const Tensor& target, const std::optional& weight_opt, int64_t reduction, Tensor& loss) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; @@ -307,7 +307,7 @@ Tensor& binary_cross_entropy_out_cpu(const Tensor& input, const Tensor& target, return loss; } -Tensor binary_cross_entropy_backward_cpu(const Tensor& grad, const Tensor& input, const Tensor& target, const c10::optional& weight_opt, int64_t reduction) { +Tensor binary_cross_entropy_backward_cpu(const Tensor& grad, const Tensor& input, const Tensor& target, const std::optional& weight_opt, int64_t reduction) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; @@ -317,7 +317,7 @@ Tensor binary_cross_entropy_backward_cpu(const Tensor& grad, const Tensor& input grad, input, target, weight, reduction, grad_input); } -Tensor& binary_cross_entropy_backward_out_cpu(const Tensor& grad, const Tensor& input, const Tensor& target, const c10::optional& weight_opt, int64_t reduction, Tensor& grad_input) { +Tensor& binary_cross_entropy_backward_out_cpu(const Tensor& grad, const Tensor& input, const Tensor& target, const std::optional& weight_opt, int64_t reduction, Tensor& grad_input) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; @@ -359,7 +359,7 @@ Tensor& binary_cross_entropy_backward_out_cpu(const Tensor& grad, const Tensor& return grad_input; } -Tensor binary_cross_entropy_with_logits(const Tensor& input, const Tensor& target, const c10::optional& weight_opt, const c10::optional& pos_weight_opt, int64_t reduction) { +Tensor binary_cross_entropy_with_logits(const Tensor& input, const Tensor& target, const std::optional& weight_opt, const c10::optional& pos_weight_opt, int64_t reduction) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; diff --git a/aten/src/ATen/native/LossMulti.h b/aten/src/ATen/native/LossMulti.h index f21269620f253..27697815ad594 100644 --- a/aten/src/ATen/native/LossMulti.h +++ 
b/aten/src/ATen/native/LossMulti.h @@ -41,7 +41,7 @@ namespace { const int64_t& ndims, const Tensor& input, const Tensor& target, - const c10::optional& weight) { + const std::optional& weight) { TORCH_CHECK( (ndims == 2 && input.size(1) != 0) || (ndims == 1 && input.size(0) != 0) || ndims == 0, "Expected non-empty vector or matrix with optional 0-dim batch size, but got: ", diff --git a/aten/src/ATen/native/LossMultiMargin.cpp b/aten/src/ATen/native/LossMultiMargin.cpp index 5b2f5ae1863b7..e7620c7900c56 100644 --- a/aten/src/ATen/native/LossMultiMargin.cpp +++ b/aten/src/ATen/native/LossMultiMargin.cpp @@ -102,7 +102,7 @@ void multi_margin_loss_out_cpu_template( const Tensor& target, int p, const Scalar& margin, - const c10::optional& weight, + const std::optional& weight, int64_t reduction) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t nframe, dim; @@ -266,7 +266,7 @@ Tensor multi_margin_loss_cpu( const Tensor& target, const Scalar& p, const Scalar& margin, - const c10::optional& weight, + const std::optional& weight, int64_t reduction) { auto output = at::empty({0}, input.options()); multi_margin_loss_out_cpu_template( @@ -278,7 +278,7 @@ Tensor& multi_margin_loss_cpu_out(const Tensor& input, const Tensor& target, const Scalar& p, const Scalar& margin, - const c10::optional& weight, + const std::optional& weight, int64_t reduction, Tensor& output) { multi_margin_loss_out_cpu_template( @@ -291,7 +291,7 @@ Tensor multi_margin_loss_cpu_backward( const Tensor& input, const Tensor& target, const Scalar& p, - const Scalar& margin, const c10::optional& weight_opt, + const Scalar& margin, const std::optional& weight_opt, int64_t reduction) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); @@ -314,7 +314,7 @@ Tensor& multi_margin_loss_cpu_backward_out(const Tensor& grad_output, const Tensor& input, const Tensor& target, const Scalar& p, - const Scalar& margin, const c10::optional& weight_opt, + const Scalar& margin, const std::optional& weight_opt, int64_t reduction, Tensor& grad_input) { // See [Note: hacky wrapper removal for optional tensor] diff --git a/aten/src/ATen/native/LossNLL.cpp b/aten/src/ATen/native/LossNLL.cpp index 0e7de9c27252a..b7809ab21dd5d 100644 --- a/aten/src/ATen/native/LossNLL.cpp +++ b/aten/src/ATen/native/LossNLL.cpp @@ -624,7 +624,7 @@ static Tensor cross_entropy_loss_label_smoothing( Tensor cross_entropy_loss_symint( const Tensor& self, const Tensor& target, - const c10::optional& weight, + const std::optional& weight, int64_t reduction, c10::SymInt ignore_index, double label_smoothing) { @@ -658,7 +658,7 @@ Tensor cross_entropy_loss_symint( return ret; } -Tensor & nll_loss_out(const Tensor & self, const Tensor & target, const c10::optional& weight_opt, int64_t reduction, int64_t ignore_index, Tensor & output) { +Tensor & nll_loss_out(const Tensor & self, const Tensor & target, const std::optional& weight_opt, int64_t reduction, int64_t ignore_index, Tensor & output) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; @@ -667,7 +667,7 @@ Tensor & nll_loss_out(const Tensor & self, const Tensor & target, const c10::opt return std::get<0>(at::nll_loss_forward_out(output, total_weight, self, target, weight, reduction, ignore_index)); } -Tensor nll_loss_symint(const Tensor & self, const Tensor & target, const c10::optional& weight_opt, 
int64_t reduction, c10::SymInt ignore_index) { +Tensor nll_loss_symint(const Tensor & self, const Tensor & target, const std::optional& weight_opt, int64_t reduction, c10::SymInt ignore_index) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; @@ -676,7 +676,7 @@ Tensor nll_loss_symint(const Tensor & self, const Tensor & target, const c10::op } // Duplicate of above code for non-symbolic ints. Kept for BC purposes and to minimize breakages. -static Tensor nll_loss(const Tensor & self, const Tensor & target, const c10::optional& weight_opt, int64_t reduction, int64_t ignore_index) { +static Tensor nll_loss(const Tensor & self, const Tensor & target, const std::optional& weight_opt, int64_t reduction, int64_t ignore_index) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; @@ -687,7 +687,7 @@ static Tensor nll_loss(const Tensor & self, const Tensor & target, const c10::op Tensor nll_loss_nd_symint( const Tensor& self, const Tensor& target, - const c10::optional& weight, + const std::optional& weight, int64_t reduction, c10::SymInt ignore_index) { if (self.dim() < 1) { diff --git a/aten/src/ATen/native/LossNLL2d.cpp b/aten/src/ATen/native/LossNLL2d.cpp index 94c667dcb1b2b..6f27884b8f24b 100644 --- a/aten/src/ATen/native/LossNLL2d.cpp +++ b/aten/src/ATen/native/LossNLL2d.cpp @@ -405,7 +405,7 @@ void nll_loss2d_backward_out_cpu_template( } // namespace std::tuple nll_loss2d_forward_out_cpu(const Tensor& self, - const Tensor& target, const c10::optional& weight_opt, + const Tensor& target, const std::optional& weight_opt, int64_t reduction, int64_t ignore_index, Tensor& output, @@ -421,7 +421,7 @@ std::tuple nll_loss2d_forward_out_cpu(const Tensor& self, std::tuple nll_loss2d_forward_cpu( const Tensor& self, - const Tensor& target, const c10::optional& weight_opt, + const Tensor& target, const std::optional& weight_opt, int64_t reduction, int64_t ignore_index) { // See [Note: hacky wrapper removal for optional tensor] @@ -437,7 +437,7 @@ std::tuple nll_loss2d_forward_cpu( Tensor& nll_loss2d_backward_out_cpu(const Tensor& grad_output, const Tensor& self, - const Tensor& target, const c10::optional& weight_opt, + const Tensor& target, const std::optional& weight_opt, int64_t reduction, int64_t ignore_index, const Tensor& total_weight, @@ -461,7 +461,7 @@ Tensor& nll_loss2d_backward_out_cpu(const Tensor& grad_output, Tensor nll_loss2d_backward_cpu( const Tensor& grad_output, const Tensor& self, - const Tensor& target, const c10::optional& weight_opt, + const Tensor& target, const std::optional& weight_opt, int64_t reduction, int64_t ignore_index, const Tensor& total_weight) { @@ -482,7 +482,7 @@ Tensor nll_loss2d_backward_cpu( return grad_input; } -Tensor & nll_loss2d_out(const Tensor & self, const Tensor & target, const c10::optional& weight_opt, int64_t reduction, int64_t ignore_index, Tensor & output) { +Tensor & nll_loss2d_out(const Tensor & self, const Tensor & target, const std::optional& weight_opt, int64_t reduction, int64_t ignore_index, Tensor & output) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; @@ -491,7 +491,7 @@ Tensor & nll_loss2d_out(const Tensor & self, const Tensor & target, 
const c10::o return std::get<0>(at::nll_loss2d_forward_out(output, total_weight, self, target, weight, reduction, ignore_index)); } -Tensor nll_loss2d_symint(const Tensor & self, const Tensor & target, const c10::optional& weight_opt, int64_t reduction, c10::SymInt ignore_index) { +Tensor nll_loss2d_symint(const Tensor & self, const Tensor & target, const std::optional& weight_opt, int64_t reduction, c10::SymInt ignore_index) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; @@ -500,7 +500,7 @@ Tensor nll_loss2d_symint(const Tensor & self, const Tensor & target, const c10:: } // Duplicate of above code for non-symbolic ints. Kept for BC purposes and to minimize breakages. -static Tensor nll_loss2d(const Tensor & self, const Tensor & target, const c10::optional& weight_opt, int64_t reduction, int64_t ignore_index) { +static Tensor nll_loss2d(const Tensor & self, const Tensor & target, const std::optional& weight_opt, int64_t reduction, int64_t ignore_index) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; diff --git a/aten/src/ATen/native/MathBitsFallback.h b/aten/src/ATen/native/MathBitsFallback.h index 584d07aeca358..de2296634e045 100644 --- a/aten/src/ATen/native/MathBitsFallback.h +++ b/aten/src/ATen/native/MathBitsFallback.h @@ -56,7 +56,7 @@ struct MathOpFallback { const auto num_arguments = arguments.size(); const auto stack_start = stack->size() - num_arguments; - c10::optional is_write; + std::optional is_write; for (const auto i : c10::irange(num_arguments)) { // Three possible states: // 1. 
alias_info has no value --> out-of-place operation diff --git a/aten/src/ATen/native/Memory.cpp b/aten/src/ATen/native/Memory.cpp index 0d07054f72eda..fefe9ab5a8d2b 100644 --- a/aten/src/ATen/native/Memory.cpp +++ b/aten/src/ATen/native/Memory.cpp @@ -23,11 +23,11 @@ int64_t _debug_has_internal_overlap(const Tensor& self) { // pinned memory, always return false", but this makes life a little easier when // you haven't loaded the backend extension at all (which can happen, e.g., on a // CPU build of PyTorch and you try to check if something is CUDA pinned) -bool is_pinned_default(const Tensor& self, c10::optional device) { +bool is_pinned_default(const Tensor& self, std::optional device) { return false; } -Tensor pin_memory(const Tensor& self, c10::optional device) { +Tensor pin_memory(const Tensor& self, std::optional device) { // Kind of mad that I have to do two dynamic dispatches here, pretty // annoying if (self.is_pinned(device)) { diff --git a/aten/src/ATen/native/MetaTensor.cpp b/aten/src/ATen/native/MetaTensor.cpp index 972d13dc8fb51..518466df84ce4 100644 --- a/aten/src/ATen/native/MetaTensor.cpp +++ b/aten/src/ATen/native/MetaTensor.cpp @@ -13,11 +13,11 @@ namespace at::native { Tensor empty_meta_symint( SymIntArrayRef size, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt, - c10::optional memory_format_opt + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt, + std::optional memory_format_opt ) { auto opt_size = asIntArrayRefSlowOpt(size); @@ -32,10 +32,10 @@ Tensor empty_meta_symint( static Tensor empty_strided_meta( IntArrayRef size, IntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt ) { return empty_strided_meta_symint(c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), dtype_opt, layout_opt, device_opt, pin_memory_opt); } @@ -43,10 +43,10 @@ static Tensor empty_strided_meta( Tensor empty_strided_meta_symint( SymIntArrayRef size, SymIntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt ) { return at::detail::empty_strided_symint_meta( size, stride, dtype_opt, layout_opt, device_opt, pin_memory_opt); diff --git a/aten/src/ATen/native/NNPACK.cpp b/aten/src/ATen/native/NNPACK.cpp index e43bfdd627965..89b2f3ffc493b 100644 --- a/aten/src/ATen/native/NNPACK.cpp +++ b/aten/src/ATen/native/NNPACK.cpp @@ -22,7 +22,7 @@ namespace at::native { at::Tensor _nnpack_spatial_convolution( const Tensor& input, - const Tensor& weight, const c10::optional& bias_opt, + const Tensor& weight, const std::optional& bias_opt, const IntArrayRef padding, const IntArrayRef stride) { throw std::runtime_error( @@ -137,7 +137,7 @@ static thread_local Workspace workspace; Tensor _nnpack_spatial_convolution( const Tensor& input, - const Tensor& weight, const c10::optional& bias_opt, + const Tensor& weight, const std::optional& bias_opt, const IntArrayRef padding, const IntArrayRef stride) { // See [Note: hacky wrapper removal for optional tensor] diff --git a/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp b/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp index 
624e820c7ba66..f82354ace3b82 100644 --- a/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp +++ b/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp @@ -819,7 +819,7 @@ void slow_conv_transpose3d_acc_grad_parameters_cpu( Tensor& slow_conv_transpose3d_out_cpu(const Tensor& input, const Tensor& weight, - IntArrayRef kernel_size, const c10::optional& bias_opt, + IntArrayRef kernel_size, const std::optional& bias_opt, IntArrayRef stride, IntArrayRef padding, IntArrayRef output_padding, @@ -846,7 +846,7 @@ Tensor& slow_conv_transpose3d_out_cpu(const Tensor& input, Tensor slow_conv_transpose3d_cpu( const Tensor& input, const Tensor& weight, - IntArrayRef kernel_size, const c10::optional& bias_opt, + IntArrayRef kernel_size, const std::optional& bias_opt, IntArrayRef stride, IntArrayRef padding, IntArrayRef output_padding, diff --git a/aten/src/ATen/native/NaiveDilatedConvolution.cpp b/aten/src/ATen/native/NaiveDilatedConvolution.cpp index 571eb16fc50e0..acf040259b135 100644 --- a/aten/src/ATen/native/NaiveDilatedConvolution.cpp +++ b/aten/src/ATen/native/NaiveDilatedConvolution.cpp @@ -524,7 +524,7 @@ void slow_conv_dilated_all_cpu_template( Tensor slow_conv_dilated2d_cpu( const Tensor& input, const Tensor& weight, - IntArrayRef kernel_size, const c10::optional& bias_opt, + IntArrayRef kernel_size, const std::optional& bias_opt, IntArrayRef stride_size, IntArrayRef pad_size, IntArrayRef dilation_size) { @@ -579,7 +579,7 @@ Tensor slow_conv_dilated2d_cpu( Tensor slow_conv_dilated3d_cpu( const Tensor& input, const Tensor& weight, - IntArrayRef kernel_size, const c10::optional& bias_opt, + IntArrayRef kernel_size, const std::optional& bias_opt, IntArrayRef stride_size, IntArrayRef pad_size, IntArrayRef dilation_size) { diff --git a/aten/src/ATen/native/NamedTensor.cpp b/aten/src/ATen/native/NamedTensor.cpp index f0330481c31a9..709d63bae6368 100644 --- a/aten/src/ATen/native/NamedTensor.cpp +++ b/aten/src/ATen/native/NamedTensor.cpp @@ -387,13 +387,13 @@ Tensor scatter_add(const Tensor& self, Dimname dim, const Tensor& index, const T static Tensor& scatter_add_(Tensor& self, Dimname dim, const Tensor& index, const Tensor& source) { reportNYIDimnameOverload("scatter_add"); } -std::tuple sort_out(const Tensor& self, c10::optional stable, Dimname dim, bool keepdim, Tensor& values, Tensor& indices) { +std::tuple sort_out(const Tensor& self, std::optional stable, Dimname dim, bool keepdim, Tensor& values, Tensor& indices) { reportNYIDimnameOverload("sort"); } std::tuple sort_out(const Tensor& self, Dimname dim, bool keepdim, Tensor& values, Tensor& indices) { reportNYIDimnameOverload("sort"); } -std::tuple sort(const Tensor& self, c10::optional stable, Dimname dim, bool keepdim) { +std::tuple sort(const Tensor& self, std::optional stable, Dimname dim, bool keepdim) { reportNYIDimnameOverload("sort"); } std::tuple sort(const Tensor& self, Dimname dim, bool keepdim) { diff --git a/aten/src/ATen/native/NonSymbolicBC.h b/aten/src/ATen/native/NonSymbolicBC.h index 589822a4ee013..037156ac23b15 100644 --- a/aten/src/ATen/native/NonSymbolicBC.h +++ b/aten/src/ATen/native/NonSymbolicBC.h @@ -9,15 +9,15 @@ namespace at::native { // In those cases, we will duplicate the signature here with non-symbolic ints, and also duplicate the C++ implementation. 
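[Editor's aside, not part of the patch.] The NonSymbolicBC.h declarations that follow, like most hunks in this patch, apply one mechanical substitution: parameters spelled c10::optional become std::optional, while defaults such as c10::nullopt are left untouched. Since c10::optional is, at this stage of the migration, an alias for std::optional, the change is a spelling cleanup rather than a behavioral one. Below is a minimal, self-contained sketch of the calling convention involved; the function and parameter names (apply_update, grad_scale, found_inf) are invented for illustration, and double stands in for the Tensor/ScalarType template arguments that the real signatures carry.

#include <iostream>
#include <optional>

// Hypothetical stand-in for an ATen-style kernel taking optional inputs.
// The real code takes std::optional<Tensor> (formerly c10::optional<Tensor>);
// double is used here so the example compiles on its own.
double apply_update(double param,
                    const std::optional<double>& grad_scale = std::nullopt,
                    const std::optional<double>& found_inf = std::nullopt) {
  // Mirrors the grad_scale / found_inf handling in the fused-optimizer hunks
  // earlier in the patch: dereference an optional only after checking it.
  if (found_inf.has_value() && *found_inf == 1.0) {
    return param;  // skip the step entirely when inf/nan was detected
  }
  return param * grad_scale.value_or(1.0);
}

int main() {
  std::cout << apply_update(2.0) << "\n";            // 2  (no scaling)
  std::cout << apply_update(2.0, 0.5) << "\n";       // 1  (scaled)
  std::cout << apply_update(2.0, 0.5, 1.0) << "\n";  // 2  (inf found, no-op)
  return 0;
}

Because the alias makes c10::optional and std::optional interchangeable, call sites that still spell the old name keep compiling, which is what allows this rename to land file by file.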
TORCH_API at::Tensor reshape(const at::Tensor& self, at::IntArrayRef proposed_shape); TORCH_API at::Tensor narrow(const at::Tensor& self, int64_t dim, int64_t start, int64_t length); -TORCH_API at::Tensor _sparse_coo_tensor_unsafe(const at::Tensor & indices, const at::Tensor & values, at::IntArrayRef size, c10::optional dtype=c10::nullopt, c10::optional layout=c10::nullopt, c10::optional device=c10::nullopt, c10::optional pin_memory=c10::nullopt, c10::optional is_coalesced=c10::nullopt); -TORCH_API at::Tensor nll_loss(const at::Tensor & self, const at::Tensor & target, const c10::optional& weight_opt, int64_t reduction, int64_t ignore_index); -TORCH_API at::Tensor nll_loss2d(const at::Tensor & self, const at::Tensor & target, const c10::optional& weight_opt, int64_t reduction, int64_t ignore_index); +TORCH_API at::Tensor _sparse_coo_tensor_unsafe(const at::Tensor & indices, const at::Tensor & values, at::IntArrayRef size, std::optional dtype=c10::nullopt, c10::optional layout=c10::nullopt, c10::optional device=c10::nullopt, c10::optional pin_memory=c10::nullopt, c10::optional is_coalesced=c10::nullopt); +TORCH_API at::Tensor nll_loss(const at::Tensor & self, const at::Tensor & target, const std::optional& weight_opt, int64_t reduction, int64_t ignore_index); +TORCH_API at::Tensor nll_loss2d(const at::Tensor & self, const at::Tensor & target, const std::optional& weight_opt, int64_t reduction, int64_t ignore_index); // The below ops don't get a duplicated C++ implementation. // They are backward ops, which make them very unlikely to be called directly // by external code (at::native::trace_backward). // They get their own declaration for BC purposes however. -TORCH_API at::Tensor _embedding_bag_backward(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, bool sparse, const c10::optional & per_sample_weights, int64_t padding_idx=-1); -TORCH_API at::Tensor _embedding_bag_sparse_backward(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, const at::Tensor & bag_size, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const c10::optional & per_sample_weights, int64_t padding_idx=-1); +TORCH_API at::Tensor _embedding_bag_backward(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, bool sparse, const std::optional & per_sample_weights, int64_t padding_idx=-1); +TORCH_API at::Tensor _embedding_bag_sparse_backward(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, const at::Tensor & bag_size, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const std::optional & per_sample_weights, int64_t padding_idx=-1); TORCH_API at::Tensor value_selecting_reduction_backward(const at::Tensor & grad, int64_t dim, const at::Tensor & indices, at::IntArrayRef sizes, bool keepdim); TORCH_API at::Tensor trace_backward(const at::Tensor & grad, at::IntArrayRef sizes); TORCH_API at::Tensor index_select_backward(const at::Tensor & grad, at::IntArrayRef self_sizes, int64_t dim, const at::Tensor & index); diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index 
93d2ce11d934f..ce1b23c2bdf6f 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -519,6 +519,7 @@ BatchNormBackend _select_batch_norm_backend( && weight.defined() && bias.defined() && ((running_mean.defined() && running_var.defined()) || (!running_mean.defined() && !running_var.defined() && training)) + && (input.dim() >= 3) && detail::getCUDAHooks().compiledWithMIOpen() && cudnn_enabled && input.suggest_memory_format() != MemoryFormat::ChannelsLast @@ -537,7 +538,7 @@ BatchNormBackend _select_batch_norm_backend( // XXX: The indices of backends need to be kept synchronized between this function and its _backward. // TODO: remove cudnn_enabled arg std::tuple _batch_norm_impl_index( - const Tensor& input, const c10::optional& weight_opt /* optional */, const c10::optional& bias_opt /* optional */, const c10::optional& running_mean_opt /* optional */, const c10::optional& running_var_opt /* optional */, + const Tensor& input, const std::optional& weight_opt /* optional */, const c10::optional& bias_opt /* optional */, const c10::optional& running_mean_opt /* optional */, const c10::optional& running_var_opt /* optional */, bool training, double momentum, double eps, bool cudnn_enabled) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); @@ -619,7 +620,7 @@ std::tuple _batch_norm_impl_index( std::tuple _batch_norm_impl_index_backward( int64_t impl_index, - const Tensor& input, const Tensor& grad_output, const c10::optional& weight_opt /* optional */, const c10::optional& running_mean_opt /* optional */, const c10::optional& running_var_opt /* optional */, const c10::optional& save_mean_opt /* optional */, const c10::optional& save_var_transform_opt /* optional */, + const Tensor& input, const Tensor& grad_output, const std::optional& weight_opt /* optional */, const c10::optional& running_mean_opt /* optional */, const c10::optional& running_var_opt /* optional */, const c10::optional& save_mean_opt /* optional */, const c10::optional& save_var_transform_opt /* optional */, bool train, double epsilon, std::array output_mask, const Tensor &reservedSpace) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); @@ -665,8 +666,8 @@ std::tuple _batch_norm_impl_index_backward( // TODO: remove cudnn_enabled arg Tensor batch_norm( - const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, - const c10::optional& running_mean_opt, const c10::optional& running_var_opt, + const Tensor& input, const std::optional& weight_opt, const c10::optional& bias_opt, + const std::optional& running_mean_opt, const c10::optional& running_var_opt, bool training, double momentum, double eps, bool cudnn_enabled) { const Tensor& weight = c10::value_or_else(weight_opt, [] {return Tensor();}); const Tensor& bias = c10::value_or_else(bias_opt, [] {return Tensor();}); @@ -701,7 +702,7 @@ Tensor batch_norm( } Tensor instance_norm( - const Tensor& input, const c10::optional& weight_opt /* optional */, const c10::optional& bias_opt /* optional */, const c10::optional& running_mean_opt /* optional */, const c10::optional& running_var_opt /* optional */, + const Tensor& input, const std::optional& weight_opt /* optional */, const c10::optional& bias_opt /* optional */, const c10::optional& running_mean_opt /* optional */, const c10::optional& running_var_opt /* optional */, bool 
use_input_stats, double momentum, double eps, bool cudnn_enabled) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); @@ -739,7 +740,7 @@ Tensor instance_norm( } std::tuple batch_norm_update_stats_cpu( - const Tensor& self, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, double momentum) { + const Tensor& self, const std::optional& running_mean_opt, const c10::optional& running_var_opt, double momentum) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned running_mean_maybe_owned = at::borrow_from_optional_tensor(running_mean_opt); const Tensor& running_mean = *running_mean_maybe_owned; @@ -757,7 +758,7 @@ std::tuple batch_norm_update_stats_cpu( }); } -std::tuple batch_norm_cpu_out(const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, +std::tuple batch_norm_cpu_out(const Tensor& self, const std::optional& weight_opt, const c10::optional& bias_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, bool train, double momentum, double eps, Tensor& out, Tensor& save_mean, Tensor& save_var) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); @@ -800,7 +801,7 @@ std::tuple batch_norm_cpu_out(const Tensor& self, con return std::tuple(out, save_mean, save_var); } -std::tuple batch_norm_cpu(const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, +std::tuple batch_norm_cpu(const Tensor& self, const std::optional& weight_opt, const c10::optional& bias_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, bool train, double momentum, double eps) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); @@ -850,7 +851,7 @@ std::tuple batch_norm_cpu(const Tensor& self, const c10: } std::tuple _batch_norm_with_update_cpu( - const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, + const Tensor& input, const std::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, double momentum, double eps) { Tensor output, save_mean, save_var; std::tie(output, save_mean, save_var) = @@ -860,7 +861,7 @@ std::tuple _batch_norm_with_update_cpu( } std::tuple _batch_norm_with_update_cpu_out( - const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, + const Tensor& input, const std::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, double momentum, double eps, Tensor& out, Tensor& save_mean, Tensor& save_var, Tensor& reserve) { std::tie(out, save_mean, save_var) = @@ -870,8 +871,8 @@ std::tuple _batch_norm_with_update_cpu_out( std::tuple _batch_norm_no_update( - const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, - const c10::optional& running_mean_opt, const c10::optional& running_var_opt, + const Tensor& input, const std::optional& weight_opt, const c10::optional& bias_opt, + const std::optional& running_mean_opt, const c10::optional& running_var_opt, double momentum, double eps) { const Tensor& running_mean = c10::value_or_else(running_mean_opt, [] {return Tensor();}); const 
Tensor& running_var = c10::value_or_else(running_var_opt, [] {return Tensor();}); @@ -883,41 +884,41 @@ std::tuple _batch_norm_no_update( } std::tuple _batch_norm_legit_cpu( - const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, + const Tensor& self, const std::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, bool train, double momentum, double eps) { return batch_norm_cpu(self, weight_opt, bias_opt, running_mean, running_var, train, momentum, eps); } std::tuple _batch_norm_legit_no_stats_cpu( - const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, + const Tensor& self, const std::optional& weight_opt, const c10::optional& bias_opt, bool train, double momentum, double eps) { return batch_norm_cpu(self, weight_opt, bias_opt, Tensor(), Tensor(), train, momentum, eps); } std::tuple _batch_norm_legit_no_training( - const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, + const Tensor& self, const std::optional& weight_opt, const c10::optional& bias_opt, const Tensor& running_mean, const Tensor& running_var, double momentum, double eps) { return at::_native_batch_norm_legit(self, weight_opt, bias_opt, const_cast(running_mean), const_cast(running_var), /*train=*/false, momentum, eps); } -std::tuple _batch_norm_legit_cpu_out(const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, bool train, double momentum, double eps, Tensor& out, Tensor& save_mean, Tensor& save_var) { +std::tuple _batch_norm_legit_cpu_out(const Tensor& self, const std::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, bool train, double momentum, double eps, Tensor& out, Tensor& save_mean, Tensor& save_var) { return batch_norm_cpu_out(self, weight_opt, bias_opt, running_mean, running_var, train, momentum, eps, out, save_mean, save_var); } -std::tuple _batch_norm_legit_no_stats_cpu_out(const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, bool train, double momentum, double eps, Tensor& out, Tensor& save_mean, Tensor& save_var) { +std::tuple _batch_norm_legit_no_stats_cpu_out(const Tensor& self, const std::optional& weight_opt, const c10::optional& bias_opt, bool train, double momentum, double eps, Tensor& out, Tensor& save_mean, Tensor& save_var) { return batch_norm_cpu_out(self, weight_opt, bias_opt, Tensor(), Tensor(), train, momentum, eps, out, save_mean, save_var); } std::tuple _new_batch_norm_backward_cpu( const Tensor& grad_output, const Tensor& input, const Tensor& weight, - const c10::optional& running_mean_opt, const c10::optional& running_var_opt, - const c10::optional& save_mean_opt, const c10::optional& save_var_opt, + const std::optional& running_mean_opt, const c10::optional& running_var_opt, + const std::optional& save_mean_opt, const c10::optional& save_var_opt, bool update, double eps, std::array grad_input_mask, const Tensor& reserve) { return batch_norm_backward_cpu(grad_output, input, weight, running_mean_opt, running_var_opt, save_mean_opt, save_var_opt, update, eps, grad_input_mask); } -std::tuple batch_norm_backward_cpu(const Tensor& grad_out, const Tensor& self, const c10::optional& weight_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, const c10::optional& save_mean_opt, const c10::optional& save_invstd_opt, +std::tuple batch_norm_backward_cpu(const Tensor& grad_out, const Tensor& self, 
const std::optional& weight_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, const c10::optional& save_mean_opt, const c10::optional& save_invstd_opt, bool train, double eps, std::array grad_input_mask) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); diff --git a/aten/src/ATen/native/PadNd.cpp b/aten/src/ATen/native/PadNd.cpp index aecab68c2be0f..e7172fe5a2c12 100644 --- a/aten/src/ATen/native/PadNd.cpp +++ b/aten/src/ATen/native/PadNd.cpp @@ -188,7 +188,7 @@ Tensor _pad_circular_symint(const Tensor &self, c10::SymIntArrayRef padding) { return out; } -Tensor _pad_enum_symint(const Tensor &self, c10::SymIntArrayRef pad, int64_t mode_int, c10::optional value) { +Tensor _pad_enum_symint(const Tensor &self, c10::SymIntArrayRef pad, int64_t mode_int, std::optional value) { const auto input_dim = self.dim(); TORCH_CHECK(pad.size() % 2 == 0, "Padding length must be divisible by 2"); TORCH_CHECK(static_cast(pad.size()) <= input_dim * 2, @@ -228,7 +228,7 @@ Tensor _pad_enum_symint(const Tensor &self, c10::SymIntArrayRef pad, int64_t mod "Only 2D, 3D, 4D, 5D padding with non-constant padding are supported for now"); } -Tensor pad_symint(const Tensor &self, c10::SymIntArrayRef pad, c10::string_view mode, c10::optional value) { +Tensor pad_symint(const Tensor &self, c10::SymIntArrayRef pad, c10::string_view mode, std::optional value) { const auto mode_enum = [&] { if (mode == "reflect") { return at::padding_mode::reflect; diff --git a/aten/src/ATen/native/Pool.h b/aten/src/ATen/native/Pool.h index 07940729fda8c..df73299ea2308 100644 --- a/aten/src/ATen/native/Pool.h +++ b/aten/src/ATen/native/Pool.h @@ -19,9 +19,9 @@ DECLARE_DISPATCH(max_pool2d_backward_fn, max_pool2d_backward_kernel); // averge pooling has same signature for forward and backward using avg_pool2d_fn = void(*)(const Tensor& output, const Tensor& input, int64_t kW, int64_t kH, - int64_t dW, int64_t dH, int64_t padW, int64_t padH, bool count_include_pad, c10::optional divisor_override); + int64_t dW, int64_t dH, int64_t padW, int64_t padH, bool count_include_pad, std::optional divisor_override); using avg_pool2d_backward_fn = void(*)(const Tensor& output, const Tensor& input, int kW, int kH, - int dW, int dH, int padW, int padH, bool count_include_pad, c10::optional divisor_override); + int dW, int dH, int padW, int padH, bool count_include_pad, std::optional divisor_override); DECLARE_DISPATCH(avg_pool2d_fn, avg_pool2d_kernel); DECLARE_DISPATCH(avg_pool2d_backward_fn, avg_pool2d_backward_kernel); @@ -30,11 +30,11 @@ DECLARE_DISPATCH(avg_pool2d_backward_fn, avg_pool2d_backward_kernel); using avg_pool3d_fn = void(*)(const Tensor& output, const Tensor& input, int64_t kW, int64_t kH, int64_t kD, int64_t dW, int64_t dH, int64_t dD, int64_t padW, int64_t padH, int64_t padD, bool count_include_pad, - c10::optional divisor_override); + std::optional divisor_override); using avg_pool3d_backward_fn = void(*)(const Tensor& output, const Tensor& input, int kW, int kH, int kD, int dW, int dH, int dD, int padW, int padH, int padD, bool count_include_pad, - c10::optional divisor_override); + std::optional divisor_override); DECLARE_DISPATCH(avg_pool3d_fn, avg_pool3d_kernel); DECLARE_DISPATCH(avg_pool3d_backward_fn, avg_pool3d_backward_kernel); diff --git a/aten/src/ATen/native/RNN.cpp b/aten/src/ATen/native/RNN.cpp index 97ce09ac8e51d..fccd3420d3f67 100644 --- a/aten/src/ATen/native/RNN.cpp +++ b/aten/src/ATen/native/RNN.cpp @@ 
-1163,7 +1163,7 @@ bool _use_cudnn_rnn_flatten_weight() { // NB: This a (composite) wrapper for _thnn_fused_lstm_cell_backward_impl. // It duplicates the outputs of this function so the non-composite version doesn't have to. // The point is so that we avoid triggering TensorImpl use count asserts in debug mode -std::tuple _thnn_fused_lstm_cell_backward( const c10::optional& grad_hy_opt, const c10::optional& grad_cy_opt, +std::tuple _thnn_fused_lstm_cell_backward( const std::optional& grad_hy_opt, const c10::optional& grad_cy_opt, const Tensor& cx, const Tensor& cy, const Tensor& workspace, bool has_bias) { TORCH_INTERNAL_ASSERT(!GradMode::is_enabled()); @@ -1523,7 +1523,7 @@ std::tuple lstm( std::tuple lstm_cell( const Tensor& input, TensorList hx, - const Tensor& w_ih, const Tensor& w_hh, const c10::optional& b_ih_opt, const c10::optional& b_hh_opt) { + const Tensor& w_ih, const Tensor& w_hh, const std::optional& b_ih_opt, const c10::optional& b_hh_opt) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned b_ih_maybe_owned = at::borrow_from_optional_tensor(b_ih_opt); const Tensor& b_ih = *b_ih_maybe_owned; @@ -1539,9 +1539,9 @@ std::tuple lstm_cell( } std::tuple -_thnn_differentiable_lstm_cell_backward( const c10::optional& grad_hy_opt, const c10::optional& grad_cy_opt, +_thnn_differentiable_lstm_cell_backward( const std::optional& grad_hy_opt, const c10::optional& grad_cy_opt, const Tensor& input_gates, - const Tensor& hidden_gates, const c10::optional& input_bias_opt, const c10::optional& hidden_bias_opt, + const Tensor& hidden_gates, const std::optional& input_bias_opt, const c10::optional& hidden_bias_opt, const Tensor& cx, const Tensor& cy) { // See [Note: hacky wrapper removal for optional tensor] @@ -1597,7 +1597,7 @@ std::tuple _thnn_differentiable_gru_cell const Tensor& grad_hy, const Tensor& input_gates, const Tensor& hidden_gates, - const Tensor& hx, const c10::optional& input_bias_opt, const c10::optional& hidden_bias_opt){ + const Tensor& hx, const std::optional& input_bias_opt, const c10::optional& hidden_bias_opt){ // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned input_bias_maybe_owned = at::borrow_from_optional_tensor(input_bias_opt); const Tensor& input_bias = *input_bias_maybe_owned; @@ -1637,7 +1637,7 @@ std::tuple _thnn_differentiable_gru_cell Tensor gru_cell( const Tensor& input, const Tensor& hx, - const Tensor& w_ih, const Tensor& w_hh, const c10::optional& b_ih_opt, const c10::optional& b_hh_opt) { + const Tensor& w_ih, const Tensor& w_hh, const std::optional& b_ih_opt, const c10::optional& b_hh_opt) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned b_ih_maybe_owned = at::borrow_from_optional_tensor(b_ih_opt); const Tensor& b_ih = *b_ih_maybe_owned; @@ -1651,7 +1651,7 @@ Tensor gru_cell( Tensor rnn_tanh_cell( const Tensor& input, const Tensor& hx, - const Tensor& w_ih, const Tensor& w_hh, const c10::optional& b_ih_opt, const c10::optional& b_hh_opt) { + const Tensor& w_ih, const Tensor& w_hh, const std::optional& b_ih_opt, const c10::optional& b_hh_opt) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned b_ih_maybe_owned = at::borrow_from_optional_tensor(b_ih_opt); const Tensor& b_ih = *b_ih_maybe_owned; @@ -1665,7 +1665,7 @@ Tensor rnn_tanh_cell( Tensor rnn_relu_cell( const Tensor& input, const Tensor& hx, - const Tensor& w_ih, const Tensor& w_hh, const c10::optional& b_ih_opt, const c10::optional& b_hh_opt) { + const Tensor& w_ih, const Tensor& w_hh, const std::optional& 
b_ih_opt, const c10::optional& b_hh_opt) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned b_ih_maybe_owned = at::borrow_from_optional_tensor(b_ih_opt); const Tensor& b_ih = *b_ih_maybe_owned; @@ -1693,7 +1693,7 @@ static std::tuple quantized_lstm_input( bool train, bool bidirectional, bool batch_first, - c10::optional dtype, + std::optional dtype, bool use_dynamic) { auto hx = hx_.vec(); std::vector params; @@ -1747,7 +1747,7 @@ static std::tuple quantized_lstm_input_legacy( bool train, bool bidirectional, bool batch_first, - c10::optional dtype, + std::optional dtype, bool use_dynamic) { TORCH_CHECK( false, @@ -1766,7 +1766,7 @@ static std::tuple quantized_lstm_data( double dropout_p, bool train, bool bidirectional, - c10::optional dtype, + std::optional dtype, bool use_dynamic) { auto hx = hx_.vec(); std::vector params; @@ -1813,7 +1813,7 @@ static std::tuple quantized_lstm_data_legacy( double dropout_p, bool train, bool bidirectional, - c10::optional dtype, + std::optional dtype, bool use_dynamic) { TORCH_CHECK( false, diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp index d29b177c13960..96f6d6f90c87d 100644 --- a/aten/src/ATen/native/ReduceOps.cpp +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -148,7 +148,7 @@ static ScalarType infer_dtype_from_optional( } } -static IntArrayRef optional_to_arrayref(const c10::optional& opt) { +static IntArrayRef optional_to_arrayref(const std::optional& opt) { return opt.has_value() ? opt.value() : IntArrayRef{}; } @@ -217,7 +217,7 @@ TORCH_META_FUNC(any)(const Tensor& self) { static void check_argmax_argmin( const char* name, const Tensor& self, - const c10::optional& dim) { + const std::optional& dim) { if (dim.has_value()) { auto dim_ = maybe_wrap_dim(dim.value(), self.dim()); native::zero_numel_check_dims(self, dim_, name); @@ -229,13 +229,13 @@ static void check_argmax_argmin( } TORCH_META_FUNC(argmax) -(const Tensor& self, c10::optional dim, bool keepdim) { +(const Tensor& self, std::optional dim, bool keepdim) { check_argmax_argmin("argmax()", self, dim); resize_reduction(*this, self, optional_to_arrayref(dim), keepdim, kLong); } TORCH_META_FUNC(argmin) -(const Tensor& self, c10::optional dim, bool keepdim) { +(const Tensor& self, std::optional dim, bool keepdim) { check_argmax_argmin("argmin()", self, dim); resize_reduction(*this, self, optional_to_arrayref(dim), keepdim, kLong); } @@ -245,7 +245,7 @@ static void meta_func_cum_ops( const char* name, const Tensor& self, int64_t dim, - c10::optional dtype) { + std::optional dtype) { // Checking whether 'dim' is valid. 
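For readers of the patch, the change in these hunks is purely a spelling one: by this point in the tree c10::optional is an alias of std::optional, so swapping `c10::optional<T>` for `std::optional<T>` in reduction signatures (argmax/argmin, cumsum/cumprod, etc.) does not alter behaviour. The sketch below is not ATen code; the template arguments (`<int64_t>` here) are inferred, since this rendering of the diff dropped the angle-bracket contents. It only illustrates the optional-dim convention that helpers such as optional_to_arrayref and check_argmax_argmin rely on.

#include <cstdint>
#include <optional>
#include <vector>

// Minimal sketch (assumed names, not ATen): an absent dim means "reduce over
// every dimension"; a present dim selects exactly one, after validation.
std::vector<int64_t> dims_to_reduce(std::optional<int64_t> dim, int64_t ndim) {
  if (dim.has_value()) {
    return {*dim};  // reduce only the explicitly requested dimension
  }
  std::vector<int64_t> all(static_cast<size_t>(ndim));
  for (int64_t i = 0; i < ndim; ++i) {
    all[static_cast<size_t>(i)] = i;  // no dim given: reduce everything
  }
  return all;
}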
maybe_wrap_dim(dim, self.dim()); @@ -264,12 +264,12 @@ static void meta_func_cum_ops( } TORCH_META_FUNC(cumsum) -(const Tensor& self, int64_t dim, c10::optional dtype) { +(const Tensor& self, int64_t dim, std::optional dtype) { meta_func_cum_ops(*this, "cumsum", self, dim, dtype); } TORCH_META_FUNC(cumprod) -(const Tensor& self, int64_t dim, c10::optional dtype) { +(const Tensor& self, int64_t dim, std::optional dtype) { meta_func_cum_ops(*this, "cumprod", self, dim, dtype); } @@ -283,7 +283,7 @@ TORCH_META_FUNC2(prod, dim_int) (const Tensor& self, int64_t dim, bool keepdim, - c10::optional dtype) { + std::optional dtype) { auto out_dtype = infer_dtype_from_optional(self, dtype, maybe_get_output()); resize_reduction(*this, self, dim, keepdim, out_dtype); } @@ -315,7 +315,7 @@ TORCH_META_FUNC2(mean, dim) static ScalarType get_result_or_self_value_dtype( const Tensor& self, const Tensor& result, - const c10::optional& dtype) { + const std::optional& dtype) { if (result.defined()) { return result.scalar_type(); } else { @@ -350,7 +350,7 @@ TORCH_META_FUNC2(norm, ScalarOpt_dim_dtype) } TORCH_META_FUNC(aminmax) -(const Tensor& self, c10::optional dim_opt, bool keepdim) { +(const Tensor& self, std::optional dim_opt, bool keepdim) { DimVector shape; if (dim_opt.has_value()) { auto dim = maybe_wrap_dim(dim_opt.value(), self.ndimension()); @@ -407,7 +407,7 @@ DEFINE_DISPATCH(aminmax_allreduce_stub); TORCH_IMPL_FUNC(aminmax_out) (const Tensor& self, - c10::optional dim_opt, + std::optional dim_opt, bool keepdim, const Tensor& min, const Tensor& max) { @@ -491,7 +491,7 @@ void impl_func_cum_ops( TORCH_IMPL_FUNC(cumsum_out) (const Tensor& self, int64_t dim, - c10::optional dtype, + std::optional dtype, const Tensor& result) { impl_func_cum_ops(self, dim, result, cumsum_stub); } @@ -499,7 +499,7 @@ TORCH_IMPL_FUNC(cumsum_out) TORCH_IMPL_FUNC(cumprod_out) (const Tensor& self, int64_t dim, - c10::optional dtype, + std::optional dtype, const Tensor& result) { impl_func_cum_ops(self, dim, result, cumprod_stub); } @@ -869,7 +869,7 @@ Tensor cummaxmin_backward(const Tensor& grad, const Tensor& input, const Tensor& return result.scatter_add_(dim, indices, grad); } -static Tensor prepend_append_on_dim(const Tensor& self, const c10::optional& prepend, const c10::optional& append, int64_t dim) { +static Tensor prepend_append_on_dim(const Tensor& self, const std::optional& prepend, const c10::optional& append, int64_t dim) { // Helper for diff that handles prepending and appending when at least one is present TORCH_INTERNAL_ASSERT(prepend.has_value() || append.has_value(), "either prepend or append must be have value"); if (!prepend.has_value() && append.has_value()) { @@ -881,7 +881,7 @@ static Tensor prepend_append_on_dim(const Tensor& self, const c10::optional&other, int64_t dim) { +static inline void diff_check_compatible_shape(const Tensor& self, const std::optional&other, int64_t dim) { // Helper for diff that checks whether the shape of the tensor to prepend or append // is compatible with that of input if (other.has_value()) { @@ -902,7 +902,7 @@ static inline void diff_check_compatible_shape(const Tensor& self, const c10::op } } -static inline void diff_check(const Tensor& self, int64_t n, int64_t dim, const c10::optional&prepend, const c10::optional& append) { +static inline void diff_check(const Tensor& self, int64_t n, int64_t dim, const std::optional&prepend, const c10::optional& append) { // Helper for diff that checks whether its parameters are valid TORCH_CHECK( self.dim() >= 1, @@ -943,7 +943,7 
@@ static inline Tensor diff_helper(const Tensor& self, int64_t n, int64_t dim) { return result; } -Tensor diff(const Tensor& self, int64_t n, int64_t dim, const c10::optional& prepend, const c10::optional& append) { +Tensor diff(const Tensor& self, int64_t n, int64_t dim, const std::optional& prepend, const c10::optional& append) { diff_check(self, n, dim, prepend, append); if ((!prepend.has_value() && !append.has_value()) || n == 0) { return diff_helper(self, n, dim); @@ -987,7 +987,7 @@ static inline Tensor& diff_out_helper(const Tensor& self, int64_t n, int64_t dim return result; } -Tensor& diff_out(const Tensor& self, int64_t n, int64_t dim, const c10::optional& prepend, const c10::optional& append, Tensor& result) { +Tensor& diff_out(const Tensor& self, int64_t n, int64_t dim, const std::optional& prepend, const c10::optional& append, Tensor& result) { diff_check(self, n, dim, prepend, append); if ((!prepend.has_value() && !append.has_value()) || n == 0) { return diff_out_helper(self, n, dim, result); @@ -997,7 +997,7 @@ Tensor& diff_out(const Tensor& self, int64_t n, int64_t dim, const c10::optional } } -static void pre_check_gradient(const Tensor& self, c10::optional spacing_size, at::OptionalIntArrayRef dim, int64_t edge_order) { +static void pre_check_gradient(const Tensor& self, std::optional spacing_size, at::OptionalIntArrayRef dim, int64_t edge_order) { // Helper for gradient function to make sure input data satisfies prerequisites TORCH_CHECK(self.scalar_type() != ScalarType::Byte, "torch.gradient does not support uint8 input."); if (spacing_size.has_value() && !dim.has_value()) { @@ -1088,7 +1088,7 @@ static std::vector gradient_helper_float(const Tensor& self, ArrayRef gradient_dim_preprocess(const Tensor& self, c10::optional dim) { +static std::vector gradient_dim_preprocess(const Tensor& self, std::optional dim) { // if gradient dim is provided as an integer, then we need to compute gradient only on this direction. // Moreover, if it's not provided at all, then we are interested in gradient for all directions. // Finally, if dim is provided as vector of ints, then it is not expected to be called by this function. @@ -1103,16 +1103,16 @@ static std::vector gradient_dim_preprocess(const Tensor& self, c10::opt std::vector gradient(const Tensor& self, TensorList coordinates, IntArrayRef dim, int64_t edge_order) { pre_check_gradient(self, - c10::optional(coordinates.size()), + std::optional(coordinates.size()), at::OptionalIntArrayRef(dim), edge_order); return gradient_helper(self, coordinates, dim, edge_order); } -std::vector gradient(const Tensor& self, TensorList coordinates, c10::optional dim, int64_t edge_order) { +std::vector gradient(const Tensor& self, TensorList coordinates, std::optional dim, int64_t edge_order) { const auto processed_dim = gradient_dim_preprocess(self, dim); pre_check_gradient(self, - c10::optional(coordinates.size()), + std::optional(coordinates.size()), dim.has_value() ? 
at::OptionalIntArrayRef(processed_dim) : c10::nullopt, edge_order); return gradient_helper(self, coordinates, processed_dim, edge_order); @@ -1120,16 +1120,16 @@ std::vector gradient(const Tensor& self, TensorList coordinates, c10::op std::vector gradient(const Tensor& self, c10::ArrayRef spacing, IntArrayRef dim, int64_t edge_order) { pre_check_gradient(self, - c10::optional(spacing.size()), + std::optional(spacing.size()), at::OptionalIntArrayRef(dim), edge_order); return gradient_helper_float(self, spacing, dim, edge_order); } -std::vector gradient(const Tensor& self, ArrayRef spacing, c10::optional dim, int64_t edge_order) { +std::vector gradient(const Tensor& self, ArrayRef spacing, std::optional dim, int64_t edge_order) { const auto processed_dim = gradient_dim_preprocess(self, dim); pre_check_gradient(self, - c10::optional(spacing.size()), + std::optional(spacing.size()), dim.has_value() ? at::OptionalIntArrayRef(processed_dim) : c10::nullopt, edge_order); return gradient_helper_float(self, spacing, processed_dim, edge_order); @@ -1140,13 +1140,13 @@ std::vector gradient(const Tensor& self, const Scalar& unit_size, IntArr // be taken as unit size at every given dimension element of - dim. std::vector spacing(dim.size(), unit_size); pre_check_gradient(self, - c10::optional(spacing.size()), + std::optional(spacing.size()), at::OptionalIntArrayRef(dim), edge_order); return gradient_helper_float(self, spacing, dim, edge_order); } -std::vector gradient(const Tensor& self, const c10::optional& unit_size, c10::optional dim, int64_t edge_order) { +std::vector gradient(const Tensor& self, const std::optional& unit_size, c10::optional dim, int64_t edge_order) { const auto processed_dim = gradient_dim_preprocess(self, dim); // When unit_size not provided, it is always assumed to be equal to 1. // When dim has integer value it implies we are looking for gradient in the specific direction, however when @@ -1154,7 +1154,7 @@ std::vector gradient(const Tensor& self, const c10::optional& un std::vector spacing(dim.has_value() ? 1 : self.dim(), unit_size.has_value() ? unit_size.value() : 1.0) ; pre_check_gradient(self, - unit_size.has_value() ? c10::optional(spacing.size()) : c10::nullopt, + unit_size.has_value() ? std::optional(spacing.size()) : c10::nullopt, dim.has_value() ? 
at::OptionalIntArrayRef(processed_dim) : c10::nullopt, edge_order); return gradient_helper_float(self, spacing, processed_dim, edge_order); @@ -1163,7 +1163,7 @@ std::vector gradient(const Tensor& self, const c10::optional& un std::vector gradient(const Tensor& self, IntArrayRef dim, int64_t edge_order) { std::vector spacing(dim.size(), 1.0) ; pre_check_gradient(self, - c10::optional(spacing.size()), + std::optional(spacing.size()), at::OptionalIntArrayRef(dim), edge_order); return gradient_helper_float(self, spacing, dim, edge_order); @@ -1217,11 +1217,11 @@ TORCH_IMPL_FUNC(sum_out) } } -Tensor sum(const Tensor &self, c10::optional dtype) { +Tensor sum(const Tensor &self, std::optional dtype) { return at::sum(self, IntArrayRef{}, false, dtype); } -Tensor sum(const Tensor& self, DimnameList dim, bool keepdim, c10::optional dtype) { +Tensor sum(const Tensor& self, DimnameList dim, bool keepdim, std::optional dtype) { return at::sum(self, dimnames_to_positions(self, dim), keepdim, dtype); } @@ -1252,7 +1252,7 @@ Tensor& nansum_out(const Tensor& self, at::OptionalIntArrayRef dim, return result; } -Tensor nansum(const Tensor& self, at::OptionalIntArrayRef dim, bool keepdim, c10::optional opt_dtype) { +Tensor nansum(const Tensor& self, at::OptionalIntArrayRef dim, bool keepdim, std::optional opt_dtype) { ScalarType dtype = get_dtype_from_self(self, opt_dtype, true); Tensor result = create_reduction_result(self, dim, keepdim, dtype); return at::native::nansum_out(self, dim, keepdim, dtype, result); @@ -1306,7 +1306,7 @@ static void impl_func_prod( const Tensor& self, IntArrayRef dims, bool keepdim, - c10::optional dtype, + std::optional dtype, const Tensor& result) { auto iter = meta::make_reduction_from_out_ty(self, result, dims, keepdim, result.scalar_type()); if (iter.numel() == 0) { @@ -1320,12 +1320,12 @@ TORCH_IMPL_FUNC(prod_out) (const Tensor& self, int64_t dim, bool keepdim, - c10::optional dtype, + std::optional dtype, const Tensor& result) { impl_func_prod(self, dim, keepdim, dtype, result); } -Tensor prod(const Tensor &self, c10::optional opt_dtype) { +Tensor prod(const Tensor &self, std::optional opt_dtype) { auto dtype = get_dtype_from_self(self, opt_dtype, true); auto shape = meta::get_reduction_shape(self, {}, false); Tensor result = at::empty(shape, self.options().dtype(dtype)); @@ -1333,7 +1333,7 @@ Tensor prod(const Tensor &self, c10::optional opt_dtype) { return result; } -Tensor prod(const Tensor& self, Dimname dim, bool keepdim, c10::optional dtype) { +Tensor prod(const Tensor& self, Dimname dim, bool keepdim, std::optional dtype) { return at::prod(self, dimname_to_position(self, dim), keepdim, dtype); } @@ -1346,7 +1346,7 @@ TORCH_IMPL_FUNC(mean_out) (const Tensor& self, OptionalIntArrayRef opt_dim, bool keepdim, - c10::optional opt_dtype, + std::optional opt_dtype, const Tensor& result) { ScalarType dtype = result.scalar_type(); // TODO: the TensorIterator reduction implementation of mean @@ -1407,7 +1407,7 @@ Tensor mean(const Tensor& self, DimnameList dim, bool keepdim, optional opt_dtype, Tensor& result) { + bool keepdim, std::optional opt_dtype, Tensor& result) { return at::mean_out(result, self, dimnames_to_positions(self, dim), keepdim, opt_dtype); } @@ -1416,7 +1416,7 @@ Tensor& nanmean_out( const Tensor& self, at::OptionalIntArrayRef dim, bool keepdim, - c10::optional opt_dtype, + std::optional opt_dtype, Tensor& result) { TORCH_CHECK( self.is_floating_point() || self.is_complex(), @@ -1703,7 +1703,7 @@ TORCH_IMPL_FUNC(amax_out) (const Tensor& self, IntArrayRef 
dim, bool keepdim, co template void argmax_argmin_impl( const Tensor& self, - c10::optional dim, + std::optional dim, bool keepdim, const Tensor& result, Stub& stub) { @@ -1737,7 +1737,7 @@ void argmax_argmin_impl( TORCH_IMPL_FUNC(argmax_out) (const Tensor& self, - c10::optional dim, + std::optional dim, bool keepdim, const Tensor& result) { argmax_argmin_impl(self, dim, keepdim, result, argmax_stub); @@ -1745,7 +1745,7 @@ TORCH_IMPL_FUNC(argmax_out) TORCH_IMPL_FUNC(argmin_out) (const Tensor& self, - c10::optional dim, + std::optional dim, bool keepdim, const Tensor& result) { argmax_argmin_impl(self, dim, keepdim, result, argmin_stub); @@ -1812,7 +1812,7 @@ namespace { static Tensor& std_var_out( const char* fname, Tensor& result, const Tensor& self, - at::OptionalIntArrayRef dim, const c10::optional& correction_opt, + at::OptionalIntArrayRef dim, const std::optional& correction_opt, bool keepdim, bool take_sqrt) { TORCH_CHECK(self.device().is_cpu() || self.device().is_cuda(), "std and var only supports tensors on a CPU or CUDA device, but got: ", @@ -1884,7 +1884,7 @@ static Tensor& std_var_out( static std::tuple std_var_mean_out( const char* fname, Tensor& result1, Tensor& result2, const Tensor& self, - at::OptionalIntArrayRef dim, const c10::optional& correction_opt, + at::OptionalIntArrayRef dim, const std::optional& correction_opt, bool keepdim, bool take_sqrt) { AT_ASSERT(result1.defined() && result2.defined()); TORCH_CHECK(self.device().is_cpu() || self.is_cuda(), @@ -1995,7 +1995,7 @@ static TensorOptions options_to_value_type(TensorOptions opts) { std::tuple var_mean( const Tensor& self, at::OptionalIntArrayRef dim, - const c10::optional& correction, bool keepdim) { + const std::optional& correction, bool keepdim) { Tensor result1 = at::empty({0}, options_to_value_type(self.options())); Tensor result2 = at::empty({0}, self.options()); return std_var_mean_out( @@ -2004,7 +2004,7 @@ std::tuple var_mean( std::tuple std_mean( const Tensor& self, at::OptionalIntArrayRef dim, - const c10::optional& correction, bool keepdim) { + const std::optional& correction, bool keepdim) { Tensor result1 = at::empty({0}, options_to_value_type(self.options())); Tensor result2 = at::empty({0}, self.options()); return std_var_mean_out( @@ -2047,26 +2047,26 @@ Tensor& std_out(const Tensor& self, at::OptionalIntArrayRef opt_dim, bool unbias } Tensor std(const Tensor& self, at::OptionalIntArrayRef dim, - const c10::optional& correction, bool keepdim) { + const std::optional& correction, bool keepdim) { Tensor result = at::empty({0}, options_to_value_type(self.options())); return std_var_out("std", result, self, dim, correction, keepdim, true); } Tensor& std_out( const Tensor& self, at::OptionalIntArrayRef dim, - const c10::optional& correction, bool keepdim, Tensor& result) { + const std::optional& correction, bool keepdim, Tensor& result) { return std_var_out("std", result, self, dim, correction, keepdim, true); } Tensor& var_out( const Tensor& self, at::OptionalIntArrayRef dim, - const c10::optional& correction, bool keepdim, Tensor& result) { + const std::optional& correction, bool keepdim, Tensor& result) { return std_var_out("var", result, self, dim, correction, keepdim, false); } Tensor var( const Tensor& self, at::OptionalIntArrayRef dim, - const c10::optional& correction, bool keepdim) { + const std::optional& correction, bool keepdim) { Tensor result = at::empty({0}, options_to_value_type(self.options())); return std_var_out("var", result, self, dim, correction, keepdim, false); } @@ -2096,32 
+2096,32 @@ std::tuple std_mean(const Tensor& self, DimnameList dim, bool unb return at::std_mean(self, dimnames_to_positions(self, dim), unbiased, keepdim); } -Tensor std(const Tensor& self, DimnameList dim, const c10::optional& correction, bool keepdim) { +Tensor std(const Tensor& self, DimnameList dim, const std::optional& correction, bool keepdim) { return at::std(self, dimnames_to_positions(self, dim), correction, keepdim); } -Tensor& std_out(const Tensor& self, DimnameList dim, const c10::optional& correction, +Tensor& std_out(const Tensor& self, DimnameList dim, const std::optional& correction, bool keepdim, Tensor& result) { return at::std_out(result, self, dimnames_to_positions(self, dim), correction, keepdim); } -Tensor var(const Tensor& self, DimnameList dim, const c10::optional& correction, bool keepdim) { +Tensor var(const Tensor& self, DimnameList dim, const std::optional& correction, bool keepdim) { return at::var(self, dimnames_to_positions(self, dim), correction, keepdim); } -Tensor& var_out(const Tensor& self, DimnameList dim, const c10::optional& correction, +Tensor& var_out(const Tensor& self, DimnameList dim, const std::optional& correction, bool keepdim, Tensor& result) { return at::var_out( result, self, dimnames_to_positions(self, dim), correction, keepdim); } std::tuple var_mean(const Tensor& self, DimnameList dim, - const c10::optional& correction, bool keepdim) { + const std::optional& correction, bool keepdim) { return at::var_mean(self, dimnames_to_positions(self, dim), correction, keepdim); } std::tuple std_mean(const Tensor& self, DimnameList dim, - const c10::optional& correction, bool keepdim) { + const std::optional& correction, bool keepdim) { return at::std_mean(self, dimnames_to_positions(self, dim), correction, keepdim); } @@ -2167,22 +2167,22 @@ Tensor logcumsumexp(const Tensor& self, Dimname dim) { Tensor& logcumsumexp_out(const Tensor& self, Dimname dim, Tensor& result) { return at::logcumsumexp_out(result, self, dimname_to_position(self, dim)); } -Tensor cumsum(const Tensor& self, Dimname dim, c10::optional dtype) { +Tensor cumsum(const Tensor& self, Dimname dim, std::optional dtype) { return at::cumsum(self, dimname_to_position(self, dim), dtype); } -Tensor& cumsum_(Tensor& self, Dimname dim, c10::optional dtype) { +Tensor& cumsum_(Tensor& self, Dimname dim, std::optional dtype) { return at::cumsum_out(self, self, dimname_to_position(self, dim), dtype); } -Tensor& cumsum_out(const Tensor& self, Dimname dim, c10::optional dtype, Tensor& result) { +Tensor& cumsum_out(const Tensor& self, Dimname dim, std::optional dtype, Tensor& result) { return at::cumsum_out(result, self, dimname_to_position(self, dim), dtype); } -Tensor cumprod(const Tensor& self, Dimname dim, c10::optional dtype) { +Tensor cumprod(const Tensor& self, Dimname dim, std::optional dtype) { return at::cumprod(self, dimname_to_position(self, dim), dtype); } -Tensor& cumprod_(Tensor& self, Dimname dim, c10::optional dtype) { +Tensor& cumprod_(Tensor& self, Dimname dim, std::optional dtype) { return at::cumprod_out(self, self, dimname_to_position(self, dim), dtype); } -Tensor& cumprod_out(const Tensor& self, Dimname dim, c10::optional dtype, Tensor& result) { +Tensor& cumprod_out(const Tensor& self, Dimname dim, std::optional dtype, Tensor& result) { return at::cumprod_out(result, self, dimname_to_position(self, dim), dtype); } std::tuple cummax(const Tensor& self, Dimname dim) { @@ -2303,15 +2303,15 @@ Tensor value_selecting_reduction_backward_symint(const Tensor& grad, int64_t dim 
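The std/var family above now takes `const std::optional<Scalar>& correction` (template argument inferred; it is not visible in this rendering). A standalone sketch of the intended semantics, under the usual reading that a missing correction falls back to the unbiased default of 1 via value_or:

#include <optional>
#include <vector>

// Minimal sketch (assumed function, not ATen): variance with an optional
// correction term; absent correction behaves like correction = 1.
double variance_sketch(const std::vector<double>& x,
                       std::optional<double> correction) {
  const double c = correction.value_or(1.0);  // Bessel's correction by default
  double mean = 0.0;
  for (double v : x) mean += v;
  mean /= static_cast<double>(x.size());
  double ss = 0.0;
  for (double v : x) ss += (v - mean) * (v - mean);
  return ss / (static_cast<double>(x.size()) - c);  // divide by n - correction
}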
return inplace_scatter_if_not_tensor_subclass(grad, indices); } -Tensor sum_csr(const Tensor &self, c10::optional dtype) { +Tensor sum_csr(const Tensor &self, std::optional dtype) { return self.values().sum(dtype); } -Tensor sum_coo(const Tensor &self, c10::optional dtype) { +Tensor sum_coo(const Tensor &self, std::optional dtype) { return self._values().sum(dtype); } -Tensor sum_sparse_coo(const Tensor& self, at::OptionalIntArrayRef dim, bool keepdim, c10::optional dtype) { +Tensor sum_sparse_coo(const Tensor& self, at::OptionalIntArrayRef dim, bool keepdim, std::optional dtype) { Tensor result; if (dim.has_value()) { if (dtype.has_value()) { @@ -2341,7 +2341,7 @@ Tensor sum_sparse_compressed( const Tensor& self, at::OptionalIntArrayRef dim, bool keepdim, - c10::optional dtype) { + std::optional dtype) { // TODO: The signature of sum.dim_IntList and _sparse_csr_sum.dim_dtype is a little // bit different in the second parameters `dim`, which causes the conversion of `dim` // to call into `_sparse_csr_sum`. Align the signatures would be a better choice. diff --git a/aten/src/ATen/native/ReduceOps.h b/aten/src/ATen/native/ReduceOps.h index 604d6ae8a74ef..d834f17a6d774 100644 --- a/aten/src/ATen/native/ReduceOps.h +++ b/aten/src/ATen/native/ReduceOps.h @@ -33,7 +33,7 @@ using reduce_std_var_function = DECLARE_DISPATCH(reduce_std_var_function, std_var_stub); using reduce_norm_fn = - void (*)(Tensor&, const Tensor&, const c10::Scalar&, c10::optional); + void (*)(Tensor&, const Tensor&, const c10::Scalar&, std::optional); DECLARE_DISPATCH(reduce_norm_fn, norm_kernel); using reduce_fn_flag = void(*)(TensorIterator &, const c10::Scalar&); diff --git a/aten/src/ATen/native/ReduceOpsUtils.h b/aten/src/ATen/native/ReduceOpsUtils.h index 6989b00f6f3e6..505cf3bb3a778 100644 --- a/aten/src/ATen/native/ReduceOpsUtils.h +++ b/aten/src/ATen/native/ReduceOpsUtils.h @@ -77,7 +77,7 @@ inline bool _dimreduce_return_trivial_no_ident(Tensor &result, const Tensor &sel return false; } -inline c10::optional _allreduce_return_trivial( +inline std::optional _allreduce_return_trivial( const Tensor& self, const Scalar& ident) { // Return identity @@ -102,7 +102,7 @@ static inline void check_scalar_type_device_layout_equal(const Tensor& out, cons OPTION_TYPE_EQUALITY_CHECK(layout, out.options(), self.options()); } -static inline Tensor integer_upcast(const Tensor& self, c10::optional dtype) { +static inline Tensor integer_upcast(const Tensor& self, std::optional dtype) { ScalarType scalarType = self.scalar_type(); TORCH_CHECK(!isBarebonesUnsignedType(scalarType), "integer upcasting for uint16, uint32 and uint64 is not currently implemented"); ScalarType upcast_scalarType = dtype.value_or(at::isIntegralType(scalarType, /*includeBool=*/true) ? ScalarType::Long : scalarType); @@ -323,7 +323,7 @@ static C10_UNUSED void zero_numel_tensor_resize(Tensor& result, Tensor& result_i inline ScalarType get_dtype_from_self( const Tensor& self, - const c10::optional& dtype, + const std::optional& dtype, bool promote_integers) { if (dtype.has_value()) { return dtype.value(); @@ -335,7 +335,7 @@ inline ScalarType get_dtype_from_self( return src_type; } -inline ScalarType get_dtype_from_result(Tensor& result, c10::optional dtype) { +inline ScalarType get_dtype_from_result(Tensor& result, std::optional dtype) { TORCH_CHECK(result.defined(), "Cannot create a new tensor inside a reduction op. 
You likely tried to call an operator with an out argument but the out argument was an undefined tensor."); if (dtype.has_value()) { return dtype.value(); diff --git a/aten/src/ATen/native/Repeat.cpp b/aten/src/ATen/native/Repeat.cpp index dd87cead1f480..8bd253134b7a9 100644 --- a/aten/src/ATen/native/Repeat.cpp +++ b/aten/src/ATen/native/Repeat.cpp @@ -41,7 +41,7 @@ namespace at::native { Tensor repeat_interleave_cpu( const Tensor& repeat, - c10::optional output_size) { + std::optional output_size) { Tensor output; AT_DISPATCH_INDEX_TYPES(repeat.scalar_type(), "repeat_interleave_cpu", [&]() { output = repeat_interleave_common>( @@ -54,8 +54,8 @@ Tensor repeat_interleave_cpu( Tensor repeat_interleave_symint( const Tensor& self, const Tensor& repeats, - c10::optional dim, - c10::optional output_size) { + std::optional dim, + std::optional output_size) { Tensor input = self; // Store conj and neg bits @@ -101,8 +101,8 @@ Tensor repeat_interleave_symint( Tensor repeat_interleave_symint( const Tensor& self, c10::SymInt repeats, - c10::optional dim_opt, - c10::optional output_size) { + std::optional dim_opt, + std::optional output_size) { Tensor input = dim_opt ? self : self.flatten(); int64_t dim = c10::maybe_wrap_dim(dim_opt.value_or(0), self.dim()); TORCH_CHECK(repeats >= 0, "Repeats must be non-negative"); diff --git a/aten/src/ATen/native/Repeat.h b/aten/src/ATen/native/Repeat.h index e9a471d16f931..879a09bddd99b 100644 --- a/aten/src/ATen/native/Repeat.h +++ b/aten/src/ATen/native/Repeat.h @@ -17,7 +17,7 @@ template < void compute(const index_t*, const int64_t*, index_t*, int64_t, int64_t)> static inline Tensor repeat_interleave_common( const Tensor& repeats, - c10::optional output_size) { + std::optional output_size) { TORCH_CHECK( repeats.dim() == 1, "repeat_interleave only accept 1D vector as repeat"); TORCH_CHECK( diff --git a/aten/src/ATen/native/Resize.cpp b/aten/src/ATen/native/Resize.cpp index be88538ed7082..fd06627b70277 100644 --- a/aten/src/ATen/native/Resize.cpp +++ b/aten/src/ATen/native/Resize.cpp @@ -136,7 +136,7 @@ const Tensor& resize_as_sparse_(const Tensor& self, const Tensor& src); const Tensor& resize_as_( const Tensor& self, const Tensor& the_template, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { if (self.is_sparse() && the_template.is_sparse()) { TORCH_CHECK( !optional_memory_format.has_value(), @@ -243,7 +243,7 @@ template const Tensor& _resize_( const Tensor& self, ArrayRef size, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { auto* self_ = self.unsafeGetTensorImpl(); int64_t old_storage_nbytes = self_->unsafe_storage() ? 
self_->unsafe_storage().sym_nbytes().maybe_as_int().value_or(-1) : 0; // NOLINTNEXTLINE(bugprone-argument-comment) @@ -267,7 +267,7 @@ const Tensor& _resize_( const Tensor& resize_( const Tensor& self, IntArrayRef size, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { if (self.has_names()) { return resize_named_tensor_(self, size, optional_memory_format); } @@ -277,7 +277,7 @@ const Tensor& resize_( const Tensor& resize__symint( const Tensor& self, c10::SymIntArrayRef size, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { TORCH_INTERNAL_ASSERT(!self.has_names()) return _resize_(self, size, optional_memory_format); } diff --git a/aten/src/ATen/native/ResizeCommon.h b/aten/src/ATen/native/ResizeCommon.h index 02d1e95c42efe..cea2612a22127 100644 --- a/aten/src/ATen/native/ResizeCommon.h +++ b/aten/src/ATen/native/ResizeCommon.h @@ -32,7 +32,7 @@ inline T storage_size_for(ArrayRef size, ArrayRef stride) { inline const Tensor& resize_named_tensor_( const Tensor& self, IntArrayRef size, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { TORCH_INTERNAL_ASSERT(self.has_names()); TORCH_CHECK( self.sizes() == size, diff --git a/aten/src/ATen/native/ScatterGatherChecks.h b/aten/src/ATen/native/ScatterGatherChecks.h index 829959c347035..4aad28eb1f73a 100644 --- a/aten/src/ATen/native/ScatterGatherChecks.h +++ b/aten/src/ATen/native/ScatterGatherChecks.h @@ -15,7 +15,7 @@ static void scatter_gather_dtype_check( const std::string& method_name, const Tensor& self, const Tensor& index, - const c10::optional& src_opt = c10::nullopt + const std::optional& src_opt = c10::nullopt ) { if (index.numel() != 0) { TORCH_CHECK( @@ -66,7 +66,7 @@ static C10_UNUSED void gather_shape_check(const Tensor& self, int64_t dim, // 3. 
index.dim() == self.dim() == src.dim() static C10_UNUSED void scatter_shape_check( const Tensor& self, int64_t dim, const Tensor& index, - const c10::optional& src_opt = c10::nullopt + const std::optional& src_opt = c10::nullopt ) { if (index.numel() == 0) return; TORCH_CHECK( diff --git a/aten/src/ATen/native/SegmentReduce.cpp b/aten/src/ATen/native/SegmentReduce.cpp index 3c7b539ee4b6d..0ab01bbe8c0bd 100644 --- a/aten/src/ATen/native/SegmentReduce.cpp +++ b/aten/src/ATen/native/SegmentReduce.cpp @@ -33,7 +33,7 @@ void _segment_reduce_lengths_cpu_kernel1( const Tensor& data, const T* lengths_data, int64_t axis, - const c10::optional& initial, + const std::optional& initial, Tensor& output, int64_t segment_count, int64_t lengths_stride_axis) { @@ -132,7 +132,7 @@ Tensor _segment_reduce_lengths_cpu_kernel( const Tensor& data, const Tensor& lengths, int64_t axis, - const c10::optional& initial) { + const std::optional& initial) { // data and lengths should be contiguous from the call to .contiguous in segment_reduce_kernel TORCH_CHECK(data.is_contiguous(), "Expected data to be contiguous."); TORCH_CHECK(lengths.is_contiguous(), "Expected lengths to be contiguous."); @@ -158,7 +158,7 @@ Tensor _segment_reduce_offsets_cpu_kernel( const Tensor& data, const Tensor& offsets, int64_t axis, - const c10::optional& initial) { + const std::optional& initial) { // data and lengths should be contiguous from the call to .contiguous in segment_reduce_kernel TORCH_CHECK(data.is_contiguous(), "Expected data to be contiguous."); TORCH_CHECK(offsets.is_contiguous(), "Expected offsets to be contiguous."); @@ -187,7 +187,7 @@ void _segment_reduce_cpu_lengths_backward_kernel1( ReductionType reduction, const T* lengths_data, int64_t axis, - const c10::optional& initial, + const std::optional& initial, Tensor& grad_input, int64_t segment_count, int64_t lengths_stride_axis) { @@ -323,7 +323,7 @@ Tensor _segment_reduce_cpu_lengths_backward_kernel( ReductionType reduction, const Tensor& lengths_contig, int64_t axis, - const c10::optional& initial) { + const std::optional& initial) { axis = lengths_contig.dim() - 1; int64_t segment_count = lengths_contig.size(axis); int64_t lengths_stride_axis = lengths_contig.stride(axis); @@ -356,7 +356,7 @@ Tensor _segment_reduce_cpu_offsets_backward_kernel( ReductionType reduction, const Tensor& offsets_contig, int64_t axis, - const c10::optional& initial) { + const std::optional& initial) { axis = offsets_contig.dim() - 1; int64_t segment_count = offsets_contig.size(axis) - 1; int64_t offsets_stride_axis = offsets_contig.stride(axis); @@ -386,12 +386,12 @@ Tensor _segment_reduce_cpu_offsets_backward_kernel( Tensor segment_reduce_kernel( const Tensor& data, c10::string_view reduce, - const c10::optional& lengths, - const c10::optional& indices, - const c10::optional& offsets, + const std::optional& lengths, + const std::optional& indices, + const std::optional& offsets, int64_t axis, bool unsafe, - const c10::optional& initial) { + const std::optional& initial) { axis = maybe_wrap_dim(axis, data.ndimension()); TORCH_CHECK(data.numel() >= 0); @@ -484,13 +484,13 @@ Tensor _segment_reduce_backward_kernel( const Tensor& output, const Tensor& data, c10::string_view reduce, - const c10::optional& lengths, - const c10::optional& offsets, + const std::optional& lengths, + const std::optional& offsets, int64_t axis, - const c10::optional& initial) { + const std::optional& initial) { axis = maybe_wrap_dim(axis, data.ndimension()); // check that one of lengths or offsets is defined - // 
codegen for derivatives.yaml passes an undefined Tensor for None rather than a c10::optional + // codegen for derivatives.yaml passes an undefined Tensor for None rather than a std::optional // so checking .has_value() doesn't work unlike in the forward pass auto lengths_has_value = lengths.has_value() && lengths.value().defined(); auto offsets_has_value = offsets.has_value() && offsets.value().defined(); diff --git a/aten/src/ATen/native/SegmentReduce.h b/aten/src/ATen/native/SegmentReduce.h index 0f14aff64f887..44429d0594bfc 100644 --- a/aten/src/ATen/native/SegmentReduce.h +++ b/aten/src/ATen/native/SegmentReduce.h @@ -15,7 +15,7 @@ using segment_reduce_lengths_fn = Tensor (*)( const Tensor&, const Tensor&, int64_t, - const c10::optional&); + const std::optional&); DECLARE_DISPATCH(segment_reduce_lengths_fn, _segment_reduce_lengths_stub); using segment_reduce_offsets_fn = Tensor (*)( @@ -23,7 +23,7 @@ using segment_reduce_offsets_fn = Tensor (*)( const Tensor&, const Tensor&, int64_t, - const c10::optional&); + const std::optional&); DECLARE_DISPATCH(segment_reduce_offsets_fn, _segment_reduce_offsets_stub); using segment_reduce_lengths_backward_fn = Tensor (*)( @@ -33,7 +33,7 @@ using segment_reduce_lengths_backward_fn = Tensor (*)( ReductionType, const Tensor&, int64_t, - const c10::optional&); + const std::optional&); DECLARE_DISPATCH(segment_reduce_lengths_backward_fn, _segment_reduce_lengths_backward_stub); using segment_reduce_offsets_backward_fn = Tensor (*)( @@ -43,7 +43,7 @@ using segment_reduce_offsets_backward_fn = Tensor (*)( ReductionType, const Tensor&, int64_t, - const c10::optional&); + const std::optional&); DECLARE_DISPATCH(segment_reduce_offsets_backward_fn, _segment_reduce_offsets_backward_stub); } // namespace native diff --git a/aten/src/ATen/native/SoftMax.cpp b/aten/src/ATen/native/SoftMax.cpp index bd321a0a88e7a..3188479b931f3 100644 --- a/aten/src/ATen/native/SoftMax.cpp +++ b/aten/src/ATen/native/SoftMax.cpp @@ -155,7 +155,7 @@ void host_softmax( const Tensor& input, const int64_t dim, bool* mask = nullptr, - const c10::optional mask_type_ = {}) { + const std::optional mask_type_ = {}) { if (MaskedSoftMax) { TORCH_CHECK(mask_type_.has_value(), "Mask Type should be defined"); @@ -449,7 +449,7 @@ static Tensor softmax(const Tensor& input_, const int64_t dim_) { return result; } -Tensor softmax(const Tensor& input_, const int64_t dim_, c10::optional dtype) { +Tensor softmax(const Tensor& input_, const int64_t dim_, std::optional dtype) { auto result = [&]() { NoNamesGuard guard; if (input_.is_cuda() && input_.scalar_type() == ScalarType::Half && dtype == ScalarType::Float){ @@ -466,7 +466,7 @@ Tensor softmax(const Tensor& input_, const int64_t dim_, c10::optional dtype, + std::optional dtype, Tensor& output_) { Tensor output_temp; if (input_.is_cuda() && input_.scalar_type() == ScalarType::Half && @@ -501,7 +501,7 @@ Tensor& softmax_out( } // special_softmax, alias for softmax -Tensor special_softmax(const Tensor& input_, const int64_t dim_, c10::optional dtype) { +Tensor special_softmax(const Tensor& input_, const int64_t dim_, std::optional dtype) { return at::softmax(input_, dim_, dtype); } @@ -514,7 +514,7 @@ static Tensor log_softmax(const Tensor& input_, const int64_t dim_) { return result; } -Tensor log_softmax(const Tensor& input_, const int64_t dim_, c10::optional dtype) { +Tensor log_softmax(const Tensor& input_, const int64_t dim_, std::optional dtype) { auto result = [&]() { NoNamesGuard guard; if (input_.is_cuda() && input_.scalar_type() == 
ScalarType::Half && dtype == ScalarType::Float){ @@ -531,7 +531,7 @@ Tensor log_softmax(const Tensor& input_, const int64_t dim_, c10::optional dtype, + std::optional dtype, Tensor& output_) { Tensor output_temp; if (input_.is_cuda() && input_.scalar_type() == ScalarType::Half && @@ -565,7 +565,7 @@ Tensor& log_softmax_out( return output_; } -Tensor special_log_softmax(const Tensor& input, const int64_t dim, c10::optional dtype) { +Tensor special_log_softmax(const Tensor& input, const int64_t dim, std::optional dtype) { return at::log_softmax(input, dim, dtype); } @@ -587,7 +587,7 @@ Tensor log_softmax(const Tensor& self, Dimname dim, optional dtype) return at::log_softmax(self, dimname_to_position(self, dim), dtype); } -Tensor masked_softmax_cpu(const Tensor& input_, const Tensor& mask_, const c10::optional dim_, const c10::optional mask_type_) { +Tensor masked_softmax_cpu(const Tensor& input_, const Tensor& mask_, const std::optional dim_, const c10::optional mask_type_) { auto mask = mask_.contiguous(); auto mask_type = mask_type_; // Mask type might get transformed below @@ -652,7 +652,7 @@ Tensor masked_softmax_backward_cpu( const Tensor& grad_, const Tensor& output_, const Tensor& mask_, - const c10::optional dim_) { + const std::optional dim_) { TORCH_CHECK( grad_.sizes() == mask_.sizes(), "Mask shape should match grad shape"); TORCH_CHECK( diff --git a/aten/src/ATen/native/Sorting.cpp b/aten/src/ATen/native/Sorting.cpp index b31007408c7ae..f9980ffd7229d 100644 --- a/aten/src/ATen/native/Sorting.cpp +++ b/aten/src/ATen/native/Sorting.cpp @@ -71,7 +71,7 @@ TORCH_META_FUNC(topk) } TORCH_META_FUNC2(sort, stable) -(const Tensor& self, c10::optional stable, int64_t dim, bool descending) { +(const Tensor& self, std::optional stable, int64_t dim, bool descending) { maybe_wrap_dim(dim, self.dim()); // See issue: https://github.com/pytorch/pytorch/issues/65863 @@ -939,7 +939,7 @@ Tensor nanmedian_cpu(const Tensor& self) { TORCH_IMPL_FUNC(sort_stable_out) (const Tensor& self, - c10::optional stable, + std::optional stable, int64_t dim, bool descending, const Tensor& values, diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index 7ed068874e68a..5f9ff1b838220 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -114,7 +114,7 @@ Tensor promote_tensor_fft(const Tensor& t, bool require_complex=false) { // Convert NumPy compatible normalization mode string to enum values // NOTE: NumPy's normalization modes have direction-specific meanings. For example, // "forward" translates to `by_n` for a forward transform and `none` for backward. -fft_norm_mode norm_from_string(c10::optional norm, bool forward) { +fft_norm_mode norm_from_string(std::optional norm, bool forward) { if (!norm || *norm == "backward") { return forward ? 
fft_norm_mode::none : fft_norm_mode::by_n; } @@ -197,8 +197,8 @@ Tensor fft_c2c_maybe_out( // Complex to real FFT Tensor fft_c2r(c10::string_view function_name, - Tensor out, Tensor input, c10::optional n_opt, - int64_t unwrapped_dim, c10::optional norm_str, + Tensor out, Tensor input, std::optional n_opt, + int64_t unwrapped_dim, std::optional norm_str, bool forward) { TORCH_CHECK(!out.defined() || out.is_floating_point(), function_name, " expects a floating point output tensor, but got ", out.scalar_type()); @@ -221,8 +221,8 @@ Tensor fft_c2r(c10::string_view function_name, // Real to complex FFT Tensor fft_r2c(c10::string_view function_name, - Tensor out, Tensor input, c10::optional n_opt, - int64_t unwrapped_dim, c10::optional norm_str, + Tensor out, Tensor input, std::optional n_opt, + int64_t unwrapped_dim, std::optional norm_str, bool forward, bool onesided) { TORCH_CHECK(!input.is_complex(), function_name, " expects a real input tensor, but got ", input.scalar_type()); @@ -256,8 +256,8 @@ Tensor fft_r2c(c10::string_view function_name, // Complex to complex FFT Tensor fft_c2c(c10::string_view function_name, - Tensor out, Tensor input, c10::optional n_opt, - int64_t unwrapped_dim, c10::optional norm_str, + Tensor out, Tensor input, std::optional n_opt, + int64_t unwrapped_dim, std::optional norm_str, bool forward) { TORCH_CHECK(input.is_complex(), function_name, " expects a complex input tensor, but got ", input.scalar_type()); @@ -346,7 +346,7 @@ ShapeAndDims canonicalize_fft_shape_and_dim_args( Tensor fftn_c2c( c10::string_view function_name, Tensor out, const Tensor& input, SymIntArrayRef shape, - IntArrayRef dim, c10::optional norm_str, bool forward) { + IntArrayRef dim, std::optional norm_str, bool forward) { TORCH_CHECK(input.is_complex(), function_name, " expects a complex input tensor, but got", input.scalar_type()); Tensor x = resize_fft_input(input, dim, shape); const auto norm = static_cast(norm_from_string(norm_str, forward)); @@ -357,15 +357,15 @@ Tensor fftn_c2c( } // namespace (anonymous) // torch.fft.fft, analogous to NumPy's numpy.fft.fft -Tensor fft_fft_symint(const Tensor& self, c10::optional n, int64_t dim, - c10::optional norm) { +Tensor fft_fft_symint(const Tensor& self, std::optional n, int64_t dim, + std::optional norm) { return self.is_complex() ? fft_c2c("fft", {}, self, n, dim, norm, /*forward=*/true) : fft_r2c("fft", {}, self, n, dim, norm, /*forward=*/true, /*onesided=*/false); } -Tensor& fft_fft_symint_out(const Tensor& self, c10::optional n, - int64_t dim, c10::optional norm, Tensor& out) { +Tensor& fft_fft_symint_out(const Tensor& self, std::optional n, + int64_t dim, std::optional norm, Tensor& out) { if (self.is_complex()) { fft_c2c("fft", out, self, n, dim, norm, /*forward=*/true); } else { @@ -374,15 +374,15 @@ Tensor& fft_fft_symint_out(const Tensor& self, c10::optional n, return out; } -Tensor fft_ifft_symint(const Tensor& self, c10::optional n, int64_t dim, - c10::optional norm) { +Tensor fft_ifft_symint(const Tensor& self, std::optional n, int64_t dim, + std::optional norm) { return self.is_complex() ? 
fft_c2c("ifft", {}, self, n, dim, norm, /*forward=*/false) : fft_r2c("ifft", {}, self, n, dim, norm, /*forward=*/false, /*onesided=*/false); } -Tensor& fft_ifft_symint_out(const Tensor& self, c10::optional n, - int64_t dim, c10::optional norm, Tensor& out) { +Tensor& fft_ifft_symint_out(const Tensor& self, std::optional n, + int64_t dim, std::optional norm, Tensor& out) { if (self.is_complex()) { fft_c2c("ifft", out, self, n, dim, norm, /*forward=*/false); } else { @@ -391,53 +391,53 @@ Tensor& fft_ifft_symint_out(const Tensor& self, c10::optional n, return out; } -Tensor fft_rfft_symint(const Tensor& self, c10::optional n, int64_t dim, - c10::optional norm) { +Tensor fft_rfft_symint(const Tensor& self, std::optional n, int64_t dim, + std::optional norm) { return fft_r2c("rfft", {}, self, n, dim, norm, /*forward=*/true, /*onesided=*/true); } -Tensor& fft_rfft_symint_out(const Tensor& self, c10::optional n, - int64_t dim, c10::optional norm, Tensor& out) { +Tensor& fft_rfft_symint_out(const Tensor& self, std::optional n, + int64_t dim, std::optional norm, Tensor& out) { fft_r2c("rfft", out, self, n, dim, norm, /*forward=*/true, /*onesided=*/true); return out; } -Tensor fft_irfft_symint(const Tensor& self, c10::optional n, int64_t dim, - c10::optional norm) { +Tensor fft_irfft_symint(const Tensor& self, std::optional n, int64_t dim, + std::optional norm) { return fft_c2r("irfft", {}, self, n, dim, norm, /*forward=*/false); } -Tensor& fft_irfft_symint_out(const Tensor& self, c10::optional n, - int64_t dim, c10::optional norm, Tensor& out) { +Tensor& fft_irfft_symint_out(const Tensor& self, std::optional n, + int64_t dim, std::optional norm, Tensor& out) { fft_c2r("irfft", out, self, n, dim, norm, /*forward=*/false); return out; } -Tensor fft_hfft_symint(const Tensor& self, c10::optional n, int64_t dim, - c10::optional norm) { +Tensor fft_hfft_symint(const Tensor& self, std::optional n, int64_t dim, + std::optional norm) { return fft_c2r("hfft", {}, self, n, dim, norm, /*forward=*/true); } -Tensor& fft_hfft_symint_out(const Tensor& self, c10::optional n, - int64_t dim, c10::optional norm, Tensor& out) { +Tensor& fft_hfft_symint_out(const Tensor& self, std::optional n, + int64_t dim, std::optional norm, Tensor& out) { fft_c2r("hfft", out, self, n, dim, norm, /*forward=*/true); return out; } -Tensor fft_ihfft_symint(const Tensor& self, c10::optional n, int64_t dim, - c10::optional norm) { +Tensor fft_ihfft_symint(const Tensor& self, std::optional n, int64_t dim, + std::optional norm) { return fft_r2c("ihfft", {}, self, n, dim, norm, /*forward=*/false, /*onesided=*/true); } -Tensor& fft_ihfft_symint_out(const Tensor& self, c10::optional n, - int64_t dim, c10::optional norm, Tensor& out) { +Tensor& fft_ihfft_symint_out(const Tensor& self, std::optional n, + int64_t dim, std::optional norm, Tensor& out) { fft_r2c("ihfft", out, self, n, dim, norm, /*forward=*/false, /*onesided=*/true); return out; } Tensor fft_fftn_symint(const Tensor& self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, - c10::optional norm) { + std::optional norm) { auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); // TODO: For real input, perform rfftn then mirror with conjugate symmetry Tensor input = promote_tensor_fft(self, /*require_complex=*/true); @@ -447,7 +447,7 @@ Tensor fft_fftn_symint(const Tensor& self, at::OptionalSymIntArrayRef s, Tensor& fft_fftn_symint_out(const Tensor& self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, - c10::optional norm, Tensor& out) { + std::optional 
norm, Tensor& out) { auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); // TODO: For real input, perform rfftn then mirror with conjugate symmetry Tensor input = promote_tensor_fft(self, /*require_complex=*/true); @@ -457,7 +457,7 @@ Tensor& fft_fftn_symint_out(const Tensor& self, Tensor fft_ifftn_symint(const Tensor& self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, - c10::optional norm) { + std::optional norm) { auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); Tensor input = promote_tensor_fft(self, /*require_complex=*/true); return fftn_c2c("ifftn", {}, input, desc.shape, desc.dim, norm, /*forward=*/false); @@ -466,7 +466,7 @@ Tensor fft_ifftn_symint(const Tensor& self, at::OptionalSymIntArrayRef s, Tensor& fft_ifftn_symint_out(const Tensor& self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, - c10::optional norm, Tensor& out) { + std::optional norm, Tensor& out) { auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); Tensor input = promote_tensor_fft(self, /*require_complex=*/true); fftn_c2c("ifftn", out, input, desc.shape, desc.dim, norm, /*forward=*/false); @@ -476,7 +476,7 @@ Tensor& fft_ifftn_symint_out(const Tensor& self, static Tensor fft_rfftn_impl(Tensor out, const Tensor& self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, - const c10::optional& norm_str) { + const std::optional& norm_str) { TORCH_CHECK(!self.is_complex(), "rfftn expects a real-valued input tensor, but got ", self.scalar_type()); auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); TORCH_CHECK(!desc.shape.empty(), "rfftn must transform at least one axis"); @@ -489,14 +489,14 @@ static Tensor fft_rfftn_impl(Tensor out, const Tensor& self, Tensor fft_rfftn_symint(const Tensor& self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, - c10::optional norm_str) { + std::optional norm_str) { return fft_rfftn_impl({}, self, s, dim, norm_str); } Tensor& fft_rfftn_symint_out(const Tensor& self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, - c10::optional norm_str, Tensor& out) { + std::optional norm_str, Tensor& out) { fft_rfftn_impl(out, self, s, dim, norm_str); return out; } @@ -528,7 +528,7 @@ static ShapeAndDims canonicalize_fft_c2r_shape_and_dim_args( static Tensor fft_irfftn_impl(Tensor out, const Tensor& self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, - const c10::optional& norm_str) { + const std::optional& norm_str) { SymInt last_dim_size = 0; auto desc = canonicalize_fft_c2r_shape_and_dim_args( "irfftn", self, s, dim, last_dim_size); @@ -542,14 +542,14 @@ static Tensor fft_irfftn_impl(Tensor out, const Tensor& self, Tensor fft_irfftn_symint(const Tensor& self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, - c10::optional norm_str) { + std::optional norm_str) { return fft_irfftn_impl({}, self, s, dim, norm_str); } Tensor& fft_irfftn_symint_out(const Tensor& self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, - c10::optional norm_str, Tensor& out) { + std::optional norm_str, Tensor& out) { fft_irfftn_impl(out, self, s, dim, norm_str); return out; } @@ -558,7 +558,7 @@ static Tensor fft_hfftn_impl( const Tensor& self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, - c10::optional norm_str, + std::optional norm_str, const Tensor& out) { constexpr c10::string_view fname = "hfftn"; SymInt last_dim_size = 0; @@ -586,14 +586,14 @@ Tensor fft_hfftn_symint( const Tensor& self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, - c10::optional norm) { 
+ std::optional norm) { return fft_hfftn_impl(self, s, dim, norm, {}); } const Tensor& fft_hfftn_symint_out( const Tensor& self, at::OptionalSymIntArrayRef s, - at::OptionalIntArrayRef dim, c10::optional norm, + at::OptionalIntArrayRef dim, std::optional norm, const Tensor& out) { fft_hfftn_impl(self, s, dim, norm, out); return out; @@ -603,7 +603,7 @@ static Tensor fft_ihfftn_impl( const Tensor& self, const at::OptionalSymIntArrayRef& s, const at::OptionalIntArrayRef& dim, - const c10::optional& norm_str, + const std::optional& norm_str, const Tensor& out) { constexpr c10::string_view fname = "ihfftn"; auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); @@ -628,7 +628,7 @@ Tensor fft_ihfftn_symint( const Tensor& self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, - c10::optional norm) { + std::optional norm) { return fft_ihfftn_impl(self, s, dim, norm, {}); } @@ -636,71 +636,71 @@ const Tensor& fft_ihfftn_symint_out( const Tensor& self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, - c10::optional norm, + std::optional norm, const Tensor& out) { fft_ihfftn_impl(self, s, dim, norm, out); return out; } Tensor fft_fft2_symint(const Tensor& self, at::OptionalSymIntArrayRef s, - IntArrayRef dim, c10::optional norm) { + IntArrayRef dim, std::optional norm) { return native::fft_fftn_symint(self, s, dim, std::move(norm)); } Tensor& fft_fft2_symint_out(const Tensor& self, at::OptionalSymIntArrayRef s, - IntArrayRef dim, c10::optional norm, Tensor& out) { + IntArrayRef dim, std::optional norm, Tensor& out) { return native::fft_fftn_symint_out(self, s, dim, std::move(norm), out); } Tensor fft_ifft2_symint(const Tensor& self, at::OptionalSymIntArrayRef s, - IntArrayRef dim, c10::optional norm) { + IntArrayRef dim, std::optional norm) { return native::fft_ifftn_symint(self, s, dim, std::move(norm)); } Tensor& fft_ifft2_symint_out(const Tensor& self, at::OptionalSymIntArrayRef s, - IntArrayRef dim, c10::optional norm, Tensor& out) { + IntArrayRef dim, std::optional norm, Tensor& out) { return native::fft_ifftn_symint_out(self, s, dim, std::move(norm), out); } Tensor fft_rfft2_symint(const Tensor& self, at::OptionalSymIntArrayRef s, - IntArrayRef dim, c10::optional norm) { + IntArrayRef dim, std::optional norm) { return native::fft_rfftn_symint(self, s, dim, std::move(norm)); } Tensor& fft_rfft2_symint_out(const Tensor& self, at::OptionalSymIntArrayRef s, - IntArrayRef dim, c10::optional norm, Tensor& out) { + IntArrayRef dim, std::optional norm, Tensor& out) { return native::fft_rfftn_symint_out(self, s, dim, std::move(norm), out); } Tensor fft_irfft2_symint(const Tensor& self, at::OptionalSymIntArrayRef s, - IntArrayRef dim, c10::optional norm) { + IntArrayRef dim, std::optional norm) { return native::fft_irfftn_symint(self, s, dim, std::move(norm)); } Tensor& fft_irfft2_symint_out(const Tensor& self, at::OptionalSymIntArrayRef s, - IntArrayRef dim, c10::optional norm, Tensor& out) { + IntArrayRef dim, std::optional norm, Tensor& out) { return native::fft_irfftn_symint_out(self, s, dim, std::move(norm), out); } const Tensor& fft_hfft2_symint_out( const Tensor& self, at::OptionalSymIntArrayRef s, IntArrayRef dim, - c10::optional norm, const Tensor& out) { + std::optional norm, const Tensor& out) { return native::fft_hfftn_symint_out(self, s, dim, std::move(norm), out); } Tensor fft_hfft2_symint(const Tensor& self, at::OptionalSymIntArrayRef s, - IntArrayRef dim, c10::optional norm) { + IntArrayRef dim, std::optional norm) { return native::fft_hfftn_symint(self, s, 
dim, std::move(norm)); } const Tensor& fft_ihfft2_symint_out( const Tensor& self, at::OptionalSymIntArrayRef s, IntArrayRef dim, - c10::optional norm, const Tensor& out) { + std::optional norm, const Tensor& out) { return native::fft_ihfftn_symint_out(self, s, dim, std::move(norm), out); } Tensor fft_ihfft2_symint(const Tensor& self, at::OptionalSymIntArrayRef s, - IntArrayRef dim, c10::optional norm) { + IntArrayRef dim, std::optional norm) { return native::fft_ihfftn_symint(self, s, dim, std::move(norm)); } @@ -716,10 +716,10 @@ Tensor& fft_fftfreq_out(int64_t n, double d, Tensor& out) { } Tensor fft_fftfreq(int64_t n, double d, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -737,10 +737,10 @@ Tensor& fft_rfftfreq_out(int64_t n, double d, Tensor& out) { } Tensor fft_rfftfreq(int64_t n, double d, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -824,7 +824,7 @@ static Stream& write_opt(Stream& SS, const optional& value) { * signals and complex windows. */ Tensor stft(const Tensor& self, const int64_t n_fft, const optional hop_lengthOpt, - const optional win_lengthOpt, const c10::optional& window_opt, + const optional win_lengthOpt, const std::optional& window_opt, const bool center, c10::string_view mode, const bool normalized, const optional onesidedOpt, const optional return_complexOpt) { // See [Note: hacky wrapper removal for optional tensor] @@ -980,7 +980,7 @@ Tensor stft(const Tensor& self, const int64_t n_fft, const optional hop Tensor stft( const Tensor& self, const int64_t n_fft, const optional hop_lengthOpt, - const optional win_lengthOpt, const c10::optional& window_opt, + const optional win_lengthOpt, const std::optional& window_opt, const bool normalized, const optional onesidedOpt, const optional return_complexOpt) { return at::stft( @@ -1011,8 +1011,8 @@ static Tensor as_complex(const Tensor& self) { * signals and complex windows. 
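The SpectralOps hunks above thread a `std::optional` norm string (previously c10::optional) through every fft/ifft/rfft variant; the dispatch itself is in the norm_from_string hunk earlier in this file. A self-contained sketch of that dispatch, with placeholder names since the real helper returns an ATen-internal fft_norm_mode enum and raises through TORCH_CHECK rather than an exception:

#include <optional>
#include <stdexcept>
#include <string>

enum class NormSketch { none, by_root_n, by_n };

// Sketch only: an absent norm string behaves like "backward"; "forward" and
// "backward" swap meaning depending on transform direction, "ortho" always
// scales by 1/sqrt(n).
NormSketch norm_from_string_sketch(std::optional<std::string> norm, bool forward) {
  if (!norm || *norm == "backward") {
    return forward ? NormSketch::none : NormSketch::by_n;
  }
  if (*norm == "forward") {
    return forward ? NormSketch::by_n : NormSketch::none;
  }
  if (*norm == "ortho") {
    return NormSketch::by_root_n;
  }
  throw std::invalid_argument("invalid norm value: " + *norm);
}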
*/ Tensor istft(const Tensor& self, const int64_t n_fft, const optional hop_lengthOpt, - const optional win_lengthOpt, const c10::optional& window_opt, - const bool center, const bool normalized, const c10::optional onesidedOpt, + const optional win_lengthOpt, const std::optional& window_opt, + const bool center, const bool normalized, const std::optional onesidedOpt, const optional lengthOpt, const bool return_complex) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned window_maybe_owned = at::borrow_from_optional_tensor(window_opt); diff --git a/aten/src/ATen/native/SummaryOps.cpp b/aten/src/ATen/native/SummaryOps.cpp index 4c158f81a47e9..1866f4353b535 100644 --- a/aten/src/ATen/native/SummaryOps.cpp +++ b/aten/src/ATen/native/SummaryOps.cpp @@ -68,7 +68,7 @@ Tensor _bincount_cpu_template( } // namespace Tensor -_bincount_cpu(const Tensor& self, const c10::optional& weights_opt, int64_t minlength) { +_bincount_cpu(const Tensor& self, const std::optional& weights_opt, int64_t minlength) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weights_maybe_owned = at::borrow_from_optional_tensor(weights_opt); const Tensor& weights = *weights_maybe_owned; diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/TensorAdvancedIndexing.cpp index f1e385d8eeac8..395af8e5ef139 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp +++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp @@ -190,8 +190,8 @@ void scatter_meta_impl( const Tensor& self, int64_t dim, const Tensor& index, - const c10::optional& src = nullopt, - const c10::optional reduce = nullopt) { + const std::optional& src = nullopt, + const std::optional reduce = nullopt) { int64_t wrapped_dim = at::maybe_wrap_dim(dim, self.dim()); at::native::scatter_gather_dtype_check("scatter", self, index, src); at::native::scatter_shape_check(self, wrapped_dim, index, src); @@ -629,7 +629,7 @@ TORCH_IMPL_FUNC(index_out) index_stub(device_type(), *this, sizes, strides); } -Tensor quantized_index(const Tensor & self, const torch::List>& indices) { +Tensor quantized_index(const Tensor & self, const torch::List>& indices) { TORCH_INTERNAL_ASSERT( self.qscheme() == c10::kPerTensorAffine || self.qscheme() == c10::kPerTensorSymmetric, @@ -643,7 +643,7 @@ Tensor quantized_index(const Tensor & self, const torch::List>& indices) { +Tensor _unsafe_index(const Tensor& self, const torch::List>& indices) { // Disallow boolean indexing since it leads to dynamic output shapes for (auto i : c10::irange(indices.size())) { auto index = indices.get(i); @@ -702,15 +702,15 @@ Tensor put(const Tensor & self, const Tensor& index, const Tensor & source, cons return self.clone(at::MemoryFormat::Preserve).put_(index, source, accumulate); } -Tensor index_put(const Tensor & self, const torch::List>& indices, const Tensor & value, bool accumulate) { +Tensor index_put(const Tensor & self, const torch::List>& indices, const Tensor & value, bool accumulate) { return self.clone(at::MemoryFormat::Preserve).index_put_(indices, value, accumulate); } -Tensor _unsafe_index_put(const Tensor& self, const torch::List>& indices, const Tensor& value, bool accumulate) { +Tensor _unsafe_index_put(const Tensor& self, const torch::List>& indices, const Tensor& value, bool accumulate) { return at::index_put(self, indices, value, accumulate); } -Tensor & _index_put_impl_(Tensor & self, const torch::List>& indices, const Tensor & value, const bool accumulate, const bool unsafe) { +Tensor & _index_put_impl_(Tensor & 
self, const torch::List>& indices, const Tensor & value, const bool accumulate, const bool unsafe) { TORCH_CHECK_INDEX(indices.size() <= (size_t)self.dim(), "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); if (at::has_internal_overlap(self) == MemOverlap::Yes) { TORCH_WARN( @@ -730,7 +730,7 @@ Tensor & _index_put_impl_(Tensor & self, const torch::List } at::assert_no_overlap(self, value); // NOLINTNEXTLINE(performance-implicit-conversion-in-loop) - for (const c10::optional& index: indices) { + for (const std::optional& index: indices) { if (index.has_value()) { at::assert_no_overlap(self, *index); } @@ -788,7 +788,7 @@ Tensor take(const Tensor& self, const Tensor& index) { return out; } -Tensor & index_put_(Tensor & self, const torch::List>& indices, const Tensor & value, const bool accumulate) { +Tensor & index_put_(Tensor & self, const torch::List>& indices, const Tensor & value, const bool accumulate) { return at::_index_put_impl_(self, indices, value, accumulate, /*unsafe=*/false); } @@ -798,7 +798,7 @@ TORCH_IMPL_FUNC(index_copy_out) // See Note [Enabling Deterministic Operations] if (result.is_cuda() && globalContext().deterministicAlgorithms()){ - torch::List> indices; + torch::List> indices; indices.reserve(dim + 1); for (const auto i: c10::irange(dim)) { (void)i; @@ -1624,7 +1624,7 @@ static void _scatter_via_index_put( const Tensor& mut_out, bool accumulate) { if (self.dim() == 1) { - torch::List> indices; + torch::List> indices; indices.reserve(1); indices.push_back(index); mut_out.index_put_(indices, src, accumulate); @@ -1698,7 +1698,7 @@ static void _scatter_via_index_put( src.strides() ).flatten(); - torch::List> indices; + torch::List> indices; indices.reserve(1); indices.push_back(index_flat); @@ -1719,7 +1719,7 @@ void scatter_impl( const Tensor& out, ReduceStub& reduce_stub, FillStub& fill_stub, - const c10::optional reduce = nullopt, + const std::optional reduce = nullopt, bool reduce_includes_self = true) { dim = at::maybe_wrap_dim(dim, self.dim()); @@ -2123,7 +2123,7 @@ static inline void checkDevice(CheckedFrom c, at::ArrayRef tensors, Devi } // anonymous namespace -Tensor take_along_dim(const Tensor& self, const Tensor& indices, c10::optional opt_dim) { +Tensor take_along_dim(const Tensor& self, const Tensor& indices, std::optional opt_dim) { checkDevice("torch.take_along_dim():", {self, indices}, self.device()); if (opt_dim.has_value()) { auto [self_broadcasted, indices_broadcasted, dim] = @@ -2135,7 +2135,7 @@ Tensor take_along_dim(const Tensor& self, const Tensor& indices, c10::optional opt_dim, Tensor& result) { +Tensor& take_along_dim_out(const Tensor& self, const Tensor& indices, std::optional opt_dim, Tensor& result) { checkDevice("torch.take_along_dim():", {self, indices, result}, self.device()); if (opt_dim.has_value()) { auto [self_broadcasted, indices_broadcasted, dim] = @@ -2241,7 +2241,7 @@ Tensor count_nonzero_cpu(const Tensor& self, IntArrayRef dims){ } -Tensor count_nonzero(const Tensor& self, c10::optional dim) { +Tensor count_nonzero(const Tensor& self, std::optional dim) { if (dim) { return at::count_nonzero(self, IntArrayRef{*dim}); } diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.h b/aten/src/ATen/native/TensorAdvancedIndexing.h index c1464092a8e28..7b02b4201ffaa 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.h +++ b/aten/src/ATen/native/TensorAdvancedIndexing.h @@ -13,8 +13,8 @@ struct TensorIterator; namespace at::native { -using index_put_with_sort_fn = void(*)(Tensor &, const 
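The index_put / _unsafe_index_put hunks above operate on a torch::List of optional Tensors, where an empty slot means "take the whole dimension". At the public C++ API level that corresponds, roughly, to torch::indexing; the sketch below is a hedged user-level illustration, not the internal dispatch path.

```cpp
#include <torch/torch.h>
#include <iostream>

int main() {
  using namespace torch::indexing;
  auto t   = torch::zeros({3, 4});
  auto col = torch::tensor({0, 2});   // int64 index tensor
  // Roughly t[:, [0, 2]] = 1.0 in Python: Slice() plays the role of the
  // "missing" optional entry, `col` that of a defined index tensor.
  t.index_put_({Slice(), col}, 1.0);
  std::cout << t << "\n";
}
```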
c10::List> &, const Tensor &, bool accumulate, bool unsafe); -using index_put_with_sort_quantized_fn = void(*)(Tensor& self, const c10::List>& indices, const Tensor& value, double scale, int zero_point, bool unsafe); +using index_put_with_sort_fn = void(*)(Tensor &, const c10::List> &, const Tensor &, bool accumulate, bool unsafe); +using index_put_with_sort_quantized_fn = void(*)(Tensor& self, const c10::List>& indices, const Tensor& value, double scale, int zero_point, bool unsafe); using gather_fn = void (*)(const Tensor & result, const Tensor & self, int64_t dim, const Tensor & index); using scatter_fn = void(*)(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src); using scatter_fill_fn = void(*)(const Tensor& self, int64_t dim, const Tensor& index, const Scalar& src); @@ -36,7 +36,7 @@ DECLARE_DISPATCH(scatter_reduce_fn, scatter_reduce_stub); DECLARE_DISPATCH(scatter_scalar_reduce_fn, scatter_scalar_reduce_stub); DECLARE_DISPATCH(scatter_reduce_two_fn, scatter_reduce_two_stub); -TORCH_API Tensor& index_out(Tensor& result, const Tensor & self, const c10::List>& indices); +TORCH_API Tensor& index_out(Tensor& result, const Tensor & self, const c10::List>& indices); using scatter_add_expanded_index_fn = void(*)(const Tensor&, const Tensor&, const Tensor&); using scatter_reduce_expanded_index_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const ReductionType& reduce, bool); diff --git a/aten/src/ATen/native/TensorAdvancedIndexingUtils.h b/aten/src/ATen/native/TensorAdvancedIndexingUtils.h index 7b9d1446a087b..e46be1f878f72 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexingUtils.h +++ b/aten/src/ATen/native/TensorAdvancedIndexingUtils.h @@ -21,7 +21,7 @@ static std::string shapes_as_str(TensorList tensors) { } } // anonymous namespace -static std::tuple canDispatchToMaskedFill(const Tensor& self, const torch::List>& indices, +static std::tuple canDispatchToMaskedFill(const Tensor& self, const torch::List>& indices, const Tensor& value){ if (!(value.numel() ==1 && value.device().is_cpu())){ return std::make_tuple(false,Tensor()); @@ -29,7 +29,7 @@ const Tensor& value){ int64_t num_ind = 0; Tensor mask; auto self_device = self.device(); - for (const c10::optional& i: indices) { + for (const std::optional& i: indices) { if (!i.has_value() || !(*i).defined()){ num_ind++; } else { diff --git a/aten/src/ATen/native/TensorCompare.cpp b/aten/src/ATen/native/TensorCompare.cpp index e9599b4898fcd..cbb79dfabc7eb 100644 --- a/aten/src/ATen/native/TensorCompare.cpp +++ b/aten/src/ATen/native/TensorCompare.cpp @@ -491,7 +491,7 @@ static void isin_sorting( if (assume_unique) { out.copy_(mask.slice(0, 0, elements.numel()).view_as(out)); } else { - out.copy_(at::index(mask, {c10::optional(unique_order)})); + out.copy_(at::index(mask, {std::optional(unique_order)})); } } @@ -746,27 +746,27 @@ TORCH_IMPL_FUNC(clamp_min_Tensor_out) } // Implements the "clip" alias for clamp -Tensor& clip_out(const Tensor& self, const c10::optional& min, const c10::optional& max, Tensor& result) { +Tensor& clip_out(const Tensor& self, const std::optional& min, const c10::optional& max, Tensor& result) { return at::clamp_outf(self, min, max, result); } -Tensor& clip_out(const Tensor& self, const c10::optional& min, const c10::optional& max, Tensor& result) { +Tensor& clip_out(const Tensor& self, const std::optional& min, const c10::optional& max, Tensor& result) { return at::clamp_outf(self, min, max, result); } -Tensor clip(const Tensor& self, const c10::optional& min, const c10::optional& 
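The clip hunks here only change the type of the optional bounds; behaviourally clip stays a thin alias that forwards to clamp, with min and max each independently optional. A short usage sketch of the public C++ API:

```cpp
#include <torch/torch.h>
#include <iostream>

int main() {
  auto t = torch::linspace(-2.0, 2.0, 5);
  auto a = torch::clamp(t, -1.0, 1.0);   // both bounds given
  auto b = torch::clip(t, -1.0, 1.0);    // alias of clamp, same result
  auto c = torch::clamp_max(t, 0.5);     // upper bound only
  std::cout << a << "\n" << b << "\n" << c << "\n";
}
```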
max) { +Tensor clip(const Tensor& self, const std::optional& min, const c10::optional& max) { return at::clamp(self, min, max); } -Tensor clip(const Tensor& self, const c10::optional& min, const c10::optional& max) { +Tensor clip(const Tensor& self, const std::optional& min, const c10::optional& max) { return at::clamp(self, min, max); } -Tensor& clip_(Tensor& self, const c10::optional& min, const c10::optional& max) { +Tensor& clip_(Tensor& self, const std::optional& min, const c10::optional& max) { return at::clamp_(self, min, max); } -Tensor& clip_(Tensor& self, const c10::optional& min, const c10::optional& max) { +Tensor& clip_(Tensor& self, const std::optional& min, const c10::optional& max) { return at::clamp_(self, min, max); } diff --git a/aten/src/ATen/native/TensorConversions.cpp b/aten/src/ATen/native/TensorConversions.cpp index c70da8334a5e9..dfb0fe4eb0a05 100644 --- a/aten/src/ATen/native/TensorConversions.cpp +++ b/aten/src/ATen/native/TensorConversions.cpp @@ -229,12 +229,12 @@ static inline optional ensure_has_index(optional device) { Tensor _to_copy( const Tensor& self, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, bool non_blocking, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { TORCH_CHECK(!layout.has_value() || self.layout() == layout.value(), "to(options) doesn't support converting to a different layout, " "but got self.layout being ", self.layout(), @@ -387,7 +387,7 @@ Tensor _to_copy( } template -static inline bool is_null_or_equal_to(const c10::optional& test, const T& value) { +static inline bool is_null_or_equal_to(const std::optional& test, const T& value) { if (!test.has_value()) { return true; } @@ -399,11 +399,11 @@ static inline bool is_null_or_equal_to(const c10::optional& test, const T& va // well. 
bool to_will_alias( const Tensor& self, - c10::optional dtype, - c10::optional layout, - c10::optional device, + std::optional dtype, + std::optional layout, + std::optional device, bool copy, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { auto memory_format = optional_memory_format.value_or(MemoryFormat::Preserve); return is_null_or_equal_to(dtype, self.dtype().toScalarType()) && @@ -416,13 +416,13 @@ bool to_will_alias( static inline Tensor to_impl( const Tensor& self, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, bool non_blocking, bool copy, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { // fast path if (to_will_alias(self, dtype, layout, device, copy, optional_memory_format)) { @@ -471,13 +471,13 @@ Tensor _autocast_to_full_precision(const Tensor& self, bool cuda_enabled, bool c Tensor to( const Tensor& self, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, bool non_blocking, bool copy, - c10::optional optional_memory_format + std::optional optional_memory_format ) { return to_impl( self, @@ -490,7 +490,7 @@ Tensor to( optional_memory_format); } -Tensor to(const Tensor& self, Device device, ScalarType dtype, bool non_blocking, bool copy, c10::optional optional_memory_format) { +Tensor to(const Tensor& self, Device device, ScalarType dtype, bool non_blocking, bool copy, std::optional optional_memory_format) { return to_impl( self, dtype, @@ -502,7 +502,7 @@ Tensor to(const Tensor& self, Device device, ScalarType dtype, bool non_blocking optional_memory_format); } -Tensor to(const Tensor& self, ScalarType dtype, bool non_blocking, bool copy, c10::optional optional_memory_format) { +Tensor to(const Tensor& self, ScalarType dtype, bool non_blocking, bool copy, std::optional optional_memory_format) { return to_impl( self, dtype, @@ -514,7 +514,7 @@ Tensor to(const Tensor& self, ScalarType dtype, bool non_blocking, bool copy, c1 optional_memory_format); } -Tensor to(const Tensor& self, const Tensor& other, bool non_blocking, bool copy, c10::optional optional_memory_format) { +Tensor to(const Tensor& self, const Tensor& other, bool non_blocking, bool copy, std::optional optional_memory_format) { auto options = other.options(); return to_impl( self, @@ -538,7 +538,7 @@ std::vector _to_cpu(TensorList tensors) { return cpu_tensors; } -Tensor to_dense_backward(const Tensor& grad, const Tensor& input_, c10::optional masked_grad_) { +Tensor to_dense_backward(const Tensor& grad, const Tensor& input_, std::optional masked_grad_) { /* For historical reasons, to_dense backward implements masked semantics for sparse tensors, that is, gradients with respect to @@ -598,7 +598,7 @@ Tensor to_mkldnn_backward(const Tensor& grad, const Tensor& input_) { return grad.to_dense(input_.scalar_type()); } -Tensor to_dense(const Tensor& tensor, c10::optional dtype, c10::optional masked_grad) { +Tensor to_dense(const Tensor& tensor, std::optional dtype, c10::optional masked_grad) { if (tensor.layout() == c10::kSparse) { return tensor._to_dense(dtype, masked_grad); } @@ -621,7 +621,7 @@ Tensor to_dense(const Tensor& tensor, c10::optional dtype, c10: return tensor; } -Tensor sparse_to_dense(const Tensor& self, c10::optional dtype, c10::optional 
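The _to_copy / to_will_alias / to_impl hunks above encode a fast path: when dtype, layout, device and memory format already match and copy is false, to() hands back self instead of materializing a new tensor. A hedged sketch of the behaviour as observed from the public API:

```cpp
#include <torch/torch.h>
#include <iostream>

int main() {
  auto t    = torch::arange(4, torch::kFloat32);
  auto same = t.to(torch::kFloat32);      // nothing to convert: may alias t
  auto copy = t.to(torch::kFloat32, /*non_blocking=*/false, /*copy=*/true);
  std::cout << std::boolalpha
            << (same.data_ptr() == t.data_ptr()) << "\n"   // true: aliased
            << (copy.data_ptr() == t.data_ptr()) << "\n";  // false: fresh storage
}
```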
masked) { +Tensor sparse_to_dense(const Tensor& self, std::optional dtype, c10::optional masked) { TORCH_CHECK( !dtype.has_value(), "dtype argument is not supported by sparse_to_dense"); Tensor dst = at::zeros(self.sizes(), self.options().layout(kStrided)); @@ -630,8 +630,8 @@ Tensor sparse_to_dense(const Tensor& self, c10::optional dtype, c10: Tensor sparse_compressed_to_dense( const Tensor& self, - c10::optional dtype, - c10::optional masked_grad) { + std::optional dtype, + std::optional masked_grad) { TORCH_CHECK( !dtype.has_value(), "dtype argument is not supported by sparse_csr_to_dense"); @@ -954,7 +954,7 @@ void _to_sparse_check_arguments(const std::string& funcname, const Tensor& self, } static inline -void _to_sparse_check_arguments(const std::string& funcname, const Tensor& self, c10::optional layout, OptionalIntArrayRef blocksize, c10::optional dense_dim_opt) { +void _to_sparse_check_arguments(const std::string& funcname, const Tensor& self, std::optional layout, OptionalIntArrayRef blocksize, c10::optional dense_dim_opt) { auto layout_from = self.layout(); auto layout_to = layout.value_or(kSparse); @@ -1036,7 +1036,7 @@ void _to_sparse_check_arguments(const std::string& funcname, const Tensor& self, } template -static Tensor dense_to_sparse_compressed(const Tensor& self, const Tensor& self_mask, IntArrayRef blocksize, c10::optional dense_dim_opt) { +static Tensor dense_to_sparse_compressed(const Tensor& self, const Tensor& self_mask, IntArrayRef blocksize, std::optional dense_dim_opt) { static_assert(target_layout == Layout::SparseCsr || target_layout == Layout::SparseCsc || target_layout == Layout::SparseBsr || target_layout == Layout::SparseBsc, "invalid layout template parameter for dense_to_sparse_compressed"); @@ -1109,7 +1109,7 @@ static Tensor dense_to_sparse_compressed(const Tensor& self, const Tensor& self_ self.options().layout(target_layout)); } -Tensor dense_to_sparse_with_mask(const Tensor& self, const Tensor& mask, c10::optional layout, OptionalIntArrayRef blocksize, c10::optional dense_dim_opt) { +Tensor dense_to_sparse_with_mask(const Tensor& self, const Tensor& mask, std::optional layout, OptionalIntArrayRef blocksize, c10::optional dense_dim_opt) { auto layout_to = layout.value_or(kSparse); TORCH_INTERNAL_ASSERT(self.layout() != layout_to, "dense_to_sparse: unexpected same input and output layout"); TORCH_INTERNAL_ASSERT(self.layout() == mask.layout(), @@ -1137,35 +1137,35 @@ Tensor dense_to_sparse_with_mask(const Tensor& self, const Tensor& mask, c10::op return Tensor{}; } -Tensor dense_to_sparse_csr(const Tensor& self, c10::optional dense_dim_opt) { +Tensor dense_to_sparse_csr(const Tensor& self, std::optional dense_dim_opt) { auto layout_to = kSparseCsr; _to_sparse_check_arguments("dense_to_sparse_csr", self, layout_to, {}, dense_dim_opt); return dense_to_sparse_compressed(self, self != 0, {}, dense_dim_opt); } -Tensor dense_to_sparse_csc(const Tensor& self, c10::optional dense_dim_opt) { +Tensor dense_to_sparse_csc(const Tensor& self, std::optional dense_dim_opt) { auto layout_to = kSparseCsc; _to_sparse_check_arguments("dense_to_sparse_csc", self, layout_to, {}, dense_dim_opt); return dense_to_sparse_compressed(self, self != 0, {}, dense_dim_opt); } -Tensor dense_to_sparse_bsr(const Tensor& self, IntArrayRef blocksize, c10::optional dense_dim_opt) { +Tensor dense_to_sparse_bsr(const Tensor& self, IntArrayRef blocksize, std::optional dense_dim_opt) { auto layout_to = kSparseBsr; _to_sparse_check_arguments("dense_to_sparse_bsr", self, layout_to, blocksize, 
dense_dim_opt); return dense_to_sparse_compressed(self, self != 0, blocksize, dense_dim_opt); } -Tensor dense_to_sparse_bsc(const Tensor& self, IntArrayRef blocksize, c10::optional dense_dim_opt) { +Tensor dense_to_sparse_bsc(const Tensor& self, IntArrayRef blocksize, std::optional dense_dim_opt) { auto layout_to = kSparseBsc; _to_sparse_check_arguments("dense_to_sparse_bsc", self, layout_to, blocksize, dense_dim_opt); return dense_to_sparse_compressed(self, self != 0, blocksize, dense_dim_opt); } -Tensor dense_to_sparse(const Tensor& self, c10::optional layout, OptionalIntArrayRef blocksize, c10::optional dense_dim_opt) { +Tensor dense_to_sparse(const Tensor& self, std::optional layout, OptionalIntArrayRef blocksize, c10::optional dense_dim_opt) { auto layout_to = layout.value_or(kSparse); TORCH_INTERNAL_ASSERT(self.layout() != layout_to, "dense_to_sparse: unexpected same input and output layout"); _to_sparse_check_arguments("dense_to_sparse", self, layout, blocksize, dense_dim_opt); @@ -1234,7 +1234,7 @@ Tensor dense_to_sparse(const Tensor& self, int64_t sparse_dim) { static Tensor sparse_compressed_to_flipped( const Tensor& self, - c10::optional blocksize, + std::optional blocksize, const std::string& name) { const auto layout = self.layout(); // NOTE: errors on non-compressed sparse layouts. @@ -1435,7 +1435,7 @@ static Tensor sparse_compressed_to_flipped( self.options().layout(flipped_layout)); } -Tensor sparse_compressed_to_sparse_csr(const Tensor& self, c10::optional dense_dim_opt) { +Tensor sparse_compressed_to_sparse_csr(const Tensor& self, std::optional dense_dim_opt) { auto layout_to = kSparseCsr; TORCH_INTERNAL_ASSERT(self.layout() != layout_to, "sparse_compressed_to_sparse_csr: unexpected same input and output layout"); _to_sparse_check_arguments("sparse_compressed_to_sparse_csr", self, layout_to, {}, dense_dim_opt); @@ -1448,7 +1448,7 @@ Tensor sparse_compressed_to_sparse_csr(const Tensor& self, c10::optional dense_dim_opt) { +Tensor sparse_compressed_to_sparse_csc(const Tensor& self, std::optional dense_dim_opt) { auto layout_to = kSparseCsc; TORCH_INTERNAL_ASSERT(self.layout() != layout_to, "sparse_compressed_to_sparse_csc: unexpected same input and output layout"); _to_sparse_check_arguments("sparse_compressed_to_sparse_csc", self, layout_to, {}, dense_dim_opt); @@ -1461,7 +1461,7 @@ Tensor sparse_compressed_to_sparse_csc(const Tensor& self, c10::optional dense_dim_opt) { +Tensor coo_to_sparse_csr(const Tensor& self, std::optional dense_dim_opt) { auto layout_to = kSparseCsr; _to_sparse_check_arguments("coo_to_sparse_csr", self, layout_to, {}, dense_dim_opt); @@ -1480,7 +1480,7 @@ Tensor coo_to_sparse_csr(const Tensor& self, c10::optional dense_dim_op coalesced_self.device()); } -Tensor coo_to_sparse_csc(const Tensor& self, c10::optional dense_dim_opt) { +Tensor coo_to_sparse_csc(const Tensor& self, std::optional dense_dim_opt) { auto layout_to = kSparseCsc; _to_sparse_check_arguments("coo_to_sparse_csc", self, layout_to, {}, dense_dim_opt); @@ -1495,14 +1495,14 @@ Tensor coo_to_sparse_csc(const Tensor& self, c10::optional dense_dim_op transposed_csr.device()); } -Tensor coo_to_sparse_bsr(const Tensor& self, IntArrayRef blocksize, c10::optional dense_dim_opt) { +Tensor coo_to_sparse_bsr(const Tensor& self, IntArrayRef blocksize, std::optional dense_dim_opt) { auto layout_to = kSparseBsr; _to_sparse_check_arguments("coo_to_sparse_bsr", self, layout_to, blocksize, dense_dim_opt); return self.to_sparse_csr(dense_dim_opt).to_sparse_bsr(blocksize); } -Tensor 
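The dense_to_sparse_* and coo_to_sparse_* kernels above back the public layout-conversion methods. A hedged round-trip sketch at the user level:

```cpp
#include <torch/torch.h>
#include <iostream>

int main() {
  auto dense = torch::eye(3);            // mostly zeros, a natural sparse candidate
  auto coo   = dense.to_sparse();        // COO layout
  auto csr   = dense.to_sparse_csr();    // compressed sparse row layout
  std::cout << coo.layout() << " " << csr.layout() << "\n";
  std::cout << csr.to_dense().equal(dense) << "\n";  // 1: lossless round trip
}
```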
coo_to_sparse_bsc(const Tensor& self, IntArrayRef blocksize, c10::optional dense_dim_opt) { +Tensor coo_to_sparse_bsc(const Tensor& self, IntArrayRef blocksize, std::optional dense_dim_opt) { auto layout_to = kSparseBsc; _to_sparse_check_arguments("coo_to_sparse_bsc", self, layout_to, blocksize, dense_dim_opt); @@ -1814,7 +1814,7 @@ Tensor _compressed_to_block_compressed_cpu(const Tensor& self, IntArrayRef block self.options().layout(target_layout)); } -Tensor sparse_compressed_to_sparse_bsr(const Tensor& self, IntArrayRef blocksize, c10::optional dense_dim_opt) { +Tensor sparse_compressed_to_sparse_bsr(const Tensor& self, IntArrayRef blocksize, std::optional dense_dim_opt) { auto layout_to = kSparseBsr; TORCH_INTERNAL_ASSERT(self.layout() != layout_to, "sparse_compressed_to_sparse_bsr: unexpected same input and output layout"); _to_sparse_check_arguments("sparse_compressed_to_sparse_bsr", self, layout_to, blocksize, dense_dim_opt); @@ -1836,7 +1836,7 @@ Tensor sparse_compressed_to_sparse_bsr(const Tensor& self, IntArrayRef blocksize return Tensor{}; } -Tensor sparse_compressed_to_sparse_bsc(const Tensor& self, IntArrayRef blocksize, c10::optional dense_dim_opt) { +Tensor sparse_compressed_to_sparse_bsc(const Tensor& self, IntArrayRef blocksize, std::optional dense_dim_opt) { auto layout_to = kSparseBsc; TORCH_INTERNAL_ASSERT(self.layout() != layout_to, "sparse_compressed_to_sparse_bsc: unexpected same input and output layout"); _to_sparse_check_arguments("sparse_compressed_to_sparse_bsc", self, layout_to, blocksize, dense_dim_opt); @@ -1909,7 +1909,7 @@ Tensor sparse_compressed_to_sparse(const Tensor& self, const int64_t sparse_dim) return at::native::_sparse_coo_tensor_unsafe(indices, values, self.sizes())._coalesced_(coalesced); } -Tensor sparse_compressed_to_sparse(const Tensor& self, c10::optional layout, OptionalIntArrayRef blocksize, c10::optional dense_dim_opt) { +Tensor sparse_compressed_to_sparse(const Tensor& self, std::optional layout, OptionalIntArrayRef blocksize, c10::optional dense_dim_opt) { auto layout_to = layout.value_or(kSparse); TORCH_INTERNAL_ASSERT(self.layout() != layout_to, "sparse_compressed_to_sparse: unexpected same input and output layout"); _to_sparse_check_arguments("sparse_compressed_to_sparse", self, layout_to, blocksize, dense_dim_opt); @@ -1936,7 +1936,7 @@ Tensor sparse_compressed_to_sparse(const Tensor& self, c10::optional layout, OptionalIntArrayRef blocksize, c10::optional dense_dim_opt) { +Tensor sparse_coo_to_sparse(const Tensor& self, std::optional layout, OptionalIntArrayRef blocksize, c10::optional dense_dim_opt) { auto layout_to = layout.value_or(kSparse); TORCH_INTERNAL_ASSERT(self.layout() != layout_to, "sparse_coo_to_sparse: unexpected same input and output layout"); _to_sparse_check_arguments("sparse_coo_to_sparse", self, layout_to, blocksize, dense_dim_opt); @@ -1969,7 +1969,7 @@ Tensor to_sparse(const Tensor& self, const int64_t sparse_dim) { return self._to_sparse(sparse_dim); } -Tensor to_sparse(const Tensor& self, c10::optional layout, OptionalIntArrayRef blocksize, c10::optional dense_dim_opt) { +Tensor to_sparse(const Tensor& self, std::optional layout, OptionalIntArrayRef blocksize, c10::optional dense_dim_opt) { auto layout_to = layout.value_or(kSparse); if (self.layout() == layout_to) { _to_sparse_check_arguments("to_sparse", self, layout, blocksize, dense_dim_opt); @@ -1978,7 +1978,7 @@ Tensor to_sparse(const Tensor& self, c10::optional layout, Optional return self._to_sparse(layout, blocksize, dense_dim_opt); } -Tensor 
to_sparse_csr(const Tensor& self, c10::optional dense_dim_opt) { +Tensor to_sparse_csr(const Tensor& self, std::optional dense_dim_opt) { auto layout_to = kSparseCsr; if (self.layout() == layout_to) { _to_sparse_check_arguments("to_sparse_csr", self, layout_to, {}, dense_dim_opt); @@ -1987,7 +1987,7 @@ Tensor to_sparse_csr(const Tensor& self, c10::optional dense_dim_opt) { return self._to_sparse_csr(dense_dim_opt); } -Tensor to_sparse_csc(const Tensor& self, c10::optional dense_dim_opt) { +Tensor to_sparse_csc(const Tensor& self, std::optional dense_dim_opt) { auto layout_to = kSparseCsc; if (self.layout() == layout_to) { _to_sparse_check_arguments("to_sparse_csc", self, layout_to, {}, dense_dim_opt); @@ -1996,7 +1996,7 @@ Tensor to_sparse_csc(const Tensor& self, c10::optional dense_dim_opt) { return self._to_sparse_csc(dense_dim_opt); } -Tensor to_sparse_bsr(const Tensor& self, IntArrayRef blocksize, c10::optional dense_dim_opt) { +Tensor to_sparse_bsr(const Tensor& self, IntArrayRef blocksize, std::optional dense_dim_opt) { auto layout_to = kSparseBsr; if (self.layout() == layout_to) { _to_sparse_check_arguments("to_sparse_bsr", self, layout_to, blocksize, dense_dim_opt); @@ -2005,7 +2005,7 @@ Tensor to_sparse_bsr(const Tensor& self, IntArrayRef blocksize, c10::optional dense_dim_opt) { +Tensor to_sparse_bsc(const Tensor& self, IntArrayRef blocksize, std::optional dense_dim_opt) { auto layout_to = kSparseBsc; if (self.layout() == layout_to) { _to_sparse_check_arguments("to_sparse_bsc", self, layout_to, blocksize, dense_dim_opt); @@ -2026,7 +2026,7 @@ Tensor to_meta(const Tensor& tensor) { } return out; } -c10::optional to_meta(const c10::optional& tensor) { +std::optional to_meta(const c10::optional& tensor) { if (tensor.has_value()) { return to_meta(*tensor); } diff --git a/aten/src/ATen/native/TensorConversions.h b/aten/src/ATen/native/TensorConversions.h index fa0d58f3c1299..0e2fd30c288ce 100644 --- a/aten/src/ATen/native/TensorConversions.h +++ b/aten/src/ATen/native/TensorConversions.h @@ -11,16 +11,16 @@ namespace at { namespace native { bool to_will_alias( const Tensor& self, - c10::optional dtype, - c10::optional layout, - c10::optional device, + std::optional dtype, + std::optional layout, + std::optional device, bool copy, - c10::optional optional_memory_format); + std::optional optional_memory_format); Tensor to_meta(const Tensor& tensor); -c10::optional to_meta(const c10::optional& tensor); +std::optional to_meta(const c10::optional& tensor); std::vector to_meta(at::ITensorListRef t_list); -Tensor dense_to_sparse_with_mask(const Tensor& self, const Tensor& mask, c10::optional layout, OptionalIntArrayRef blocksize, c10::optional dense_dim_opt); +Tensor dense_to_sparse_with_mask(const Tensor& self, const Tensor& mask, std::optional layout, OptionalIntArrayRef blocksize, c10::optional dense_dim_opt); } // namespace native } // namespace at diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index c8fddc3756353..195a792600f9b 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -133,18 +133,18 @@ DEFINE_DISPATCH(polar_stub); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ arange ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tensor arange(const Scalar& end, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return native::arange(/*start=*/0, end, dtype, 
layout, device, pin_memory); } Tensor arange(const Scalar& start, const Scalar& end, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return native::arange( start, end, /*step=*/1, dtype, layout, device, pin_memory); } @@ -153,10 +153,10 @@ Tensor arange( const Scalar& start, const Scalar& end, const Scalar& step, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -252,8 +252,8 @@ Tensor polar(const Tensor& abs, const Tensor& angle) { } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ empty ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Tensor empty_cpu(IntArrayRef size, c10::optional dtype_opt, c10::optional layout_opt, - c10::optional device_opt, c10::optional pin_memory_opt, c10::optional memory_format_opt) { +Tensor empty_cpu(IntArrayRef size, std::optional dtype_opt, c10::optional layout_opt, + std::optional device_opt, c10::optional pin_memory_opt, c10::optional memory_format_opt) { Tensor result = at::detail::empty_cpu(size, dtype_opt, layout_opt, device_opt, pin_memory_opt, memory_format_opt); // See Note [Enabling Deterministic Operations] if (C10_UNLIKELY(at::globalContext().deterministicAlgorithms() && at::globalContext().deterministicFillUninitializedMemory())) { @@ -264,11 +264,11 @@ Tensor empty_cpu(IntArrayRef size, c10::optional dtype_opt, c10::opt Tensor empty_names( IntArrayRef size, - c10::optional names, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, + std::optional names, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, optional optional_memory_format) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -285,8 +285,8 @@ Tensor empty_names( return result; } -Tensor empty_permuted_symint(SymIntArrayRef size, IntArrayRef physical_layout, c10::optional dtype_opt, - c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt +Tensor empty_permuted_symint(SymIntArrayRef size, IntArrayRef physical_layout, std::optional dtype_opt, + std::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt ) { // size is logical; aka, the output size you'll get from the operation overall // @@ -324,8 +324,8 @@ Tensor empty_permuted_symint(SymIntArrayRef size, IntArrayRef physical_layout, c return phys_tensor.as_strided_symint(size, strides); } -Tensor empty_strided_cpu(IntArrayRef size, IntArrayRef stride, c10::optional dtype_opt, - c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) { +Tensor empty_strided_cpu(IntArrayRef size, IntArrayRef stride, std::optional dtype_opt, + std::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) { Tensor result = at::detail::empty_strided_cpu(size, stride, dtype_opt, layout_opt, device_opt, pin_memory_opt); // See Note [Enabling Deterministic Operations] if (C10_UNLIKELY(at::globalContext().deterministicAlgorithms() && at::globalContext().deterministicFillUninitializedMemory())) { @@ -335,7 +335,7 @@ 
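The arange hunks above show the "[Note: hacky wrapper removal for TensorOptions]" pattern that recurs through the rest of TensorFactories.cpp: the dispatcher hands the kernel four separate optionals, and the kernel folds them back into a TensorOptions through the optional-accepting setters. The sketch below mirrors that shape; filled_like is a made-up name, and it assumes a libtorch build in which c10::optional is already an alias of std::optional (the direction this patch series is moving in), so std::optional arguments bind to the existing setters.

```cpp
#include <torch/torch.h>
#include <iostream>
#include <optional>

// Hypothetical helper mirroring the factory-kernel calling convention above.
torch::Tensor filled_like(const torch::Tensor& self, double fill,
                          std::optional<at::ScalarType> dtype,
                          std::optional<at::Layout> layout,
                          std::optional<at::Device> device,
                          std::optional<bool> pin_memory) {
  // Fold the four optionals back into a TensorOptions, as the kernels do;
  // fields left unset fall back to self's options via merge_in.
  auto opts = torch::TensorOptions()
                  .dtype(dtype)
                  .layout(layout)
                  .device(device)
                  .pinned_memory(pin_memory);
  return torch::full_like(self, fill, self.options().merge_in(opts));
}

int main() {
  auto x = torch::ones({2, 2});
  auto y = filled_like(x, 3.0, torch::kFloat64,
                       std::nullopt, std::nullopt, std::nullopt);
  std::cout << y << "\n";
}
```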
Tensor empty_strided_cpu(IntArrayRef size, IntArrayRef stride, c10::optional optional_memory_format, + std::optional optional_memory_format, Tensor& result) { // Preferably, this argument would not be accepted by _out, but the code // generator requires the out and non-out overloads to match exactly @@ -377,11 +377,11 @@ C10_DIAGNOSTIC_POP() Tensor empty_like( const Tensor& self, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional optional_memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional optional_memory_format) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options_ = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -430,11 +430,11 @@ Tensor empty_like( Tensor empty_like_quantized( const Tensor& self, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional optional_memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional optional_memory_format) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options_ = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -458,7 +458,7 @@ Tensor empty_like_quantized( // TODO: To support all features of MemoryFormat::Preserve we need to add // _empty_affine_quantized_strided function and use it similarly to - // Tensor clone(const Tensor& src, c10::optional optional_memory_format) + // Tensor clone(const Tensor& src, std::optional optional_memory_format) // if (self.is_non_overlapping_and_dense()) -> _empty_affine_quantized_strided if (memory_format == MemoryFormat::Preserve) { memory_format = self.suggest_memory_format(); @@ -508,10 +508,10 @@ Tensor empty_like_quantized( Tensor new_empty_symint( const Tensor& self, SymIntArrayRef size, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt ) { auto dtype = dtype_opt.has_value() ? dtype_opt : optTypeMetaToScalarType(self.options().dtype_opt()); auto layout = layout_opt.has_value() ? 
layout_opt : self.options().layout_opt(); @@ -524,10 +524,10 @@ Tensor new_empty_strided_symint( const Tensor& self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory ) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -538,19 +538,19 @@ Tensor new_empty_strided_symint( // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ eye ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tensor eye(int64_t n, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // the default value of `m` equals to `n` return at::eye(n, n, dtype, layout, device, pin_memory); } Tensor eye(int64_t n, int64_t m, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -614,10 +614,10 @@ TensorOptions infer_full_options( } // anonymous namespace Tensor full(IntArrayRef size, const Scalar& fill_value, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -639,11 +639,11 @@ Tensor& full_out(IntArrayRef size, const Scalar& fill_value, Tensor& result) { Tensor full_like( const Tensor& self, const Scalar& fill_value, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional optional_memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional optional_memory_format) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -655,10 +655,10 @@ Tensor new_full( const Tensor& self, IntArrayRef size, const Scalar& fill_value, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory ) { Tensor r = self.new_empty(size, TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory)); @@ -693,10 +693,10 @@ Tensor linspace( const Scalar& start, const Scalar& end, int64_t steps, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -710,10 +710,10 @@ Tensor linspace( const Tensor& start, const Tensor& end, int64_t steps, - c10::optional dtype, - c10::optional layout, - c10::optional device, 
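new_empty_symint and new_full above fill each unset optional from self's options, which is why the new_* methods inherit dtype and device by default unless the caller overrides them. Observable from the public API:

```cpp
#include <torch/torch.h>
#include <iostream>

int main() {
  auto self = torch::zeros({2, 2}, torch::kFloat64);
  auto a = self.new_full({3}, 7);                   // inherits Float64 from self
  auto b = self.new_full({3}, 7, torch::kInt32);    // explicit dtype wins
  std::cout << a.scalar_type() << " " << b.scalar_type() << "\n";  // Double Int
}
```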
- c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { TORCH_CHECK(start.dim() == 0 && end.dim() == 0, "linspace only supports 0-dimensional start and end tensors, " "but got start with ", start.dim(), " dimension(s) and end with ", end.dim()," dimension(s)."); return at::linspace(start.item(), end.item(), steps, dtype, layout, device, pin_memory); @@ -723,10 +723,10 @@ Tensor linspace( const Tensor& start, const Scalar& end, int64_t steps, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { TORCH_CHECK(start.dim() == 0, "linspace only supports 0-dimensional start and end tensors, " "but got start with ", start.dim(), " dimension(s)."); return at::linspace(start.item(), end, steps, dtype, layout, device, pin_memory); @@ -736,10 +736,10 @@ Tensor linspace( const Scalar& start, const Tensor& end, int64_t steps, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { TORCH_CHECK(end.dim() == 0, "linspace only supports 0-dimensional start and end tensors, " "but got end with ", end.dim()," dimension(s)."); return at::linspace(start, end.item(), steps, dtype, layout, device, pin_memory); @@ -752,10 +752,10 @@ Tensor logspace( const Scalar& end, int64_t steps, double base, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -770,10 +770,10 @@ Tensor logspace( const Tensor& end, int64_t steps, double base, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { TORCH_CHECK(start.dim() == 0 && end.dim() == 0, "logspace only supports 0-dimensional start and end tensors, " "but got start with ", start.dim(), " dimension(s) and end with ", end.dim()," dimension(s)."); return at::logspace(start.item(), end.item(), steps, base, dtype, layout, device, pin_memory); @@ -784,10 +784,10 @@ Tensor logspace( const Scalar& end, int64_t steps, double base, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { TORCH_CHECK(start.dim() == 0, "logspace only supports 0-dimensional start and end tensors, " "but got start with ", start.dim(), " dimension(s)."); return at::logspace(start.item(), end, steps, base, dtype, layout, device, pin_memory); @@ -798,10 +798,10 @@ Tensor logspace( const Tensor& end, int64_t steps, double base, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { TORCH_CHECK(end.dim() == 0, "logspace only supports 0-dimensional start and end tensors, " "but got end with ", end.dim()," dimension(s)."); return at::logspace(start, end.item(), steps, base, dtype, layout, device, 
pin_memory); @@ -810,10 +810,10 @@ Tensor logspace( // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ones ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tensor ones(IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return native::full(size, /*fill_value=*/1., dtype, layout, device, pin_memory); } @@ -823,11 +823,11 @@ Tensor& ones_out(IntArrayRef size, Tensor& result) { Tensor ones_like( const Tensor& self, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional optional_memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional optional_memory_format) { auto result = at::empty_like(self, dtype, layout, device, pin_memory, optional_memory_format); return result.fill_(1.); } @@ -835,10 +835,10 @@ Tensor ones_like( Tensor new_ones( const Tensor& self, IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] Tensor r = self.new_empty(size, TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory)); r.fill_(1.); @@ -848,10 +848,10 @@ Tensor new_ones( // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ scalar_tensor ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tensor scalar_tensor(const Scalar& s, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -874,18 +874,18 @@ Tensor scalar_tensor(const Scalar& s, // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ rand ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tensor rand(IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { - return native::rand(size, static_cast>(c10::nullopt), dtype, layout, device, pin_memory); -} - -Tensor rand(IntArrayRef size, c10::optional generator, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { + return native::rand(size, static_cast>(c10::nullopt), dtype, layout, device, pin_memory); +} + +Tensor rand(IntArrayRef size, std::optional generator, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -897,18 +897,18 @@ Tensor& rand_out(IntArrayRef size, Tensor& result) { return native::rand_out(size, c10::nullopt, result); } -Tensor& rand_out(IntArrayRef size, c10::optional generator, Tensor& result) { +Tensor& rand_out(IntArrayRef size, std::optional generator, Tensor& result) { result.resize_(size); return result.uniform_(0, 1, std::move(generator)); } Tensor rand_like( const Tensor& self, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional 
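The rand overloads above differ only in whether an explicit Generator is threaded through; when the optional is left as nullopt, the global default generator is used, which is what torch::manual_seed seeds. A short reproducibility sketch:

```cpp
#include <torch/torch.h>
#include <iostream>

int main() {
  torch::manual_seed(42);
  auto a = torch::rand({2, 2});   // generator optional unset -> default generator
  torch::manual_seed(42);
  auto b = torch::rand({2, 2});
  std::cout << torch::allclose(a, b) << "\n";  // 1: same seed, same stream
}
```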
optional_memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional optional_memory_format) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -919,21 +919,21 @@ Tensor rand_like( // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ randint ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tensor randint(int64_t high, IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return native::randint(high, size, c10::nullopt /* generator*/, dtype, layout, device, pin_memory); } Tensor randint( int64_t high, IntArrayRef size, - c10::optional generator, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional generator, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return native::randint(0, high, size, std::move(generator), dtype, layout, device, pin_memory); } @@ -941,10 +941,10 @@ Tensor randint( int64_t low, int64_t high, IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return native::randint(low, high, size, c10::nullopt, dtype, layout, device, pin_memory); } @@ -952,11 +952,11 @@ Tensor randint( int64_t low, int64_t high, IntArrayRef size, - c10::optional generator, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional generator, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -970,7 +970,7 @@ Tensor& randint_out(int64_t high, IntArrayRef size, Tensor& result) { Tensor& randint_out(int64_t high, IntArrayRef size, - c10::optional generator, + std::optional generator, Tensor& result) { result.resize_(size); return result.random_(0, high, std::move(generator)); @@ -983,7 +983,7 @@ Tensor& randint_out(int64_t low, int64_t high, IntArrayRef size, Tensor& result) Tensor& randint_out(int64_t low, int64_t high, IntArrayRef size, - c10::optional generator, + std::optional generator, Tensor& result) { result.resize_(size); return result.random_(low, high, std::move(generator)); @@ -992,11 +992,11 @@ Tensor& randint_out(int64_t low, Tensor randint_like( const Tensor& self, int64_t high, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional optional_memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional optional_memory_format) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -1008,11 +1008,11 @@ Tensor randint_like( const Tensor& self, int64_t low, int64_t high, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional optional_memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + 
std::optional pin_memory, + std::optional optional_memory_format) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -1023,18 +1023,18 @@ Tensor randint_like( // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ randn ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tensor randn(IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { - return native::randn(size, static_cast>(c10::nullopt), dtype, layout, device, pin_memory); -} - -Tensor randn(IntArrayRef size, c10::optional generator, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { + return native::randn(size, static_cast>(c10::nullopt), dtype, layout, device, pin_memory); +} + +Tensor randn(IntArrayRef size, std::optional generator, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -1046,17 +1046,17 @@ Tensor& randn_out(IntArrayRef size, Tensor& result) { return native::randn_out(size, c10::nullopt, result); } -Tensor& randn_out(IntArrayRef size, c10::optional generator, Tensor& result) { +Tensor& randn_out(IntArrayRef size, std::optional generator, Tensor& result) { result.resize_(size); return result.normal_(0, 1, std::move(generator)); } Tensor normal(double mean, double std, IntArrayRef size, - c10::optional generator, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional generator, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -1065,18 +1065,18 @@ Tensor normal(double mean, double std, IntArrayRef size, } Tensor& normal_out(double mean, double std, - IntArrayRef size, c10::optional generator, Tensor& result) { + IntArrayRef size, std::optional generator, Tensor& result) { result.resize_(size); return result.normal_(mean, std, std::move(generator)); } Tensor randn_like( const Tensor& self, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional optional_memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional optional_memory_format) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -1113,18 +1113,18 @@ void randperm_cpu(Tensor& result, int64_t n, CPUGeneratorImpl* generator) { } // namespace Tensor randperm(int64_t n, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return native::randperm(n, c10::nullopt, dtype, layout, device, pin_memory); } -Tensor randperm(int64_t n, c10::optional generator, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { +Tensor randperm(int64_t n, std::optional 
generator, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { if (!dtype.has_value()) { dtype = ScalarType::Long; } @@ -1140,7 +1140,7 @@ Tensor& randperm_out(int64_t n, Tensor& result) { return at::randperm_out(result, n, c10::nullopt); } -Tensor& randperm_out_cpu(int64_t n, c10::optional generator, Tensor& result) { +Tensor& randperm_out_cpu(int64_t n, std::optional generator, Tensor& result) { TORCH_CHECK(n >= 0, "n must be non-negative, got", n); TORCH_CHECK(!generator.has_value() || (generator.has_value() && result.device() == generator->device()), "Expected a '", result.device(), "' generator device but found '", generator->device(), "'"); check_supported_max_int_with_precision(n, result); @@ -1161,10 +1161,10 @@ Tensor range( const Scalar& start, const Scalar& end, const Scalar& step, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -1175,18 +1175,18 @@ Tensor range( Tensor range( const Scalar& start, const Scalar& end, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return at::native::range(start, end, 1, dtype, layout, device, pin_memory); } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ triangle ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tensor tril_indices_cpu( - int64_t row, int64_t col, int64_t offset, c10::optional dtype_opt, - c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) { + int64_t row, int64_t col, int64_t offset, std::optional dtype_opt, + std::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) { if (!dtype_opt.has_value()) { dtype_opt = ScalarType::Long; } @@ -1235,8 +1235,8 @@ Tensor tril_indices_cpu( } Tensor triu_indices_cpu( - int64_t row, int64_t col, int64_t offset, c10::optional dtype_opt, - c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) { + int64_t row, int64_t col, int64_t offset, std::optional dtype_opt, + std::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) { if (!dtype_opt.has_value()) { dtype_opt = ScalarType::Long; } @@ -1278,10 +1278,10 @@ Tensor triu_indices_cpu( // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ zeros ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ static Tensor zeros_sparse_compressed_symint(c10::SymIntArrayRef size, - c10::optional dtype, + std::optional dtype, Layout layout, - c10::optional device, - c10::optional pin_memory) { + std::optional device, + std::optional pin_memory) { check_size_nonnegative(size); TORCH_CHECK(size.size() >= 2, "torch.zeros: Only batched sparse compressed (non-block) tensors are supported, but got size ", size); auto size_ = C10_AS_INTARRAYREF_SLOW(size); @@ -1312,10 +1312,10 @@ static Tensor zeros_sparse_compressed_symint(c10::SymIntArrayRef size, } Tensor zeros_symint(SymIntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { Layout layout_ = layout.value_or(Layout::Strided); if (at::sparse_csr::is_sparse_compressed(layout_)) { return 
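The randperm hunk above shows the dtype optional being defaulted to Long when the caller does not supply one; an explicit dtype overrides that default. Visible from the public API:

```cpp
#include <torch/torch.h>
#include <iostream>

int main() {
  auto p = torch::randperm(5);                  // dtype optional unset -> Long
  auto q = torch::randperm(5, torch::kInt32);   // explicit dtype overrides
  std::cout << p.scalar_type() << " " << q.scalar_type() << "\n";  // Long Int
}
```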
zeros_sparse_compressed_symint(size, dtype, layout_, device, pin_memory); @@ -1327,10 +1327,10 @@ Tensor zeros_symint(SymIntArrayRef size, } Tensor _efficientzerotensor(IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { auto device_ = device_or_default(device); auto allocator = at::native::ZeroTensorAllocator(device_); auto dtype_ = dtype_or_default(dtype); @@ -1340,10 +1340,10 @@ Tensor _efficientzerotensor(IntArrayRef size, } Tensor _efficientzerotensor_meta_symint(SymIntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { auto device_ = device_or_default(device); auto allocator = at::native::ZeroTensorAllocator(device_); auto dtype_ = dtype_or_default(dtype); @@ -1372,11 +1372,11 @@ Tensor& zeros_out(IntArrayRef size, Tensor& result) { Tensor zeros_like( const Tensor& self, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional optional_memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional optional_memory_format) { // See [Note: hacky wrapper removal for TensorOptions] auto other_options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); // Prefer values passed in explicitly, but default to value from self. @@ -1423,10 +1423,10 @@ Tensor zeros_like( Tensor new_zeros( const Tensor& self, IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory ) { Tensor r = self.new_empty(size, TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory)); r.zero_(); @@ -1436,10 +1436,10 @@ Tensor new_zeros( // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ bartlett_window ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tensor bartlett_window(int64_t window_length, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return native::bartlett_window( window_length, /*periodic=*/true, dtype, layout, device, pin_memory); } @@ -1447,10 +1447,10 @@ Tensor bartlett_window(int64_t window_length, Tensor bartlett_window( int64_t window_length, bool periodic, - c10::optional dtype_opt, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype_opt, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] ScalarType dtype = c10::dtype_or_default(dtype_opt); TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -1475,10 +1475,10 @@ Tensor bartlett_window( // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ blackman_window ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tensor blackman_window(int64_t window_length, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return native::blackman_window( window_length, /*periodic=*/true, dtype, 
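The window-function hunks above (bartlett_window here, blackman/hamming/hann below) all share the same shape: the single-argument overload forwards to the full one with periodic=true. A hedged usage sketch:

```cpp
#include <torch/torch.h>
#include <iostream>

int main() {
  auto periodic  = torch::bartlett_window(8);                     // periodic=true
  auto symmetric = torch::bartlett_window(8, /*periodic=*/false);
  std::cout << periodic.size(0) << " " << symmetric.size(0) << "\n";  // 8 8
  std::cout << symmetric << "\n";  // symmetric window is zero at both endpoints
}
```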
layout, device, pin_memory); } @@ -1486,10 +1486,10 @@ Tensor blackman_window(int64_t window_length, Tensor blackman_window( int64_t window_length, bool periodic, - c10::optional dtype_opt, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype_opt, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] ScalarType dtype = c10::dtype_or_default(dtype_opt); TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -1515,10 +1515,10 @@ Tensor blackman_window( // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ hamming_window ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tensor hamming_window(int64_t window_length, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return native::hamming_window( window_length, /*periodic=*/true, dtype, layout, device, pin_memory); } @@ -1526,10 +1526,10 @@ Tensor hamming_window(int64_t window_length, Tensor hamming_window( int64_t window_length, bool periodic, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return native::hamming_window( window_length, periodic, @@ -1544,10 +1544,10 @@ Tensor hamming_window( int64_t window_length, bool periodic, double alpha, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return native::hamming_window( window_length, periodic, alpha, /*beta=*/0.46, dtype, layout, device, pin_memory); } @@ -1557,10 +1557,10 @@ Tensor hamming_window( bool periodic, double alpha, double beta, - c10::optional dtype_opt, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype_opt, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] ScalarType dtype = c10::dtype_or_default(dtype_opt); TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -1583,20 +1583,20 @@ Tensor hamming_window( // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ hann_window ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tensor hann_window(int64_t window_length, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return native::hann_window(window_length, /*periodic=*/true, dtype, layout, device, pin_memory); } Tensor hann_window( int64_t window_length, bool periodic, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -1608,10 +1608,10 @@ Tensor hann_window( // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ kaiser_window ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tensor kaiser_window(int64_t window_length, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { 
+ std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return native::kaiser_window( window_length, /*periodic=*/true, @@ -1623,10 +1623,10 @@ Tensor kaiser_window(int64_t window_length, } Tensor kaiser_window(int64_t window_length, bool periodic, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return native::kaiser_window(window_length, periodic, /*beta=*/12.0, dtype, layout, device, pin_memory); } @@ -1634,10 +1634,10 @@ Tensor kaiser_window( int64_t window_length, bool periodic, double beta, - c10::optional dtype_opt, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype_opt, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] ScalarType dtype = c10::dtype_or_default(dtype_opt); TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -1667,7 +1667,7 @@ Tensor kaiser_window( // ~~~~~~~~~~~~~~~~~~~~~~~~~~ vandermonde_matrix ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Tensor vander(const Tensor& x, c10::optional N, bool increasing) { +Tensor vander(const Tensor& x, std::optional N, bool increasing) { TORCH_CHECK(x.dim() == 1, "x must be a one-dimensional tensor."); // Acquires n, defaulting to size if not provided @@ -1717,11 +1717,11 @@ Tensor tensor_complex_backend(ArrayRef values, const TensorOptions& options) return at::detail::tensor_complex_backend(values, options); } -Tensor from_file(c10::string_view filename, c10::optional shared, c10::optional size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { +Tensor from_file(c10::string_view filename, std::optional shared, c10::optional size, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -1745,7 +1745,7 @@ Tensor from_file(c10::string_view filename, c10::optional shared, c10::opt // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ clone ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Tensor clone(const Tensor& src, c10::optional optional_memory_format) { +Tensor clone(const Tensor& src, std::optional optional_memory_format) { auto memory_format = optional_memory_format.value_or(MemoryFormat::Preserve); Tensor self; @@ -1777,10 +1777,10 @@ Tensor full( IntArrayRef size, const Scalar& fill_value, optional names, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -1795,10 +1795,10 @@ Tensor full( Tensor ones( IntArrayRef size, optional names, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] return native::full( @@ -1808,31 +1808,31 @@ Tensor ones( Tensor zeros( IntArrayRef size, optional names, - c10::optional 
dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return native::full(size, /*fill_value=*/0., names, dtype, layout, device, pin_memory); } Tensor randn( IntArrayRef size, optional names, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return native::randn(size, c10::nullopt, names, dtype, layout, device, pin_memory); } Tensor randn( IntArrayRef size, - c10::optional generator, + std::optional generator, optional names, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -1843,21 +1843,21 @@ Tensor randn( Tensor rand( IntArrayRef size, optional names, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return native::rand(size, c10::nullopt, names, dtype, layout, device, pin_memory); } Tensor rand( IntArrayRef size, - c10::optional generator, + std::optional generator, optional names, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); diff --git a/aten/src/ATen/native/TensorFactories.h b/aten/src/ATen/native/TensorFactories.h index f9b2893d768a9..58cbbfc4df334 100644 --- a/aten/src/ATen/native/TensorFactories.h +++ b/aten/src/ATen/native/TensorFactories.h @@ -63,7 +63,7 @@ inline int64_t get_tril_size(int64_t row, int64_t col, int64_t offset) { } inline void check_args( - int64_t row, int64_t col, c10::optional layout_opt) { + int64_t row, int64_t col, std::optional layout_opt) { TORCH_CHECK(row >= 0, "row must be non-negative, got", row); TORCH_CHECK(col >= 0, "col must be non-negative, got", col); if (layout_opt.has_value()) { diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index a99e6e3a50c11..c4b8b12b67307 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -228,7 +228,7 @@ inline void cat_check_no_zero_dim(const MaterializedITensorListRef& tensors) { } inline c10::MemoryFormat cat_compute_output_memory_format(const MaterializedITensorListRef& inputs) { - c10::optional format = c10::nullopt; + std::optional format = c10::nullopt; for (const Tensor& t : inputs) { auto f = t.suggest_memory_format(); if (f == c10::MemoryFormat::Contiguous) { @@ -2511,8 +2511,8 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in Tensor slice( const Tensor& self, int64_t dim, - c10::optional start, - c10::optional end, + std::optional start, + std::optional end, int64_t step) { int64_t ndim = self.dim(); if (ndim == 0) { @@ -2568,8 +2568,8 @@ Tensor slice_inverse_symint( const Tensor& self, const Tensor& base, int64_t /* dim */, - 
c10::optional /* start */, - c10::optional /* end */, + std::optional /* start */, + std::optional /* end */, SymInt /* step */) { // assume self has enough to storage to be viewed with base's metadata return self.as_strided_symint(base.sym_sizes(), base.sym_strides(), base.sym_storage_offset()); @@ -3227,16 +3227,28 @@ static inferSqueezeGeometry(const Tensor &tensor, std::bitset d namespace { // Named type instead of a pair/tuple so that we can be sure to // construct the vectors in place and get NRVO. +template struct InferUnsqueezeGeometryResult { - DimVector sizes; - DimVector strides; - InferUnsqueezeGeometryResult(IntArrayRef tensor_sizes, IntArrayRef tensor_strides) + SmallVectorsizes; + SmallVector strides; + InferUnsqueezeGeometryResult(ArrayRef tensor_sizes, ArrayRef tensor_strides) : sizes(tensor_sizes.begin(), tensor_sizes.end()) , strides(tensor_strides.begin(), tensor_strides.end()) {} }; -InferUnsqueezeGeometryResult + +InferUnsqueezeGeometryResult +inferUnsqueezeGeometry_symint(const Tensor& tensor, int64_t dim) { + InferUnsqueezeGeometryResult result(tensor.sym_sizes(), tensor.sym_strides()); + c10::SymInt new_stride = dim >= tensor.dim() ? 1 : result.sizes[dim] * result.strides[dim]; + result.sizes.insert(result.sizes.begin() + dim, 1); + result.strides.insert(result.strides.begin() + dim, new_stride); + + return result; +} + +InferUnsqueezeGeometryResult inferUnsqueezeGeometry(const Tensor& tensor, int64_t dim) { - InferUnsqueezeGeometryResult result(tensor.sizes(), tensor.strides()); + InferUnsqueezeGeometryResult result(tensor.sizes(), tensor.strides()); int64_t new_stride = dim >= tensor.dim() ? 1 : result.sizes[dim] * result.strides[dim]; result.sizes.insert(result.sizes.begin() + dim, 1); result.strides.insert(result.strides.begin() + dim, new_stride); @@ -3377,8 +3389,8 @@ Tensor _unsafe_view(const Tensor& self, IntArrayRef size) { Tensor unsqueeze(const Tensor& self, int64_t dim) { dim = maybe_wrap_dim(dim, self.dim() + 1); - auto g = inferUnsqueezeGeometry(self, dim); - return self.as_strided(g.sizes, g.strides); + auto g = inferUnsqueezeGeometry_symint(self, dim); + return self.as_strided_symint(g.sizes, g.strides); } Tensor unsqueeze_sparse(Tensor const &self, int64_t dim) { @@ -3507,7 +3519,7 @@ static inline void handle_unflatten_exception(const std::runtime_error &e, const Tensor &self, int64_t dim, SymIntArrayRef sizes, - c10::optional names) { + std::optional names) { if (!strstr(e.what(), "is invalid for input of size")) { TORCH_CHECK(false, "unflatten got an unexpected error:\n", e.what()); } @@ -3524,7 +3536,7 @@ static inline void handle_unflatten_exception(const std::runtime_error &e, } } -static Tensor unflatten_impl(const Tensor& self, int64_t dim, SymIntArrayRef sizes, c10::optional names) { +static Tensor unflatten_impl(const Tensor& self, int64_t dim, SymIntArrayRef sizes, std::optional names) { dim = maybe_wrap_dim(dim, self.dim()); TORCH_CHECK(!sizes.empty(), "unflatten: sizes must be non-empty"); @@ -4001,7 +4013,7 @@ at::Tensor clone_preserve_strides(const at::Tensor& self) { } -at::Tensor slice_scatter(const at::Tensor& self, const at::Tensor& src, int64_t dim, c10::optional start, c10::optional end, int64_t step) { +at::Tensor slice_scatter(const at::Tensor& self, const at::Tensor& src, int64_t dim, std::optional start, c10::optional end, int64_t step) { // See Note [*_scatter ops preserve strides] auto output = clone_preserve_strides(self); auto slice = output.slice(dim, start, end, step); @@ -4024,7 +4036,7 @@ at::Tensor 
diagonal_scatter(const at::Tensor& self, const at::Tensor& src, int64 slice.copy_(src); return output; } -at::Tensor as_strided_scatter_symint(const at::Tensor& self, const at::Tensor& src, at::SymIntArrayRef size, at::SymIntArrayRef stride, c10::optional storage_offset) { +at::Tensor as_strided_scatter_symint(const at::Tensor& self, const at::Tensor& src, at::SymIntArrayRef size, at::SymIntArrayRef stride, std::optional storage_offset) { // See Note [as_strided_scatter backward support] TORCH_INTERNAL_ASSERT(!self.requires_grad() || self.is_contiguous(), "as_strided_scatter is currently only supported for contiguous inputs"); // See Note [*_scatter ops preserve strides] diff --git a/aten/src/ATen/native/TensorTransformations.cpp b/aten/src/ATen/native/TensorTransformations.cpp index 5a7c3a6de965f..b13f28d56a86a 100644 --- a/aten/src/ATen/native/TensorTransformations.cpp +++ b/aten/src/ATen/native/TensorTransformations.cpp @@ -230,7 +230,7 @@ std::vector atleast_3d(TensorList tensors) { return result; } -Tensor chalf(const Tensor& self, c10::optional memory_format) { +Tensor chalf(const Tensor& self, std::optional memory_format) { return self.to(kComplexHalf, false, false, memory_format); } diff --git a/aten/src/ATen/native/TestOps.cpp b/aten/src/ATen/native/TestOps.cpp index e2fce123035ba..f9fa0839a51ae 100644 --- a/aten/src/ATen/native/TestOps.cpp +++ b/aten/src/ATen/native/TestOps.cpp @@ -49,7 +49,7 @@ Tensor _test_optional_intlist( /// Else, return a new tensor containing the elementwise sums. Tensor _test_optional_floatlist( const Tensor& values, - c10::optional> addends) { + std::optional> addends) { if (!addends) { return values; } diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index 6c22d2583f130..3520620280fee 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -772,23 +772,23 @@ Tensor square(const Tensor& self) { return at::pow(self, 2); } Tensor& square_(Tensor& self) { return self.pow_(2); } Tensor& logit_out(const Tensor& self, - c10::optional eps, + std::optional eps, Tensor& result) { return unary_op_impl_float_out( result, self, logit_stub, Scalar(eps ? eps.value() : -1.0)); } -Tensor logit(const Tensor& self, c10::optional eps) { +Tensor logit(const Tensor& self, std::optional eps) { return unary_op_impl_float( self, logit_stub, Scalar(eps ? 
eps.value() : -1.0)); } -Tensor& logit_(Tensor& self, c10::optional eps) { +Tensor& logit_(Tensor& self, std::optional eps) { return at::logit_out(self, self, eps); } -Tensor& special_logit_out(const Tensor& self, c10::optional eps, Tensor& result) { +Tensor& special_logit_out(const Tensor& self, std::optional eps, Tensor& result) { return at::logit_out(result, self, eps); } -Tensor special_logit(const Tensor& self, c10::optional eps) { +Tensor special_logit(const Tensor& self, std::optional eps) { return self.logit(eps); } @@ -801,9 +801,9 @@ Tensor special_expit(const Tensor& self) { } Tensor& nan_to_num_out(const Tensor& self, - c10::optional nan, - c10::optional pos_inf, - c10::optional neg_inf, + std::optional nan, + std::optional pos_inf, + std::optional neg_inf, Tensor& result) { TORCH_CHECK( self.scalar_type() == result.scalar_type(), @@ -825,18 +825,18 @@ Tensor& nan_to_num_out(const Tensor& self, Tensor nan_to_num( const Tensor& self, - c10::optional nan, - c10::optional pos_inf, - c10::optional neg_inf) { + std::optional nan, + std::optional pos_inf, + std::optional neg_inf) { auto result = at::empty_like(self); return at::nan_to_num_out(result, self, nan, pos_inf, neg_inf); } Tensor& nan_to_num_( Tensor& self, - c10::optional nan, - c10::optional pos_inf, - c10::optional neg_inf) { + std::optional nan, + std::optional pos_inf, + std::optional neg_inf) { return at::nan_to_num_out(self, self, nan, pos_inf, neg_inf); } diff --git a/aten/src/ATen/native/UnaryOps.h b/aten/src/ATen/native/UnaryOps.h index 91d4d84d4630c..3d99fdc40d048 100644 --- a/aten/src/ATen/native/UnaryOps.h +++ b/aten/src/ATen/native/UnaryOps.h @@ -93,30 +93,30 @@ DECLARE_DISPATCH(unary_fn, special_scaled_modified_bessel_k1_stub); DECLARE_DISPATCH(unary_fn, special_spherical_bessel_j0_stub); // NB: these are actually defined in Distribution -DECLARE_DISPATCH(void(*)(const TensorBase&, const TensorBase&, c10::optional), bernoulli_tensor_stub); -DECLARE_DISPATCH(void(*)(const TensorBase&, const double, c10::optional), bernoulli_scalar_stub); -DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, c10::optional), cauchy_stub); -DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, c10::optional), exponential_stub); -DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, c10::optional), geometric_stub); -DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, c10::optional), log_normal_stub); -DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, c10::optional), uniform_stub); -DECLARE_DISPATCH(void(*)(const TensorBase&, const double, const double, c10::optional), normal_stub); -DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const uint64_t, const int64_t, c10::optional), random_from_to_stub); -DECLARE_DISPATCH(void(*)(TensorIteratorBase&, c10::optional), random_full_64_bits_range_stub); -DECLARE_DISPATCH(void(*)(TensorIteratorBase&, c10::optional), random_stub); +DECLARE_DISPATCH(void(*)(const TensorBase&, const TensorBase&, std::optional), bernoulli_tensor_stub); +DECLARE_DISPATCH(void(*)(const TensorBase&, const double, std::optional), bernoulli_scalar_stub); +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, std::optional), cauchy_stub); +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, std::optional), exponential_stub); +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, std::optional), geometric_stub); +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, std::optional), 
log_normal_stub); +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, std::optional), uniform_stub); +DECLARE_DISPATCH(void(*)(const TensorBase&, const double, const double, std::optional), normal_stub); +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const uint64_t, const int64_t, std::optional), random_from_to_stub); +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, std::optional), random_full_64_bits_range_stub); +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, std::optional), random_stub); DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const int64_t, const double), kaiser_window_stub); DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const int64_t), polygamma_stub); DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const Scalar& a, const Scalar& b), clamp_stub); DECLARE_DISPATCH( - void (*)(Tensor&, const Tensor&, int64_t, c10::optional), + void (*)(Tensor&, const Tensor&, int64_t, std::optional), multinomial_with_replacement_stub); DECLARE_DISPATCH( void (*)( TensorIteratorBase&, - c10::optional, - c10::optional, - c10::optional), + std::optional, + std::optional, + std::optional), nan_to_num_stub); DECLARE_DISPATCH(void (*)(TensorIteratorBase&, int64_t), round_decimals_stub); diff --git a/aten/src/ATen/native/Unique.cpp b/aten/src/ATen/native/Unique.cpp index 801af5d5e79fe..5c0deff804a33 100644 --- a/aten/src/ATen/native/Unique.cpp +++ b/aten/src/ATen/native/Unique.cpp @@ -484,7 +484,7 @@ unique_dim_consecutive_cpu(const Tensor& self, const int64_t dim, const bool ret } std::tuple -unique_consecutive_cpu(const Tensor& self, const bool return_inverse, const bool return_counts, c10::optional dim) { +unique_consecutive_cpu(const Tensor& self, const bool return_inverse, const bool return_counts, std::optional dim) { if (!dim.has_value() || (dim.value() == 0 && self.dim() == 1)) { return AT_DISPATCH_V2(self.scalar_type(), "unique", AT_WRAP([&] { return unique_consecutive_cpu_template(self, return_inverse, return_counts); diff --git a/aten/src/ATen/native/UpSample.cpp b/aten/src/ATen/native/UpSample.cpp index 2403d11e4604e..e0e3f82ac32fc 100644 --- a/aten/src/ATen/native/UpSample.cpp +++ b/aten/src/ATen/native/UpSample.cpp @@ -10,7 +10,7 @@ namespace at::native::upsample { TORCH_API c10::SmallVector compute_output_size( c10::IntArrayRef input_size, // Full input tensor size. at::OptionalIntArrayRef output_size, - c10::optional> scale_factors) { + std::optional> scale_factors) { const auto spatial_dimensions = static_cast(input_size.size()) - 2; if (output_size) { TORCH_CHECK(!scale_factors, "Must specify exactly one of output_size and scale_factors"); diff --git a/aten/src/ATen/native/UpSample.h b/aten/src/ATen/native/UpSample.h index 8dadc7cee3ae4..e2b3c36b5d775 100644 --- a/aten/src/ATen/native/UpSample.h +++ b/aten/src/ATen/native/UpSample.h @@ -55,9 +55,9 @@ namespace upsample { TORCH_API c10::SmallVector compute_output_size( c10::IntArrayRef input_size, // Full input tensor size. 
at::OptionalIntArrayRef output_size, - c10::optional> scale_factors); + std::optional> scale_factors); -inline c10::optional get_scale_value(c10::optional> scales, int idx) { +inline std::optional get_scale_value(c10::optional> scales, int idx) { if (!scales) { return c10::nullopt; } @@ -66,7 +66,7 @@ inline c10::optional get_scale_value(c10::optional } // namespace upsample -using scale_t = c10::optional; +using scale_t = std::optional; using upsampling_nearest1d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_w); using _upsampling_nearest_exact1d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_w); using upsampling_nearest2d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_h, scale_t scales_w); @@ -252,7 +252,7 @@ static inline void upsample_2d_shape_check( template static inline scalar_t compute_scales_value( - const c10::optional scale, + const std::optional scale, int64_t input_size, int64_t output_size) { // see Note [compute_scales_value] @@ -267,7 +267,7 @@ static inline scalar_t area_pixel_compute_scale( int64_t input_size, int64_t output_size, bool align_corners, - const c10::optional scale) { + const std::optional scale) { // see Note [area_pixel_compute_scale] if(align_corners) { if(output_size > 1) { @@ -335,7 +335,7 @@ static inline int64_t nearest_idx( int64_t output_index, int64_t input_size, int64_t output_size, - c10::optional scales) { + std::optional scales) { // This method specificly treats cases: output_size == input_size or // output_size == 2 * input_size, that we would like to get rid of // We keep this method for BC and consider as deprecated. @@ -356,13 +356,13 @@ static inline int64_t nearest_exact_idx( int64_t output_index, int64_t input_size, int64_t output_size, - c10::optional scales) { + std::optional scales) { float scale = compute_scales_value(scales, input_size, output_size); return nearest_neighbor_exact_compute_source_index(scale, output_index, input_size); } // Define a typedef to dispatch to nearest_idx or nearest_exact_idx -typedef int64_t (*nearest_idx_fn_t)(int64_t, int64_t, int64_t, c10::optional); +typedef int64_t (*nearest_idx_fn_t)(int64_t, int64_t, int64_t, std::optional); template static scalar_t upsample_get_value_bounded( diff --git a/aten/src/ATen/native/UpSampleBicubic2d.cpp b/aten/src/ATen/native/UpSampleBicubic2d.cpp index f5e523c4a9114..8f5046534103b 100644 --- a/aten/src/ATen/native/UpSampleBicubic2d.cpp +++ b/aten/src/ATen/native/UpSampleBicubic2d.cpp @@ -23,7 +23,7 @@ namespace at::meta { TORCH_META_FUNC(upsample_bicubic2d) ( - const Tensor& input, IntArrayRef output_size, bool align_corners, c10::optional scales_h, c10::optional scales_w + const Tensor& input, IntArrayRef output_size, bool align_corners, std::optional scales_h, c10::optional scales_w ) { auto full_output_size = native::upsample_2d_common_check(input.sizes(), output_size); @@ -41,8 +41,8 @@ TORCH_META_FUNC(upsample_bicubic2d_backward) ( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w + std::optional scales_h, + std::optional scales_w ) { auto full_output_size = native::upsample_2d_common_check(input_size, output_size); @@ -62,7 +62,7 @@ TORCH_META_FUNC(upsample_bicubic2d_backward) ( } TORCH_META_FUNC(_upsample_bicubic2d_aa) ( - const Tensor& input, IntArrayRef output_size, bool align_corners, c10::optional scales_h, c10::optional scales_w + const Tensor& input, IntArrayRef output_size, bool align_corners, std::optional scales_h, 
c10::optional scales_w ) { auto full_output_size = native::upsample_2d_common_check(input.sizes(), output_size); @@ -80,8 +80,8 @@ TORCH_META_FUNC(_upsample_bicubic2d_aa_backward) ( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w + std::optional scales_h, + std::optional scales_w ) { auto full_output_size = native::upsample_2d_common_check(input_size, output_size); @@ -115,8 +115,8 @@ static void upsample_bicubic2d_backward_out_frame( int64_t nbatch, int64_t channels, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { channels = channels * nbatch; auto input_slice_size = input_height * input_width; auto output_slice_size = output_height * output_width; @@ -185,8 +185,8 @@ static void upsample_bicubic2d_backward_kernel( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { int64_t output_height = output_size[0]; int64_t output_width = output_size[1]; @@ -227,8 +227,8 @@ TORCH_IMPL_FUNC(upsample_bicubic2d_out_cpu) ( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& output ) { upsample_bicubic2d_kernel(kCPU, output, input, align_corners, scales_h, scales_w); @@ -239,8 +239,8 @@ TORCH_IMPL_FUNC(upsample_bicubic2d_backward_out_cpu) ( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input ) { grad_input.zero_(); @@ -251,8 +251,8 @@ TORCH_IMPL_FUNC(_upsample_bicubic2d_aa_out_cpu) ( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& output ) { _upsample_bicubic2d_aa_kernel(kCPU, output, input, align_corners, scales_h, scales_w); @@ -263,8 +263,8 @@ TORCH_IMPL_FUNC(_upsample_bicubic2d_aa_backward_out_cpu) ( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input ) { grad_input.zero_(); @@ -280,7 +280,7 @@ Tensor upsample_bicubic2d( const Tensor& input, at::OptionalIntArrayRef output_size, bool align_corners, - c10::optional> scale_factors) { + std::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_h = get_scale_value(scale_factors, 0); auto scale_w = get_scale_value(scale_factors, 1); @@ -291,7 +291,7 @@ Tensor _upsample_bicubic2d_aa( const Tensor& input, at::OptionalIntArrayRef output_size, bool align_corners, - c10::optional> scale_factors) { + std::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_h = get_scale_value(scale_factors, 0); auto scale_w = get_scale_value(scale_factors, 1); diff --git a/aten/src/ATen/native/UpSampleBilinear2d.cpp b/aten/src/ATen/native/UpSampleBilinear2d.cpp index 202f33ab7970e..2cc8b56678c74 100644 --- a/aten/src/ATen/native/UpSampleBilinear2d.cpp +++ b/aten/src/ATen/native/UpSampleBilinear2d.cpp @@ -24,7 +24,7 @@ namespace at::meta { TORCH_META_FUNC(upsample_bilinear2d) ( - const Tensor& input, IntArrayRef output_size, bool 
align_corners, c10::optional scales_h, c10::optional scales_w + const Tensor& input, IntArrayRef output_size, bool align_corners, std::optional scales_h, c10::optional scales_w ) { auto full_output_size = native::upsample_2d_common_check(input.sizes(), output_size); @@ -42,8 +42,8 @@ TORCH_META_FUNC(upsample_bilinear2d_backward) ( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w + std::optional scales_h, + std::optional scales_w ) { auto full_output_size = native::upsample_2d_common_check(input_size, output_size); @@ -63,7 +63,7 @@ TORCH_META_FUNC(upsample_bilinear2d_backward) ( } TORCH_META_FUNC(_upsample_bilinear2d_aa) ( - const Tensor& input, IntArrayRef output_size, bool align_corners, c10::optional scales_h, c10::optional scales_w + const Tensor& input, IntArrayRef output_size, bool align_corners, std::optional scales_h, c10::optional scales_w ) { auto full_output_size = native::upsample_2d_common_check(input.sizes(), output_size); @@ -81,8 +81,8 @@ TORCH_META_FUNC(_upsample_bilinear2d_aa_backward) ( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w + std::optional scales_h, + std::optional scales_w ) { auto full_output_size = native::upsample_2d_common_check(input_size, output_size); @@ -109,8 +109,8 @@ TORCH_IMPL_FUNC(upsample_bilinear2d_out_cpu) ( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& output ) { upsample_bilinear2d_kernel(kCPU, output, input, align_corners, scales_h, scales_w); @@ -121,8 +121,8 @@ TORCH_IMPL_FUNC(upsample_bilinear2d_backward_out_cpu) ( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input ) { grad_input.zero_(); @@ -134,8 +134,8 @@ TORCH_IMPL_FUNC(_upsample_bilinear2d_aa_out_cpu) ( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& output ) { _upsample_bilinear2d_aa_kernel(kCPU, output, input, align_corners, scales_h, scales_w); @@ -146,8 +146,8 @@ TORCH_IMPL_FUNC(_upsample_bilinear2d_aa_backward_out_cpu) ( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input ) { grad_input.zero_(); @@ -161,7 +161,7 @@ Tensor upsample_bilinear2d( const Tensor& input, at::OptionalIntArrayRef output_size, bool align_corners, - c10::optional> scale_factors) { + std::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_h = get_scale_value(scale_factors, 0); auto scale_w = get_scale_value(scale_factors, 1); @@ -172,7 +172,7 @@ Tensor _upsample_bilinear2d_aa( const Tensor& input, at::OptionalIntArrayRef output_size, bool align_corners, - c10::optional> scale_factors) { + std::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_h = get_scale_value(scale_factors, 0); auto scale_w = get_scale_value(scale_factors, 1); diff --git a/aten/src/ATen/native/UpSampleLinear1d.cpp b/aten/src/ATen/native/UpSampleLinear1d.cpp index 7d80d5c2dc2b8..affbcaa4f06d9 100644 --- 
a/aten/src/ATen/native/UpSampleLinear1d.cpp +++ b/aten/src/ATen/native/UpSampleLinear1d.cpp @@ -23,7 +23,7 @@ TORCH_META_FUNC(upsample_linear1d) ( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales + std::optional scales ) { auto full_output_size = native::upsample_1d_common_check(input.sizes(), output_size); @@ -41,7 +41,7 @@ TORCH_META_FUNC(upsample_linear1d_backward) ( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales + std::optional scales ) { auto full_output_size = native::upsample_1d_common_check(input_size, output_size); @@ -65,7 +65,7 @@ TORCH_IMPL_FUNC(upsample_linear1d_out_cpu) ( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales, + std::optional scales, const Tensor& output ) { upsample_linear1d_kernel(kCPU, output, input, align_corners, scales); @@ -76,7 +76,7 @@ TORCH_IMPL_FUNC(upsample_linear1d_backward_out_cpu) ( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales, + std::optional scales, const Tensor& grad_input ) { grad_input.zero_(); @@ -92,7 +92,7 @@ Tensor upsample_linear1d( const Tensor& input, at::OptionalIntArrayRef output_size, bool align_corners, - c10::optional> scale_factors) { + std::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_w = get_scale_value(scale_factors, 0); return at::upsample_linear1d(input, osize, align_corners, scale_w); diff --git a/aten/src/ATen/native/UpSampleNearest1d.cpp b/aten/src/ATen/native/UpSampleNearest1d.cpp index 94441d6c3df97..7555d421d4afd 100644 --- a/aten/src/ATen/native/UpSampleNearest1d.cpp +++ b/aten/src/ATen/native/UpSampleNearest1d.cpp @@ -21,7 +21,7 @@ namespace at::meta { TORCH_META_FUNC(upsample_nearest1d) ( - const Tensor& input, IntArrayRef output_size, c10::optional scales + const Tensor& input, IntArrayRef output_size, std::optional scales ) { auto full_output_size = native::upsample_1d_common_check(input.sizes(), output_size); @@ -35,7 +35,7 @@ TORCH_META_FUNC(upsample_nearest1d) ( } TORCH_META_FUNC(_upsample_nearest_exact1d) ( - const Tensor& input, IntArrayRef output_size, c10::optional scales + const Tensor& input, IntArrayRef output_size, std::optional scales ) { auto full_output_size = native::upsample_1d_common_check(input.sizes(), output_size); @@ -49,7 +49,7 @@ TORCH_META_FUNC(_upsample_nearest_exact1d) ( } TORCH_META_FUNC(upsample_nearest1d_backward) ( - const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, c10::optional scales + const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, std::optional scales ) { auto full_output_size = native::upsample_1d_common_check(input_size, output_size); @@ -61,7 +61,7 @@ TORCH_META_FUNC(upsample_nearest1d_backward) ( } TORCH_META_FUNC(_upsample_nearest_exact1d_backward) ( - const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, c10::optional scales + const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, std::optional scales ) { auto full_output_size = native::upsample_1d_common_check(input_size, output_size); @@ -80,7 +80,7 @@ namespace at::native { TORCH_IMPL_FUNC(upsample_nearest1d_out_cpu) ( const Tensor& input, IntArrayRef output_size, - c10::optional scales, + std::optional scales, const Tensor& output ) { upsample_nearest1d_kernel(kCPU, output, input, scales); @@ -89,7 +89,7 @@ TORCH_IMPL_FUNC(upsample_nearest1d_out_cpu) ( 
TORCH_IMPL_FUNC(_upsample_nearest_exact1d_out_cpu) ( const Tensor& input, IntArrayRef output_size, - c10::optional scales, + std::optional scales, const Tensor& output ) { _upsample_nearest_exact1d_kernel(kCPU, output, input, scales); @@ -99,7 +99,7 @@ TORCH_IMPL_FUNC(upsample_nearest1d_backward_out_cpu) ( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales, + std::optional scales, const Tensor& grad_input ) { grad_input.zero_(); @@ -110,7 +110,7 @@ TORCH_IMPL_FUNC(_upsample_nearest_exact1d_backward_out_cpu) ( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales, + std::optional scales, const Tensor& grad_input ) { grad_input.zero_(); @@ -125,7 +125,7 @@ using at::native::upsample::get_scale_value; Tensor upsample_nearest1d( const Tensor& input, at::OptionalIntArrayRef output_size, - c10::optional> scale_factors) { + std::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_w = get_scale_value(scale_factors, 0); return at::upsample_nearest1d(input, osize, scale_w); @@ -134,7 +134,7 @@ Tensor upsample_nearest1d( Tensor _upsample_nearest_exact1d( const Tensor& input, at::OptionalIntArrayRef output_size, - c10::optional> scale_factors) { + std::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_w = get_scale_value(scale_factors, 0); return at::_upsample_nearest_exact1d(input, osize, scale_w); diff --git a/aten/src/ATen/native/UpSampleNearest2d.cpp b/aten/src/ATen/native/UpSampleNearest2d.cpp index 592108291cf76..0ee2db0597023 100644 --- a/aten/src/ATen/native/UpSampleNearest2d.cpp +++ b/aten/src/ATen/native/UpSampleNearest2d.cpp @@ -22,7 +22,7 @@ namespace at::meta { TORCH_META_FUNC(upsample_nearest2d) ( - const Tensor& input, IntArrayRef output_size, c10::optional scales_h, c10::optional scales_w + const Tensor& input, IntArrayRef output_size, std::optional scales_h, c10::optional scales_w ) { auto full_output_size = native::upsample_2d_common_check(input.sizes(), output_size); @@ -36,7 +36,7 @@ TORCH_META_FUNC(upsample_nearest2d) ( } TORCH_META_FUNC(_upsample_nearest_exact2d) ( - const Tensor& input, IntArrayRef output_size, c10::optional scales_h, c10::optional scales_w + const Tensor& input, IntArrayRef output_size, std::optional scales_h, c10::optional scales_w ) { auto full_output_size = native::upsample_2d_common_check(input.sizes(), output_size); @@ -53,8 +53,8 @@ TORCH_META_FUNC(upsample_nearest2d_backward) ( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales_h, - c10::optional scales_w + std::optional scales_h, + std::optional scales_w ) { auto full_output_size = native::upsample_2d_common_check(input_size, output_size); @@ -77,8 +77,8 @@ TORCH_META_FUNC(_upsample_nearest_exact2d_backward) ( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales_h, - c10::optional scales_w + std::optional scales_h, + std::optional scales_w ) { auto full_output_size = native::upsample_2d_common_check(input_size, output_size); @@ -104,8 +104,8 @@ namespace at::native { TORCH_IMPL_FUNC(upsample_nearest2d_out_cpu) ( const Tensor& input, IntArrayRef output_size, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& output ) { upsample_nearest2d_kernel(kCPU, output, input, scales_h, scales_w); @@ -114,8 +114,8 @@ 
TORCH_IMPL_FUNC(upsample_nearest2d_out_cpu) ( TORCH_IMPL_FUNC(_upsample_nearest_exact2d_out_cpu) ( const Tensor& input, IntArrayRef output_size, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& output ) { _upsample_nearest_exact2d_kernel(kCPU, output, input, scales_h, scales_w); @@ -125,8 +125,8 @@ TORCH_IMPL_FUNC(upsample_nearest2d_backward_out_cpu) ( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input) { grad_input.zero_(); upsample_nearest2d_backward_kernel(kCPU, grad_input, grad_output, scales_h, scales_w); @@ -136,8 +136,8 @@ TORCH_IMPL_FUNC(_upsample_nearest_exact2d_backward_out_cpu) ( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input) { grad_input.zero_(); _upsample_nearest_exact2d_backward_kernel(kCPU, grad_input, grad_output, scales_h, scales_w); @@ -149,7 +149,7 @@ using at::native::upsample::get_scale_value; Tensor upsample_nearest2d( const Tensor& input, at::OptionalIntArrayRef output_size, - c10::optional> scale_factors) { + std::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_h = get_scale_value(scale_factors, 0); auto scale_w = get_scale_value(scale_factors, 1); @@ -159,7 +159,7 @@ Tensor upsample_nearest2d( Tensor _upsample_nearest_exact2d( const Tensor& input, at::OptionalIntArrayRef output_size, - c10::optional> scale_factors) { + std::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_h = get_scale_value(scale_factors, 0); auto scale_w = get_scale_value(scale_factors, 1); diff --git a/aten/src/ATen/native/UpSampleNearest3d.cpp b/aten/src/ATen/native/UpSampleNearest3d.cpp index 0c4851b7be513..ac4dc1796252e 100644 --- a/aten/src/ATen/native/UpSampleNearest3d.cpp +++ b/aten/src/ATen/native/UpSampleNearest3d.cpp @@ -23,9 +23,9 @@ namespace at::meta { TORCH_META_FUNC(upsample_nearest3d) ( const Tensor& input, IntArrayRef output_size, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w + std::optional scales_d, + std::optional scales_h, + std::optional scales_w ) { auto full_output_size = native::upsample_3d_common_check(input.sizes(), output_size); @@ -41,9 +41,9 @@ TORCH_META_FUNC(upsample_nearest3d) ( TORCH_META_FUNC(_upsample_nearest_exact3d) ( const Tensor& input, IntArrayRef output_size, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w + std::optional scales_d, + std::optional scales_h, + std::optional scales_w ) { auto full_output_size = native::upsample_3d_common_check(input.sizes(), output_size); @@ -60,9 +60,9 @@ TORCH_META_FUNC(upsample_nearest3d_backward) ( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w + std::optional scales_d, + std::optional scales_h, + std::optional scales_w ) { auto full_output_size = native::upsample_3d_common_check(input_size, output_size); @@ -85,9 +85,9 @@ TORCH_META_FUNC(_upsample_nearest_exact3d_backward) ( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w + std::optional scales_d, 
+ std::optional scales_h, + std::optional scales_w ) { auto full_output_size = native::upsample_3d_common_check(input_size, output_size); @@ -113,9 +113,9 @@ namespace at::native { TORCH_IMPL_FUNC(upsample_nearest3d_out_cpu) ( const Tensor& input, IntArrayRef output_size, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_d, + std::optional scales_h, + std::optional scales_w, const Tensor& output ) { upsample_nearest3d_kernel(kCPU, output, input, scales_d, scales_h, scales_w); @@ -124,9 +124,9 @@ TORCH_IMPL_FUNC(upsample_nearest3d_out_cpu) ( TORCH_IMPL_FUNC(_upsample_nearest_exact3d_out_cpu) ( const Tensor& input, IntArrayRef output_size, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_d, + std::optional scales_h, + std::optional scales_w, const Tensor& output ) { _upsample_nearest_exact3d_kernel(kCPU, output, input, scales_d, scales_h, scales_w); @@ -136,9 +136,9 @@ TORCH_IMPL_FUNC(upsample_nearest3d_backward_out_cpu) ( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_d, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input) { grad_input.zero_(); upsample_nearest3d_backward_kernel(kCPU, grad_input, grad_output, scales_d, scales_h, scales_w); @@ -148,9 +148,9 @@ TORCH_IMPL_FUNC(_upsample_nearest_exact3d_backward_out_cpu) ( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_d, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input) { grad_input.zero_(); _upsample_nearest_exact3d_backward_kernel(kCPU, grad_input, grad_output, scales_d, scales_h, scales_w); @@ -164,7 +164,7 @@ using at::native::upsample::get_scale_value; Tensor upsample_nearest3d( const Tensor& input, at::OptionalIntArrayRef output_size, - c10::optional> scale_factors) { + std::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_d = get_scale_value(scale_factors, 0); auto scale_h = get_scale_value(scale_factors, 1); @@ -175,7 +175,7 @@ Tensor upsample_nearest3d( Tensor _upsample_nearest_exact3d( const Tensor& input, at::OptionalIntArrayRef output_size, - c10::optional> scale_factors) { + std::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_d = get_scale_value(scale_factors, 0); auto scale_h = get_scale_value(scale_factors, 1); diff --git a/aten/src/ATen/native/UpSampleTrilinear3d.cpp b/aten/src/ATen/native/UpSampleTrilinear3d.cpp index 24a915d5d9a42..9aa8f9c5cb73c 100644 --- a/aten/src/ATen/native/UpSampleTrilinear3d.cpp +++ b/aten/src/ATen/native/UpSampleTrilinear3d.cpp @@ -23,9 +23,9 @@ TORCH_META_FUNC(upsample_trilinear3d) ( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w + std::optional scales_d, + std::optional scales_h, + std::optional scales_w ) { auto full_output_size = native::upsample_3d_common_check(input.sizes(), output_size); @@ -43,9 +43,9 @@ TORCH_META_FUNC(upsample_trilinear3d_backward) ( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w + std::optional scales_d, + std::optional scales_h, + 
std::optional scales_w ) { auto full_output_size = native::upsample_3d_common_check(input_size, output_size); @@ -71,9 +71,9 @@ TORCH_IMPL_FUNC(upsample_trilinear3d_out_cpu) ( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_d, + std::optional scales_h, + std::optional scales_w, const Tensor& output ) { upsample_trilinear3d_kernel(kCPU, output, input, align_corners, scales_d, scales_h, scales_w); @@ -84,9 +84,9 @@ TORCH_IMPL_FUNC(upsample_trilinear3d_backward_out_cpu) ( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_d, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input ) { grad_input.zero_(); @@ -102,7 +102,7 @@ Tensor upsample_trilinear3d( const Tensor& input, at::OptionalIntArrayRef output_size, bool align_corners, - c10::optional> scale_factors) { + std::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_d = get_scale_value(scale_factors, 0); auto scale_h = get_scale_value(scale_factors, 1); diff --git a/aten/src/ATen/native/VariableMethodStubs.cpp b/aten/src/ATen/native/VariableMethodStubs.cpp index 477979d190be2..ed99aed399cb1 100644 --- a/aten/src/ATen/native/VariableMethodStubs.cpp +++ b/aten/src/ATen/native/VariableMethodStubs.cpp @@ -24,7 +24,7 @@ namespace at::native { -void _backward(const Tensor& self, TensorList inputs, const c10::optional& gradient_opt, c10::optional keep_graph, bool create_graph) { +void _backward(const Tensor& self, TensorList inputs, const std::optional& gradient_opt, c10::optional keep_graph, bool create_graph) { return self._backward(inputs, gradient_opt, keep_graph, create_graph); } diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.h b/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.h index 58a1a43fe67bc..a14fd4efc1b15 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.h +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.h @@ -14,7 +14,7 @@ namespace sparse { struct TORCH_API PackedLinearWeight : public LinearPackedParamsBase { PackedLinearWeight(std::unique_ptr> w, - c10::optional bias, + std::optional bias, std::vector col_offsets, std::vector w_scale, std::vector w_zp, @@ -31,7 +31,7 @@ struct TORCH_API PackedLinearWeight w_zp(std::move(w_zp)), q_scheme(q_scheme) {} std::unique_ptr> w; - c10::optional bias_; + std::optional bias_; std::vector col_offsets; std::vector w_scale; std::vector w_zp; @@ -68,13 +68,13 @@ struct TORCH_API PackedLinearWeight static c10::intrusive_ptr deserialize( const BCSRSerializationType& serialized); - c10::optional bias() override { + std::optional bias() override { return bias_; } static c10::intrusive_ptr prepack( const at::Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const int64_t out_features_block_size, const int64_t in_features_block_size); diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/packed_params.h b/aten/src/ATen/native/ao_sparse/quantized/cpu/packed_params.h index 1ca66bf536a77..db8ee9d619066 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/packed_params.h +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/packed_params.h @@ -9,14 +9,14 @@ namespace sparse { // using LinearPackedSerializationType = - std::tuple, std::vector>; + std::tuple, std::vector>; #define 
SPARSE_LINEAR_PACKED_PARAM_SERIALIZATION_VERSION 2 using BCSRSerializationType = std::tuple< int64_t, // Serialization Version - c10::optional, // Bias + std::optional, // Bias int64_t, // Out Features (Row) Block Size int64_t, // In Features (Column) Block Size at::Tensor, // Weight Scales (single element vector if per-tensor) (float) @@ -60,9 +60,9 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder { virtual BCSRSerializationType serialize() = 0; - virtual c10::optional bias() = 0; + virtual std::optional bias() = 0; - virtual void set_bias(const c10::optional& bias) { + virtual void set_bias(const std::optional& bias) { throw std::runtime_error( "set_bias is not implemented for this packed " "parameter type"); diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp index 8f80d920e3652..f5032f4d425b8 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp @@ -53,7 +53,7 @@ void calc_col_offsets_transpose( c10::intrusive_ptr PackedLinearWeight:: prepack( const at::Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const int64_t out_features_block_size, const int64_t in_features_block_size) { TORCH_CHECK( @@ -110,7 +110,7 @@ c10::intrusive_ptr PackedLinearWeight:: /*col_offsets=*/col_offsets.data(), /*qtype=*/qtype); - c10::optional bias_contig; + std::optional bias_contig; if (bias.has_value()) { const at::Tensor& bias_vec = bias.value(); TORCH_CHECK(bias_vec.dim() == 1, "bias should be a vector (1D Tensor)"); @@ -139,7 +139,7 @@ c10::intrusive_ptr PackedLinearWeight:: c10::intrusive_ptr PackedLinearWeightQnnp:: prepack( const at::Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const int64_t out_features_block_size, const int64_t in_features_block_size) { at::native::initQNNPACK(); @@ -150,7 +150,7 @@ c10::intrusive_ptr PackedLinearWeightQnnp:: // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) PackedLinearWeightQnnp::PackedLinearWeightQnnp( const at::Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const int64_t out_features_block_size, const int64_t in_features_block_size) : LinearPackedParamsBase(out_features_block_size, in_features_block_size), @@ -215,7 +215,7 @@ class QLinearPackWeightInt8 final { public: static c10::intrusive_ptr run( const at::Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const int64_t out_features_block_size, const int64_t in_features_block_size) { auto& ctx = at::globalContext(); diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/qnnpack_utils.h b/aten/src/ATen/native/ao_sparse/quantized/cpu/qnnpack_utils.h index 6ac89681899c5..b791cbe845756 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/qnnpack_utils.h +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/qnnpack_utils.h @@ -16,9 +16,9 @@ namespace sparse { struct TORCH_API PackedLinearWeightQnnp : public LinearPackedParamsBase { - PackedLinearWeightQnnp(const at::Tensor& weight, const c10::optional& bias, const int64_t out_features_block_size /* block sparsity size across output_features */, const int64_t in_features_block_size /* block sparsity size across input_features */); + PackedLinearWeightQnnp(const at::Tensor& weight, const std::optional& bias, const int64_t out_features_block_size /* block sparsity size across output_features */, const int64_t in_features_block_size /* block 
sparsity size across input_features */); explicit PackedLinearWeightQnnp(const BCSRSerializationType& serialized); - c10::optional orig_bias_; + std::optional orig_bias_; // Separate copy of bias exist so that we can fill in zeros when // optional bias does not exist. This is to compy with qnnpack operator that // expects bias to be present. @@ -67,13 +67,13 @@ struct TORCH_API PackedLinearWeightQnnp static c10::intrusive_ptr deserialize( const BCSRSerializationType& serialized); - c10::optional bias() override { + std::optional bias() override { return orig_bias_; } static c10::intrusive_ptr prepack( const at::Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const int64_t out_features_block_size, const int64_t in_features_block_size); diff --git a/aten/src/ATen/native/cpu/AvgPoolKernel.cpp b/aten/src/ATen/native/cpu/AvgPoolKernel.cpp index 572d5af43f651..4bf03b12b1446 100644 --- a/aten/src/ATen/native/cpu/AvgPoolKernel.cpp +++ b/aten/src/ATen/native/cpu/AvgPoolKernel.cpp @@ -21,7 +21,7 @@ void cpu_avg_pool2d( int64_t dW, int64_t dH, int64_t padW, int64_t padH, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { using acc_t = at::opmath_type; auto input = input_.contiguous(); @@ -108,7 +108,7 @@ void cpu_avg_pool2d_channels_last( int64_t dW, int64_t dH, int64_t padW, int64_t padH, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { TORCH_CHECK(input_.ndimension() == 4, "2d average pooling with channels last format supports tensors with 4 dims"); auto memory_format = at::MemoryFormat::ChannelsLast; @@ -222,7 +222,7 @@ void cpu_avg_pool2d_channels_last( int64_t dW, int64_t dH, int64_t padW, int64_t padH, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { TORCH_CHECK(input_.ndimension() == 4, "2d average pooling with channels last format supports tensors with 4 dims"); auto memory_format = at::MemoryFormat::ChannelsLast; @@ -354,7 +354,7 @@ void cpu_avg_pool2d_backward( int dW, int dH, int padW, int padH, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { auto grad_output = grad_output_.contiguous(); auto grad_input = grad_input_.contiguous(); @@ -422,7 +422,7 @@ void cpu_avg_pool2d_backward_channels_last( int dW, int dH, int padW, int padH, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { auto memory_format = at::MemoryFormat::ChannelsLast; auto grad_input = grad_input_.contiguous(memory_format); auto grad_output = grad_output_.contiguous(memory_format); @@ -501,7 +501,7 @@ void avg_pool2d_kernel_impl( int64_t dW, int64_t dH, int64_t padW, int64_t padH, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { switch (input.suggest_memory_format()) { case at::MemoryFormat::Contiguous: { AT_DISPATCH_FLOATING_TYPES_AND3(kLong, kBFloat16, kHalf, input.scalar_type(), "avg_pool2d", [&] { @@ -527,7 +527,7 @@ void avg_pool2d_backward_kernel_impl( int dW, int dH, int padW, int padH, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { switch (grad_output.suggest_memory_format()) { case at::MemoryFormat::Contiguous: { AT_DISPATCH_FLOATING_TYPES_AND3(kLong, kBFloat16, kHalf, grad_output.scalar_type(), "avg_pool2d_backward", [&] { @@ -555,7 +555,7 @@ void cpu_avg_pool3d( int64_t dW, int64_t dH, int64_t dD, int64_t padW, int64_t padH, int64_t padD, bool count_include_pad, - 
c10::optional divisor_override) { + std::optional divisor_override) { using acc_t = at::opmath_type; auto input = input_.contiguous(); @@ -651,7 +651,7 @@ void cpu_avg_pool3d_channels_last( int64_t dW, int64_t dH, int64_t dD, int64_t padW, int64_t padH, int64_t padD, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { TORCH_CHECK(input_.ndimension() == 5, "3d average pooling with channels last format supports tensors with 5 dims"); auto memory_format = at::MemoryFormat::ChannelsLast3d; @@ -774,7 +774,7 @@ void cpu_avg_pool3d_channels_last( int64_t dW, int64_t dH, int64_t dD, int64_t padW, int64_t padH, int64_t padD, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { TORCH_CHECK(input_.ndimension() == 5, "3d average pooling with channels last format supports tensors with 5 dims"); auto memory_format = at::MemoryFormat::ChannelsLast3d; @@ -915,7 +915,7 @@ void cpu_avg_pool3d_backward( int dW, int dH, int dD, int padW, int padH, int padD, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { auto grad_output = grad_output_.contiguous(); auto grad_input = grad_input_.contiguous(); @@ -992,7 +992,7 @@ void cpu_avg_pool3d_backward_channels_last( int dW, int dH, int dD, int padW, int padH, int padD, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { auto memory_format = at::MemoryFormat::ChannelsLast3d; auto grad_input = grad_input_.contiguous(memory_format); auto grad_output = grad_output_.contiguous(memory_format); @@ -1083,7 +1083,7 @@ void avg_pool3d_kernel_impl( int64_t dW, int64_t dH, int64_t dD, int64_t padW, int64_t padH, int64_t padD, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { switch (input.suggest_memory_format()) { case at::MemoryFormat::Contiguous: { AT_DISPATCH_FLOATING_TYPES_AND3(kLong, kBFloat16, kHalf, input.scalar_type(), "avg_pool3d", [&] { @@ -1110,7 +1110,7 @@ void avg_pool3d_backward_kernel_impl( int dW, int dH, int dD, int padW, int padH, int padD, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { switch (grad_output.suggest_memory_format()) { case at::MemoryFormat::Contiguous: { AT_DISPATCH_FLOATING_TYPES_AND3(kLong, kBFloat16, kHalf, grad_output.scalar_type(), "avg_pool3d_backward", [&] { diff --git a/aten/src/ATen/native/cpu/DistributionKernels.cpp b/aten/src/ATen/native/cpu/DistributionKernels.cpp index 6dce481853ac2..7ee014058d70d 100644 --- a/aten/src/ATen/native/cpu/DistributionKernels.cpp +++ b/aten/src/ATen/native/cpu/DistributionKernels.cpp @@ -26,27 +26,27 @@ namespace at::native { namespace { -static void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma, c10::optional gen) { +static void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma, std::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::cauchy_kernel(iter, median, sigma, generator); } -void bernoulli_tensor_kernel(const TensorBase &self, const TensorBase &p_, c10::optional gen) { +void bernoulli_tensor_kernel(const TensorBase &self, const TensorBase &p_, std::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::bernoulli_kernel(self, p_, generator); } #if !AT_MKL_ENABLED() -void bernoulli_scalar_kernel_default(const TensorBase &self, double p, 
c10::optional gen) { +void bernoulli_scalar_kernel_default(const TensorBase &self, double p, std::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::bernoulli_kernel(self, p, generator); } -void bernoulli_scalar_kernel(const TensorBase &self, double p, c10::optional gen) { +void bernoulli_scalar_kernel(const TensorBase &self, double p, std::optional gen) { bernoulli_scalar_kernel_default(self, p, gen); } #else -void bernoulli_scalar_kernel(const TensorBase &self, double p, c10::optional gen) { +void bernoulli_scalar_kernel(const TensorBase &self, double p, std::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); int64_t seed; { @@ -99,17 +99,17 @@ void bernoulli_scalar_kernel(const TensorBase &self, double p, c10::optional gen) { +static void exponential_kernel_default(TensorIteratorBase& iter, double lambda, std::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::exponential_kernel(iter, lambda, generator); } #if (!AT_MKL_ENABLED() || defined(FBCODE_CAFFE2)) -void exponential_kernel(TensorIteratorBase& iter, double lambda, c10::optional gen) { +void exponential_kernel(TensorIteratorBase& iter, double lambda, std::optional gen) { exponential_kernel_default(iter, lambda, gen); } #else -void exponential_kernel(TensorIteratorBase &iter, double lambda, c10::optional gen) { +void exponential_kernel(TensorIteratorBase &iter, double lambda, std::optional gen) { TORCH_CHECK(isFloatingType(iter.dtype()), "Exponential distribution is a continuous probability distribution. dtype must be a floating point but you specified ", iter.dtype()); Tensor self = iter.tensor(0); @@ -195,32 +195,32 @@ void exponential_kernel(TensorIteratorBase &iter, double lambda, c10::optional gen) { +static void geometric_kernel(TensorIteratorBase& iter, double p, std::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::geometric_kernel(iter, p, generator); } -static void log_normal_kernel(TensorIteratorBase& iter, double mean, double std, c10::optional gen) { +static void log_normal_kernel(TensorIteratorBase& iter, double mean, double std, std::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::log_normal_kernel(iter, mean, std, generator); } -void uniform_kernel(TensorIteratorBase& iter, double from, double to, c10::optional gen) { +void uniform_kernel(TensorIteratorBase& iter, double from, double to, std::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::uniform_kernel(iter, from, to, generator); } -void normal_kernel(const TensorBase &self, double mean, double std, c10::optional gen) { +void normal_kernel(const TensorBase &self, double mean, double std, std::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::normal_kernel(self, mean, std, generator); } -static void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, c10::optional gen) { +static void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, std::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::random_from_to_kernel(iter, range, 
base, generator); } -static void random_kernel(TensorIteratorBase& iter, c10::optional gen) { +static void random_kernel(TensorIteratorBase& iter, std::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::random_kernel(iter, generator); } @@ -228,7 +228,7 @@ static void random_kernel(TensorIteratorBase& iter, c10::optional gen // This is the special kernel to handle single specific case: // from(inclusive) = std::numeric_limits::lowest() // to(exclusive) = None (= std::numeric_limits::max() + 1) -static void random_full_64_bits_range_kernel(TensorIteratorBase& iter, c10::optional gen) { +static void random_full_64_bits_range_kernel(TensorIteratorBase& iter, std::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::random_full_64_bits_range_kernel(iter, generator); } diff --git a/aten/src/ATen/native/cpu/DistributionTemplates.h b/aten/src/ATen/native/cpu/DistributionTemplates.h index 93a9b33b29285..961c0a3811ec1 100644 --- a/aten/src/ATen/native/cpu/DistributionTemplates.h +++ b/aten/src/ATen/native/cpu/DistributionTemplates.h @@ -57,10 +57,10 @@ void random_full_64_bits_range_kernel(TensorIteratorBase& iter, RNG generator) { template struct RandomFromToKernel { - void operator()(TensorIteratorBase& iter, uint64_t range, int64_t base, c10::optional gen) { + void operator()(TensorIteratorBase& iter, uint64_t range, int64_t base, std::optional gen) { random_from_to_kernel(iter, range, base, check_generator(gen)); } - void operator()(TensorIteratorBase& iter, c10::optional gen) { + void operator()(TensorIteratorBase& iter, std::optional gen) { random_full_64_bits_range_kernel(iter, check_generator(gen)); } }; @@ -78,7 +78,7 @@ void random_kernel(TensorIteratorBase& iter, RNG generator) { template struct RandomKernel { - void operator()(TensorIteratorBase& iter, c10::optional gen) { + void operator()(TensorIteratorBase& iter, std::optional gen) { random_kernel(iter, check_generator(gen)); } }; @@ -257,7 +257,7 @@ void normal_kernel(const TensorBase &self, double mean, double std, RNG generato template struct NormalKernel { - void operator()(Tensor& self, double mean, double std, c10::optional gen) { + void operator()(Tensor& self, double mean, double std, std::optional gen) { normal_kernel(self, mean, std, check_generator(gen)); } }; @@ -279,7 +279,7 @@ void uniform_kernel(TensorIteratorBase& iter, double from_, double to_, RNG gene template struct UniformKernel { - void operator()(TensorIteratorBase& iter, double from, double to, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double from, double to, std::optional gen) { uniform_kernel(iter, from, to, check_generator(gen)); } }; @@ -299,7 +299,7 @@ void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma, RNG ge template struct CauchyKernel { - void operator()(TensorIteratorBase& iter, double median, double sigma, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double median, double sigma, std::optional gen) { cauchy_kernel(iter, median, sigma, check_generator(gen)); } }; @@ -319,7 +319,7 @@ void log_normal_kernel(TensorIteratorBase& iter, double mean, double std, RNG ge template struct LogNormalKernel { - void operator()(TensorIteratorBase& iter, double mean, double std, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double mean, double std, std::optional gen) { log_normal_kernel(iter, mean, std, check_generator(gen)); } }; @@ 
-339,7 +339,7 @@ void geometric_kernel(TensorIteratorBase& iter, double p, RNG generator) { template struct GeometricKernel { - void operator()(TensorIteratorBase& iter, double p, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double p, std::optional gen) { geometric_kernel(iter, p, check_generator(gen)); } }; @@ -360,7 +360,7 @@ void exponential_kernel(TensorIteratorBase& iter, double lambda, RNG generator) template struct ExponentialKernel { - void operator()(TensorIteratorBase& iter, double lambda, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double lambda, std::optional gen) { exponential_kernel(iter, lambda, check_generator(gen)); } }; @@ -415,10 +415,10 @@ void bernoulli_kernel(const TensorBase &self, double p, RNG generator) { template struct BernoulliKernel { - void operator()(const TensorBase &self, double p, c10::optional gen) { + void operator()(const TensorBase &self, double p, std::optional gen) { bernoulli_kernel(self, p, check_generator(gen)); } - void operator()(const TensorBase &self, const TensorBase &p_, c10::optional gen) { + void operator()(const TensorBase &self, const TensorBase &p_, std::optional gen) { bernoulli_kernel(self, p_, check_generator(gen)); } }; diff --git a/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp b/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp index cb96f24ebdde6..28422330403c6 100644 --- a/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp +++ b/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp @@ -151,8 +151,8 @@ void cpu_flash_attention( const at::Tensor& v, double dropout_p, bool is_causal, - c10::optional attn_mask, - c10::optional scale) { + std::optional attn_mask, + std::optional scale) { // Query (Batch x Num_heads x Q_seq_len x Dim_per_head) // -> (Batch x Q_seq_len x Num_heads x Dim_per_head) // Key (Batch x Num_heads x KV_seq_len x Dim_per_head) @@ -400,8 +400,8 @@ void cpu_flash_attention_backward( const at::Tensor& logsumexp, double dropout_p, bool is_causal, - c10::optional attn_mask, - c10::optional scale) { + std::optional attn_mask, + std::optional scale) { constexpr bool is_reduced_type = is_reduced_floating_point_v; using accum_t = at::opmath_type; using Vec = vec::Vectorized; @@ -694,8 +694,8 @@ void flash_attention_kernel_impl( const at::Tensor& value, double dropout_p, bool is_causal, - c10::optional attn_mask, - c10::optional scale) { + std::optional attn_mask, + std::optional scale) { auto q_seq_len = query.size(2); AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, query.scalar_type(), "flash_attention", [&] { @@ -727,8 +727,8 @@ void flash_attention_backward_kernel_impl( const at::Tensor& logsumexp, double dropout_p, bool is_causal, - c10::optional attn_mask, - c10::optional scale) { + std::optional attn_mask, + std::optional scale) { // make sure grad_out has no zero strides (broadcasted dimensions) // since we are going to call gemm next // zero stride in leading dimension would lead to slow impl for gemm diff --git a/aten/src/ATen/native/cpu/FusedAdagradKernel.cpp b/aten/src/ATen/native/cpu/FusedAdagradKernel.cpp deleted file mode 100644 index 70085fde1e907..0000000000000 --- a/aten/src/ATen/native/cpu/FusedAdagradKernel.cpp +++ /dev/null @@ -1,225 +0,0 @@ -#define TORCH_ASSERT_ONLY_METHOD_OPERATORS -#include -#include -#include -#include -#include -#include -#include -#include -namespace at::native { - -namespace{ - -template -typename std::enable_if< - std::is_same::value || std::is_same::value, - void>:: - type inline adagrad_math( - scalar_t* param_ptr, - scalar_t* 
grad_ptr, - scalar_t* state_sum_ptr, - const double clr, - const double eps, - const double weight_decay, - const bool maximize, - const float* grad_scale_ptr, - int64_t size -){ - using lpVec = at::vec::Vectorized; - using fVec = at::vec::Vectorized; - lpVec grad_vec_to_store; - fVec param_vec1, param_vec2; - fVec grad_vec1, grad_vec2; - fVec state_sum_vec1, state_sum_vec2; - int64_t d = 0; - for (; d < size - (size % lpVec::size()); d += lpVec::size()) { - lpVec param_lpvec = lpVec::loadu(param_ptr + d); - std::tie(param_vec1, param_vec2) = vec::convert_to_float(param_lpvec); - lpVec grad_lpvec = lpVec::loadu(grad_ptr + d); - std::tie(grad_vec1, grad_vec2) = vec::convert_to_float(grad_lpvec); - if (grad_scale_ptr) { - grad_vec1 = grad_vec1 / fVec(float(*grad_scale_ptr)); - grad_vec2 = grad_vec2 / fVec(float(*grad_scale_ptr)); - grad_vec_to_store = vec::convert_from_float(grad_vec1, grad_vec2); - grad_vec_to_store.store(grad_ptr + d); - } - if (maximize){ - grad_vec1 = grad_vec1 * fVec(opmath_t(-1.0)); - grad_vec2 = grad_vec2 * fVec(opmath_t(-1.0)); - } - if (weight_decay != 0.0){ - grad_vec1 += param_vec1 * fVec(scalar_t(weight_decay)); - grad_vec2 += param_vec2 * fVec(scalar_t(weight_decay)); - } - std::tie(state_sum_vec1, state_sum_vec2) = vec::convert_to_float(lpVec::loadu(state_sum_ptr + d)); - state_sum_vec1 += grad_vec1 * grad_vec1; - state_sum_vec2 += grad_vec2 * grad_vec2; - vec::convert_from_float(state_sum_vec1, state_sum_vec2).store(state_sum_ptr + d); - - fVec std_vec1 = state_sum_vec1.sqrt() + fVec(scalar_t(eps)); - fVec std_vec2 = state_sum_vec2.sqrt() + fVec(scalar_t(eps)); - param_vec1 = param_vec1 - fVec(scalar_t(clr)) * grad_vec1 / std_vec1; - param_vec2 = param_vec2 - fVec(scalar_t(clr)) * grad_vec2 / std_vec2; - vec::convert_from_float(param_vec1, param_vec2).store(param_ptr + d); - } - scalar_t grad_val_to_store; - for (; d < size; d++) { - opmath_t grad_val = grad_ptr[d]; - opmath_t param_val = param_ptr[d]; - if (grad_scale_ptr) { - grad_val = grad_ptr[d] / opmath_t(*grad_scale_ptr); - grad_val_to_store = grad_val; - grad_ptr[d] = grad_val_to_store; - } - if (maximize) grad_val = -grad_val; - if (weight_decay != 0.0){ - grad_val += param_val * opmath_t(weight_decay); - } - opmath_t state_sum_val = state_sum_ptr[d]; - state_sum_val += grad_val * grad_val; - state_sum_ptr[d] = state_sum_val; - opmath_t std_val = std::sqrt(state_sum_val) + opmath_t(eps); - param_val -= opmath_t(clr) * grad_val / std_val; - param_ptr[d] = param_val; - } -} - - -template -typename std::enable_if< - std::is_same::value || std::is_same::value, - void>:: - type inline adagrad_math( - scalar_t* param_ptr, - scalar_t* grad_ptr, - scalar_t* state_sum_ptr, - const double clr, - const double eps, - const double weight_decay, - const bool maximize, - const float* grad_scale_ptr, - int64_t size -){ - using Vec = at::vec::Vectorized; - Vec grad_vec_to_store; - int64_t d = 0; - for (; d < size - (size % Vec::size()); d += Vec::size()) { - Vec param_vec = Vec::loadu(param_ptr + d); - Vec grad_vec = Vec::loadu(grad_ptr + d); - if (grad_scale_ptr) { - grad_vec = grad_vec / Vec(scalar_t(*grad_scale_ptr)); - grad_vec_to_store = grad_vec; - grad_vec_to_store.store(grad_ptr + d); - } - if (maximize) grad_vec = grad_vec * Vec(scalar_t(-1.0)); - if (weight_decay != 0.0){ - grad_vec += param_vec * Vec(scalar_t(weight_decay)); - } - - Vec sum_vec = Vec::loadu(state_sum_ptr + d) + grad_vec * grad_vec; - sum_vec.store(state_sum_ptr + d); - - Vec std_vec = sum_vec.sqrt() + Vec(scalar_t(eps)); - param_vec = 
param_vec - Vec(scalar_t(clr)) * grad_vec / std_vec; - param_vec.store(param_ptr + d); - } - scalar_t grad_val_to_store; - for (; d < size; d++) { - scalar_t grad_val = grad_ptr[d]; - if (grad_scale_ptr) { - grad_val = grad_ptr[d] / scalar_t(*grad_scale_ptr); - grad_val_to_store = grad_val; - grad_ptr[d] = grad_val_to_store; - } - if (maximize) grad_val = -grad_val; - if (weight_decay != 0.0){ - grad_val += param_ptr[d] * scalar_t(weight_decay); - } - state_sum_ptr[d] += grad_val * grad_val; - - scalar_t std_val = std::sqrt(state_sum_ptr[d]) + scalar_t(eps); - param_ptr[d] -= scalar_t(clr) * grad_val / std_val; - } -} - -template -void adagrad_fused_step_impl( - const at::Tensor& param, - const at::Tensor& grad, - const at::Tensor& state_sum, - const at::Tensor& state_step, - const double lr, - const double lr_decay, - const double weight_decay, - const double eps, - const bool maximize, - const float* grad_scale_ptr) { - using opmath_t = at::opmath_type; - scalar_t* param_data = param.data_ptr(); - scalar_t* grad_data = grad.data_ptr(); - scalar_t* state_sum_data = state_sum.data_ptr(); - double step = state_step.item(); - double clr = lr / (1.0 + (step - 1.0) * lr_decay); - - constexpr size_t cache_line_size = 64; - constexpr int64_t cache_line_aligned_task_unit = cache_line_size / sizeof(scalar_t); - size_t num_units = divup(param.numel(), cache_line_aligned_task_unit); - - auto adagrad_fn = [&](int64_t begin, int64_t end) { - // local pointers - begin *= cache_line_aligned_task_unit; - end = std::min(end * cache_line_aligned_task_unit, param.numel()); - scalar_t* param_ptr = param_data + begin; - scalar_t* grad_ptr = grad_data + begin; - scalar_t* state_sum_ptr = state_sum_data + begin; - - const int64_t size = end - begin; - adagrad_math( - param_ptr, - grad_ptr, - state_sum_ptr, - clr, - eps, - weight_decay, - maximize, - grad_scale_ptr, - size - ); - }; - at::parallel_for( - 0, num_units, 0, adagrad_fn); -} - -void fused_adagrad_kernel( - const at::Tensor& param, - const at::Tensor& grad, - const at::Tensor& state_sum, - const at::Tensor& state_step, - const double lr, - const double lr_decay, - const double weight_decay, - const double eps, - const bool maximize, - const float* grad_scale_ptr - ) { - Tensor grad_contiguous = grad.contiguous(); - AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, param.scalar_type(), "fused_adagrad_kernel", [&] { - adagrad_fused_step_impl( - param, - grad, - state_sum, - state_step, - lr, - lr_decay, - weight_decay, - eps, - maximize, - grad_scale_ptr); - }); -} - -} - -REGISTER_DISPATCH(fused_adagrad_stub, &fused_adagrad_kernel); -} // namespace at::native diff --git a/aten/src/ATen/native/cpu/HistogramKernel.cpp b/aten/src/ATen/native/cpu/HistogramKernel.cpp index 196bfd5647a76..829ec71fbd07c 100644 --- a/aten/src/ATen/native/cpu/HistogramKernel.cpp +++ b/aten/src/ATen/native/cpu/HistogramKernel.cpp @@ -78,7 +78,7 @@ enum BIN_SELECTION_ALGORITHM { }; template void histogramdd_cpu_contiguous(Tensor& hist, const TensorList& bin_edges, - const Tensor& input, const c10::optional& weight) { + const Tensor& input, const std::optional& weight) { TORCH_INTERNAL_ASSERT(input.dim() == 2); const int64_t N = input.size(0); @@ -100,12 +100,12 @@ void histogramdd_cpu_contiguous(Tensor& hist, const TensorList& bin_edges, TensorAccessor accessor_in = input.accessor(); - /* Constructs a c10::optional containing an accessor iff + /* Constructs a std::optional containing an accessor if * the optional weight tensor has a value. 
*/ const auto accessor_wt = weight.has_value() - ? c10::optional>(weight.value().accessor()) - : c10::optional>(); + ? std::optional>(weight.value().accessor()) + : std::optional>(); std::vector bin_seq(D); std::vector num_bin_edges(D); @@ -208,7 +208,7 @@ void histogramdd_cpu_contiguous(Tensor& hist, const TensorList& bin_edges, * Initializes hist to 0, calls into the main algorithm, and normalizes output if necessary. */ template -void histogramdd_out_cpu_template(const Tensor& self, const c10::optional& weight, bool density, +void histogramdd_out_cpu_template(const Tensor& self, const std::optional& weight, bool density, Tensor& hist, const TensorList& bin_edges) { hist.fill_(0); @@ -219,8 +219,8 @@ void histogramdd_out_cpu_template(const Tensor& self, const c10::optional(weight.value().reshape({M})) - : c10::optional(); + ? std::optional(weight.value().reshape({M})) + : std::optional(); std::vector bin_edges_contig(bin_edges.size()); for (const auto dim : c10::irange(bin_edges_contig.size())) { @@ -259,7 +259,7 @@ void histogramdd_out_cpu_template(const Tensor& self, const c10::optional& weight, bool density, +static void histogramdd_kernel_impl(const Tensor& self, const std::optional& weight, bool density, Tensor& hist, const TensorList& bin_edges) { histogramdd_out_cpu_template(self, weight, density, hist, bin_edges); } @@ -269,7 +269,7 @@ static void histogramdd_kernel_impl(const Tensor& self, const c10::optional& weight, +static void histogramdd_linear_kernel_impl(const Tensor& self, const std::optional& weight, bool density, Tensor& hist, const TensorList& bin_edges, bool local_search) { if (local_search) { // histogramdd codepath: both hist and bin_edges are eventually returned as output, diff --git a/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp b/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp index d5af5d23e8b10..0ebe127c6a8dc 100644 --- a/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp +++ b/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp @@ -54,7 +54,7 @@ void cpu_max_unpool( int64_t input_image_size = numel / channels; int64_t output_image_size = output.numel() / channels; - c10::optional optional_error_index; + std::optional optional_error_index; // parallel on dim N, C, D, H, W: [channels, input_image_size] at::parallel_for(0, numel, 0, [&](int64_t begin, int64_t end) { @@ -118,7 +118,7 @@ void cpu_max_unpool_channels_last( int64_t input_image_size = input_height * input_width; int64_t output_image_size = output_height * output_width; - c10::optional optional_error_index; + std::optional optional_error_index; // parallel on dim N, H, W at::parallel_for(0, nbatch * input_image_size, 0, [&](int64_t begin, int64_t end) { @@ -191,7 +191,7 @@ void cpu_max_unpool_backward( int64_t input_image_size = numel / channels; int64_t output_image_size = grad_output.numel() / channels; - c10::optional optional_error_index; + std::optional optional_error_index; // parallel on dim N, C, D, H, W at::parallel_for(0, numel, 0, [&](int64_t begin, int64_t end) { diff --git a/aten/src/ATen/native/cpu/MultinomialKernel.cpp b/aten/src/ATen/native/cpu/MultinomialKernel.cpp index 1c4054abdf239..f15292bd21fdb 100644 --- a/aten/src/ATen/native/cpu/MultinomialKernel.cpp +++ b/aten/src/ATen/native/cpu/MultinomialKernel.cpp @@ -24,7 +24,7 @@ multinomial_with_replacement_apply( Tensor& result, const Tensor& self, const int64_t n_sample, - c10::optional generator) { + std::optional generator) { auto gen = get_generator_or_default( generator, detail::getDefaultCPUGenerator()); // See Note [Acquire lock when using 
random generators] @@ -128,7 +128,7 @@ multinomial_with_replacement_apply( Tensor& result, const Tensor& self, const int64_t n_sample, - c10::optional generator) { + std::optional generator) { auto gen = get_generator_or_default( generator, detail::getDefaultCPUGenerator()); // See Note [Acquire lock when using random generators] @@ -230,7 +230,7 @@ static void multinomial_with_replacement_kernel_impl( Tensor& result, const Tensor& self, const int64_t n_sample, - c10::optional gen) { + std::optional gen) { AT_DISPATCH_FLOATING_TYPES_AND2( kHalf, kBFloat16, self.scalar_type(), "multinomial", [&] { multinomial_with_replacement_apply( diff --git a/aten/src/ATen/native/cpu/ReduceUtils.h b/aten/src/ATen/native/cpu/ReduceUtils.h index d6afac295aff6..8c6424f8b0eac 100644 --- a/aten/src/ATen/native/cpu/ReduceUtils.h +++ b/aten/src/ATen/native/cpu/ReduceUtils.h @@ -60,7 +60,7 @@ inline vec_scalar_t init_value() { } template -inline vec_scalar_t init_value(const c10::optional& initial) { +inline vec_scalar_t init_value(const std::optional& initial) { using acc_t = vec_scalar_t; if (initial.has_value()) { return initial.value().to(); @@ -80,7 +80,7 @@ inline void init(scalar_t* out, int64_t size, const vec_scalar_t& val) } template -inline void init(scalar_t* out, int64_t size, const c10::optional& initial) { +inline void init(scalar_t* out, int64_t size, const std::optional& initial) { using acc_t = vec_scalar_t; acc_t val = init_value(initial); init(out, size, val); diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp index 461ceb2f36383..9754b003e19c6 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -496,9 +496,9 @@ inline Vectorized> _nan_to_num_replace( static void nan_to_num_kernel( TensorIteratorBase& iter, - c10::optional nan, - c10::optional pos_inf, - c10::optional neg_inf) { + std::optional nan, + std::optional pos_inf, + std::optional neg_inf) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kBFloat16, kHalf, iter.dtype(), "nan_to_num", [&]() { using value_t = c10::scalar_value_type::type; value_t nan_replacement = static_cast(nan.value_or(0.)); diff --git a/aten/src/ATen/native/cpu/UpSampleKernel.cpp b/aten/src/ATen/native/cpu/UpSampleKernel.cpp index 67fe50c1d2a62..17b6d0a543f34 100644 --- a/aten/src/ATen/native/cpu/UpSampleKernel.cpp +++ b/aten/src/ATen/native/cpu/UpSampleKernel.cpp @@ -21,7 +21,7 @@ namespace at::native { namespace { -using scale_t = std::vector>; +using scale_t = std::vector>; // TODO: this file could benefit from a global renaming of its functions / // classes and terms, as well as from adding more comments. 
In particular: @@ -987,7 +987,7 @@ struct HelperInterpBase { template static inline std::tuple, int, unsigned int> _compute_index_ranges_int16_weights( int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims, - int64_t reshape_dim, bool align_corners, const c10::optional opt_scale, + int64_t reshape_dim, bool align_corners, const std::optional opt_scale, int interp_size, aa_filter_fn_t aa_filter_fn, bool antialias, bool align_i32=false ) { @@ -1072,7 +1072,7 @@ struct HelperInterpNearest : public HelperInterpBase { static inline std::vector compute_indices_weights( at::ScalarType scalar_type, int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims, - int64_t reshape_dim, bool align_corners, const c10::optional opt_scale + int64_t reshape_dim, bool align_corners, const std::optional opt_scale ) { TORCH_INTERNAL_ASSERT(!align_corners); @@ -1123,7 +1123,7 @@ struct HelperInterpNearestExact : public HelperInterpNearest { static inline std::vector compute_indices_weights( at::ScalarType scalar_type, int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims, - int64_t reshape_dim, bool align_corners, const c10::optional opt_scale + int64_t reshape_dim, bool align_corners, const std::optional opt_scale ) { TORCH_INTERNAL_ASSERT(!align_corners); @@ -1175,7 +1175,7 @@ struct HelperInterpLinear : public HelperInterpBase { static inline std::vector compute_indices_weights( at::ScalarType scalar_type, int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims, int64_t reshape_dim, - bool align_corners, const c10::optional opt_scale + bool align_corners, const std::optional opt_scale ) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) std::vector output; @@ -1230,7 +1230,7 @@ struct HelperInterpLinear : public HelperInterpBase { int64_t ndims, int64_t reshape_dim, bool align_corners, - const c10::optional opt_scale, + const std::optional opt_scale, bool antialias ) { @@ -1266,7 +1266,7 @@ struct HelperInterpLinear : public HelperInterpBase { int64_t ndims, int64_t reshape_dim, bool align_corners, - const c10::optional opt_scale, + const std::optional opt_scale, bool antialias, bool align_i32=false ) { @@ -1296,7 +1296,7 @@ struct HelperInterpCubic : public HelperInterpBase { static inline std::vector compute_indices_weights( at::ScalarType scalar_type, int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims, int64_t reshape_dim, - bool align_corners, const c10::optional opt_scale + bool align_corners, const std::optional opt_scale ) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) std::vector output; @@ -1364,7 +1364,7 @@ struct HelperInterpCubic : public HelperInterpBase { int64_t ndims, int64_t reshape_dim, bool align_corners, - const c10::optional opt_scale, + const std::optional opt_scale, bool antialias ) { @@ -1400,7 +1400,7 @@ struct HelperInterpCubic : public HelperInterpBase { int64_t ndims, int64_t reshape_dim, bool align_corners, - const c10::optional opt_scale, + const std::optional opt_scale, bool antialias, bool align_i32=false ) { @@ -1422,7 +1422,7 @@ struct HelperInterpCubic : public HelperInterpBase { // // Internally, it uses TensorIterator to optimize the computations. 
// - out_ndims is the number of interpolated dims: 1, 2, 3 -// - scale_type is template type for scales, typically c10::optional +// - scale_type is template type for scales, typically std::optional // - template class F is one of the above structs to compute indices and weights template void upsample_generic_Nd_kernel_impl( @@ -1686,7 +1686,7 @@ void separable_upsample_generic_Nd_kernel_impl( void upsample_nearest1d_kernel_impl( const Tensor& output, const Tensor& input, - c10::optional scales_w) { + std::optional scales_w) { upsample_generic_Nd_kernel_impl<1, scale_t, HelperInterpNearest>( output, input, false, {scales_w}); } @@ -1694,7 +1694,7 @@ void upsample_nearest1d_kernel_impl( void _upsample_nearest_exact1d_kernel_impl( const Tensor& output, const Tensor& input, - c10::optional scales_w) { + std::optional scales_w) { upsample_generic_Nd_kernel_impl<1, scale_t, HelperInterpNearestExact>( output, input, false, {scales_w}); } @@ -1726,8 +1726,8 @@ int _use_vectorized_kernel_cond_3d( void upsample_nearest2d_kernel_impl( const Tensor& output, const Tensor& input, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { if (_use_vectorized_kernel_cond_2d(output, input)) { AT_DISPATCH_FLOATING_TYPES_AND3(kByte, kBFloat16, kHalf, input.scalar_type(), "upsample_nearest2d_channels_last", [&] { @@ -1742,8 +1742,8 @@ void upsample_nearest2d_kernel_impl( void _upsample_nearest_exact2d_kernel_impl( const Tensor& output, const Tensor& input, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { if (_use_vectorized_kernel_cond_2d(output, input)) { AT_DISPATCH_FLOATING_TYPES_AND3(kByte, kBFloat16, kHalf, input.scalar_type(), "upsample_nearest2d_channels_last", [&] { cpu_upsample_nearest_channels_last(output, input, {scales_h, scales_w}); @@ -1757,9 +1757,9 @@ void _upsample_nearest_exact2d_kernel_impl( void upsample_nearest3d_kernel_impl( const Tensor& output, const Tensor& input, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_d, + std::optional scales_h, + std::optional scales_w) { if (_use_vectorized_kernel_cond_3d(output, input)) { AT_DISPATCH_FLOATING_TYPES_AND3(kByte, kBFloat16, kHalf, input.scalar_type(), "upsample_nearest3d_channels_last", [&] { @@ -1774,9 +1774,9 @@ void upsample_nearest3d_kernel_impl( void _upsample_nearest_exact3d_kernel_impl( const Tensor& output, const Tensor& input, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_d, + std::optional scales_h, + std::optional scales_w) { if (_use_vectorized_kernel_cond_3d(output, input)) { AT_DISPATCH_FLOATING_TYPES_AND3(kByte, kBFloat16, kHalf, input.scalar_type(), "upsample_nearest3d_channels_last", [&] { cpu_upsample_nearest_channels_last(output, input, {scales_d, scales_h, scales_w}); @@ -1791,7 +1791,7 @@ void upsample_linear1d_kernel_impl( const Tensor& output, const Tensor& input, bool align_corners, - c10::optional scales_w) { + std::optional scales_w) { upsample_generic_Nd_kernel_impl<1, scale_t, HelperInterpLinear>( output, input, align_corners, {scales_w}); } @@ -1801,8 +1801,8 @@ void upsample_bilinear2d_kernel_impl_float( const Tensor& output, const Tensor& input, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { // See note above about _use_vectorized_kernel_cond_2d(output, input). 
The extra cond is present // because benchmarks showed that with only 1 thread, images (C == 3) were @@ -1823,8 +1823,8 @@ void upsample_bilinear2d_kernel_impl( const Tensor& output, const Tensor& input, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { if (input.dtype() == at::kByte){ #ifdef CPU_CAPABILITY_AVX2 @@ -1852,8 +1852,8 @@ void upsample_bilinear2d_aa_kernel_impl( const Tensor& output, const Tensor& input, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { #ifdef CPU_CAPABILITY_AVX2 if (input.dtype() == at::kByte && input.size(1) <= 4) { upsample_avx_bilinear_bicubic_uint8( @@ -1875,9 +1875,9 @@ void upsample_trilinear3d_kernel_impl( const Tensor& output, const Tensor& input, bool align_corners, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_d, + std::optional scales_h, + std::optional scales_w) { if ((_use_vectorized_kernel_cond_3d(output, input))) { AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, input.scalar_type(), "upsample_trilinear3d_channels_last", [&] { cpu_upsample_linear_channels_last(output, input, align_corners, {scales_d, scales_h, scales_w}); @@ -1892,8 +1892,8 @@ void upsample_bicubic2d_kernel_impl( const Tensor& output, const Tensor& input, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { if (input.dtype() == at::kByte){ #ifdef CPU_CAPABILITY_AVX2 @@ -1922,8 +1922,8 @@ void upsample_bicubic2d_aa_kernel_impl( const Tensor& output, const Tensor& input, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { #ifdef CPU_CAPABILITY_AVX2 if (input.dtype() == at::kByte && input.size(1) <= 4) { @@ -2061,8 +2061,8 @@ void upsample_bilinear2d_aa_backward_kernel_impl( const Tensor& grad_input, const Tensor& grad_output, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { AT_DISPATCH_FLOATING_TYPES( grad_output.scalar_type(), "upsample_bilinear2d_aa_backward_cpu", [&] { cpu_upsample_genNd_backward_aa( @@ -2074,8 +2074,8 @@ void upsample_bicubic2d_aa_backward_kernel_impl( const Tensor& grad_input, const Tensor& grad_output, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { AT_DISPATCH_FLOATING_TYPES( grad_output.scalar_type(), "upsample_bicubic2d_aa_backward_cpu", [&] { cpu_upsample_genNd_backward_aa( diff --git a/aten/src/ATen/native/cpu/UpSampleMoreKernel.cpp b/aten/src/ATen/native/cpu/UpSampleMoreKernel.cpp index b97b5cefee2c8..fae70686591ee 100644 --- a/aten/src/ATen/native/cpu/UpSampleMoreKernel.cpp +++ b/aten/src/ATen/native/cpu/UpSampleMoreKernel.cpp @@ -12,7 +12,7 @@ namespace at::native { namespace { -using scale_t = std::vector>; +using scale_t = std::vector>; template , @@ -337,7 +337,7 @@ void cpu_upsample_nearest_backward_channels_last( void upsample_nearest1d_backward_kernel_impl( const Tensor& grad_input, const Tensor& grad_output, - c10::optional scales_w) { + std::optional scales_w) { AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "upsample_nearest1d_backward", [&] { cpu_upsample_nearest_backward(grad_input, grad_output, {scales_w}); }); @@ -346,7 +346,7 @@ void upsample_nearest1d_backward_kernel_impl( void 
_upsample_nearest_exact1d_backward_kernel_impl( const Tensor& grad_input, const Tensor& grad_output, - c10::optional scales_w) { + std::optional scales_w) { AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "_upsample_nearest_exact1d_backward", [&] { cpu_upsample_nearest_backward(grad_input, grad_output, {scales_w}); }); @@ -355,8 +355,8 @@ void _upsample_nearest_exact1d_backward_kernel_impl( void upsample_nearest2d_backward_kernel_impl( const Tensor& grad_input, const Tensor& grad_output, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { if (grad_output.is_contiguous(at::MemoryFormat::ChannelsLast)) { AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "upsample_nearest2d_backward_cl", [&] { cpu_upsample_nearest_backward_channels_last(grad_input, grad_output, {scales_h, scales_w}); @@ -371,8 +371,8 @@ void upsample_nearest2d_backward_kernel_impl( void _upsample_nearest_exact2d_backward_kernel_impl( const Tensor& grad_input, const Tensor& grad_output, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { if (grad_output.is_contiguous(at::MemoryFormat::ChannelsLast)) { AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "_upsample_nearest_exact2d_backward_cl", [&] { cpu_upsample_nearest_backward_channels_last(grad_input, grad_output, {scales_h, scales_w}); @@ -387,9 +387,9 @@ void _upsample_nearest_exact2d_backward_kernel_impl( void upsample_nearest3d_backward_kernel_impl( const Tensor& grad_input, const Tensor& grad_output, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_d, + std::optional scales_h, + std::optional scales_w) { if (grad_output.is_contiguous(at::MemoryFormat::ChannelsLast3d)) { AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "_upsample_nearest3d_backward_cl", [&] { cpu_upsample_nearest_backward_channels_last(grad_input, grad_output, {scales_d, scales_h, scales_w}); @@ -404,9 +404,9 @@ void upsample_nearest3d_backward_kernel_impl( void _upsample_nearest_exact3d_backward_kernel_impl( const Tensor& grad_input, const Tensor& grad_output, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_d, + std::optional scales_h, + std::optional scales_w) { if (grad_output.is_contiguous(at::MemoryFormat::ChannelsLast3d)) { AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "_upsample_nearest_exact3d_backward_cl", [&] { cpu_upsample_nearest_backward_channels_last(grad_input, grad_output, {scales_d, scales_h, scales_w}); @@ -745,7 +745,7 @@ void upsample_linear1d_backward_kernel_impl( const Tensor& grad_input, const Tensor& grad_output, bool align_corners, - c10::optional scales_w) { + std::optional scales_w) { AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "upsample_linear1d_backward", [&] { cpu_upsample_linear_backward(grad_input, grad_output, align_corners, {scales_w}); }); @@ -755,8 +755,8 @@ void upsample_bilinear2d_backward_kernel_impl( const Tensor& grad_input, const Tensor& grad_output, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { if (grad_output.is_contiguous(at::MemoryFormat::ChannelsLast)) { AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "upsample_bilinear2d_backward_channels_last", [&] { 
cpu_upsample_linear_backward_channels_last(grad_input, grad_output, align_corners, {scales_h, scales_w}); @@ -772,9 +772,9 @@ void upsample_trilinear3d_backward_kernel_impl( const Tensor& grad_input, const Tensor& grad_output, bool align_corners, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_d, + std::optional scales_h, + std::optional scales_w) { if (grad_output.is_contiguous(at::MemoryFormat::ChannelsLast3d)) { AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "upsample_trilinear3d_backward_channels_last", [&] { cpu_upsample_linear_backward_channels_last(grad_input, grad_output, align_corners, {scales_d, scales_h, scales_w}); diff --git a/aten/src/ATen/native/cuda/AveragePool2d.cu b/aten/src/ATen/native/cuda/AveragePool2d.cu index e55b9e5e96ef1..3ea9dcc854a3f 100644 --- a/aten/src/ATen/native/cuda/AveragePool2d.cu +++ b/aten/src/ATen/native/cuda/AveragePool2d.cu @@ -250,7 +250,7 @@ TORCH_IMPL_FUNC(avg_pool2d_out_cuda) int64_t padW_, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override, + std::optional divisor_override, const Tensor& output) { TensorArg output_arg{ output, "output", 1 }; TensorArg input_arg{ input_, "input_", 2 }; @@ -362,7 +362,7 @@ TORCH_IMPL_FUNC(avg_pool2d_backward_out_cuda) ( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override, + std::optional divisor_override, const Tensor& gradInput ) { TensorArg gradInput_arg{ gradInput, "gradInput", 1 }; diff --git a/aten/src/ATen/native/cuda/AveragePool3d.cu b/aten/src/ATen/native/cuda/AveragePool3d.cu index f4b0ee00d9a9a..dabcf5b63be99 100644 --- a/aten/src/ATen/native/cuda/AveragePool3d.cu +++ b/aten/src/ATen/native/cuda/AveragePool3d.cu @@ -351,7 +351,7 @@ TORCH_IMPL_FUNC(avg_pool3d_out_cuda) ( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override, + std::optional divisor_override, const Tensor& output ) { TensorArg output_arg{ output, "output", 1 }; @@ -451,7 +451,7 @@ TORCH_IMPL_FUNC(avg_pool3d_backward_out_cuda) ( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override, + std::optional divisor_override, const Tensor& gradInput ) { // See Note [Writing Nondeterministic Operations] diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp index 9e76aad45f644..c0ed650cf0219 100644 --- a/aten/src/ATen/native/cuda/Blas.cpp +++ b/aten/src/ATen/native/cuda/Blas.cpp @@ -839,11 +839,11 @@ static bool _scaled_mm_allowed_device() { std::tuple _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, - const c10::optional& bias, - c10::optional out_dtype, - const c10::optional& scale_a, - const c10::optional& scale_b, - const c10::optional& scale_result, + const std::optional& bias, + std::optional out_dtype, + const std::optional& scale_a, + const std::optional& scale_b, + const std::optional& scale_result, bool use_fast_accum, Tensor& out, Tensor& amax) { // Check sizes @@ -1022,11 +1022,11 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, std::tuple _scaled_mm_cuda(const Tensor& mat_a, const Tensor& mat_b, - const c10::optional& bias, - c10::optional out_dtype, - const c10::optional& scale_a, - const c10::optional& scale_b, - const c10::optional& scale_result, + const std::optional& bias, + std::optional out_dtype, + const std::optional& scale_a, + const std::optional& scale_b, + const std::optional& scale_result, bool use_fast_accum) { const auto out_dtype_ = 
out_dtype.value_or(mat_a.scalar_type()); Tensor out = at::empty({0}, mat_a.options().dtype(out_dtype_)); diff --git a/aten/src/ATen/native/cuda/Bucketization.cu b/aten/src/ATen/native/cuda/Bucketization.cu index 05d5421b046f8..73a68683b6c04 100644 --- a/aten/src/ATen/native/cuda/Bucketization.cu +++ b/aten/src/ATen/native/cuda/Bucketization.cu @@ -134,8 +134,8 @@ Tensor& searchsorted_out_cuda( const Tensor& self, bool out_int32, bool right, - const c10::optional side_opt, - const c10::optional& sorter_opt, + const std::optional side_opt, + const std::optional& sorter_opt, Tensor& result) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned sorter_maybe_owned = at::borrow_from_optional_tensor(sorter_opt); @@ -180,8 +180,8 @@ Tensor& searchsorted_out_cuda( const Scalar& self, bool out_int32, bool right, - const c10::optional side_opt, - const c10::optional& sorter_opt, + const std::optional side_opt, + const std::optional& sorter_opt, Tensor& result) { const Tensor& scalar_tensor = searchsorted_scalar_tensor(self, sorted_sequence.device()); return searchsorted_out_cuda(sorted_sequence, scalar_tensor, out_int32, right, side_opt, sorter_opt, result); @@ -192,8 +192,8 @@ Tensor searchsorted_cuda( const Tensor& self, bool out_int32, bool right, - const c10::optional side_opt, - const c10::optional& sorter) { + const std::optional side_opt, + const std::optional& sorter) { ScalarType scalar_type = out_int32 ? ScalarType::Int : ScalarType::Long; c10::TensorOptions options = TensorOptions().device(self.options().device()).dtype(scalar_type); Tensor result = at::empty({0}, options, MemoryFormat::Contiguous); @@ -206,8 +206,8 @@ Tensor searchsorted_cuda( const Scalar& self, bool out_int32, bool right, - const c10::optional side_opt, - const c10::optional& sorter) { + const std::optional side_opt, + const std::optional& sorter) { const Tensor& scalar_tensor = searchsorted_scalar_tensor(self, sorted_sequence.device()); return searchsorted_cuda(sorted_sequence, scalar_tensor, out_int32, right, side_opt, sorter); } diff --git a/aten/src/ATen/native/cuda/ConvolutionMM2d.cu b/aten/src/ATen/native/cuda/ConvolutionMM2d.cu index 9e45e2693cb0f..4f6ef77eb7e05 100644 --- a/aten/src/ATen/native/cuda/ConvolutionMM2d.cu +++ b/aten/src/ATen/native/cuda/ConvolutionMM2d.cu @@ -376,7 +376,7 @@ Tensor& slow_conv2d_forward_out_cuda( const Tensor &self_, const Tensor &weight_, IntArrayRef kernel_size, - const c10::optional &bias_, + const std::optional &bias_, IntArrayRef stride, IntArrayRef padding, Tensor &output) { @@ -409,7 +409,7 @@ Tensor slow_conv2d_forward_cuda( const Tensor &self, const Tensor &weight, IntArrayRef kernel_size, - const c10::optional &bias, + const std::optional &bias, IntArrayRef stride, IntArrayRef padding) { auto output = at::empty({0}, self.options()); diff --git a/aten/src/ATen/native/cuda/DepthwiseConv2d.cu b/aten/src/ATen/native/cuda/DepthwiseConv2d.cu index 69757df220886..b87dd41dd59ef 100644 --- a/aten/src/ATen/native/cuda/DepthwiseConv2d.cu +++ b/aten/src/ATen/native/cuda/DepthwiseConv2d.cu @@ -29,9 +29,120 @@ PackedTensorAccessor32 dummy_packed_accessor32() { return {nullptr, zeros.data(), zeros.data()}; } +template +__global__ void +#if !defined(USE_ROCM) +C10_LAUNCH_BOUNDS_1(at::cuda::detail::CUDA_NUM_THREADS) +#endif +conv_depthwise2d_forward_kernel_generic( + const PackedTensorAccessor32 input, + PackedTensorAccessor32 output, + const PackedTensorAccessor32 weight, + const PackedTensorAccessor32 bias, + bool biasEnabled, + index_t totalElements, + const int 
outputChannels, + const int depthwiseMultiplier, + const int inputWidth, const int inputHeight, + const int outputWidth, const int outputHeight, + const int kernelWidth, const int kernelHeight, + const int strideWidth, const int strideHeight, + const int padWidth, const int padHeight, + const int dilationWidth, const int dilationHeight) { + using acc_t = at::acc_type; + + CUDA_KERNEL_LOOP_TYPE(linearIndex, totalElements, index_t) { + //calculate n,c,h,w indices, replacing modulos by divide and multiply add, + //result is same as would be in the code below + //const int n = linearIndex / batchStride; //batchStride = outputChannels * outputHeight * outputWidth + //const int c = (linearIndex / channelStride) % outputChannels; //channelStride = outputHeight * outputWidth + //const int h = (linearIndex / outputWidth) % outputHeight; + //const int w = linearIndex % outputWidth; + + int indtmp1 = linearIndex/outputWidth; + const int w = linearIndex - indtmp1 * outputWidth; + int indtmp2 = indtmp1/outputHeight; + const int h = indtmp1 - indtmp2 * outputHeight; + indtmp1 = indtmp2; + indtmp2 = indtmp1/outputChannels; + const int c = indtmp1 - indtmp2 * outputChannels; + const int n = indtmp2; + + int inputChannel = c; + int inputChannels = outputChannels; + if (depthwiseMultiplier !=1) { + inputChannel /= depthwiseMultiplier; + inputChannels /= depthwiseMultiplier; + } + + int weightOffset = c * kernelHeight * kernelWidth; + + // By precisely computing the filtering boundaries, we avoid repeating several + // expensive edge condition checks for every fetched item. If the input element is + // resident in L1, then the extra branches and comparisons would have been + // comparable in terms of cycles with the actual data fetch. Therefore computing + // boundaries ahead of the loop showed significant performance boost. + + int kHmin = 0, kHmax = kernelHeight, kWmin = 0, kWmax = kernelWidth; + + // Top + int h_in_min = -padHeight + h * strideHeight; + if (h_in_min < 0) { + kHmin = -h_in_min / dilationHeight; + if ((-h_in_min) % dilationHeight > 0) { + kHmin++; + } + } + + // Bottom + int h_in_max = h_in_min + (kernelHeight - 1) * dilationHeight - inputHeight + 1; + if (h_in_max >= 0) { + kHmax = kernelHeight - h_in_max / dilationHeight; + if (h_in_max % dilationHeight > 0) { + kHmax--; + } + } + + // Left + int w_in_min = -padWidth + w * strideWidth; + if (w_in_min < 0) { + kWmin = -w_in_min / dilationWidth; + if ((-w_in_min) % dilationWidth > 0) { + kWmin++; + } + } + + // Right + int w_in_max = w_in_min + (kernelWidth - 1) * dilationWidth - inputWidth + 1; + if (w_in_max >= 0) { + kWmax = kernelWidth - w_in_max / dilationWidth; + if (w_in_max % dilationWidth > 0) { + kWmax--; + } + } + + acc_t value = biasEnabled ? 
static_cast(bias.data()[c]) : acc_t(0); + const index_t offset0 = (n * inputChannels + inputChannel) * inputHeight * inputWidth; + + for (int kH = kHmin; kH < kHmax; ++kH) { + const int h_in = -padHeight + h * strideHeight + kH * dilationHeight; + for (int kW = kWmin; kW < kWmax; ++kW) { + const int w_in = -padWidth + w * strideWidth + kW * dilationWidth; + const index_t offset = offset0 + h_in * inputWidth + w_in; + value += (static_cast(weight.data()[weightOffset + kH * kernelWidth + kW]) * + static_cast(input.data()[offset])); + } + } + output.data()[linearIndex] = static_cast(value); + } +} template -__global__ void conv_depthwise2d_forward_kernel( +__global__ void +#if !defined(USE_ROCM) +C10_LAUNCH_BOUNDS_1(at::cuda::detail::CUDA_NUM_THREADS) +#endif +conv_depthwise2d_forward_kernel( const PackedTensorAccessor32 input, PackedTensorAccessor32 output, const PackedTensorAccessor32 weight, @@ -315,7 +426,13 @@ void conv_depthwise2d_forward_out( const auto bias_a = has_bias ? bias.packed_accessor32() : dummy_packed_accessor32(); - if (kW == 3 && kH == 3) { + if (kW == 5 && kH == 5) { + conv_depthwise2d_forward_kernel<5> <<>>( + input_a, output_a, weight_a, bias_a, has_bias, n, outputChannels, depthwiseMultiplier, + width, height, outputWidth, outputHeight, + kW, kH, dW, dH, padW, padH, dilationW, dilationH); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } else if (kW == 3 && kH == 3) { conv_depthwise2d_forward_kernel<3> <<>>( input_a, output_a, weight_a, bias_a, has_bias, n, outputChannels, depthwiseMultiplier, width, height, outputWidth, outputHeight, @@ -328,7 +445,7 @@ void conv_depthwise2d_forward_out( kW, kH, dW, dH, padW, padH, dilationW, dilationH); C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { - conv_depthwise2d_forward_kernel<0> <<>>( + conv_depthwise2d_forward_kernel_generic<<>>( input_a, output_a, weight_a, bias_a, has_bias, n, outputChannels, depthwiseMultiplier, width, height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH); @@ -521,7 +638,7 @@ const Tensor& conv_depthwise2d_cuda_out( const Tensor &input_, const Tensor &weight_, IntArrayRef kernel_size, - const c10::optional &bias_opt, + const std::optional &bias_opt, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, @@ -556,7 +673,7 @@ Tensor conv_depthwise2d_cuda( const Tensor &input, const Tensor &weight, IntArrayRef kernel_size, - const c10::optional &bias, + const std::optional &bias, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation) { diff --git a/aten/src/ATen/native/cuda/DepthwiseConv3d.cu b/aten/src/ATen/native/cuda/DepthwiseConv3d.cu index 991471a6ef82f..62c36d66ee40e 100644 --- a/aten/src/ATen/native/cuda/DepthwiseConv3d.cu +++ b/aten/src/ATen/native/cuda/DepthwiseConv3d.cu @@ -390,7 +390,7 @@ void conv_depthwise_shape_check( Tensor conv_depthwise3d_cuda( const Tensor& input, const Tensor& weight, - IntArrayRef kernel_size, const c10::optional& bias_opt, + IntArrayRef kernel_size, const std::optional& bias_opt, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation) { diff --git a/aten/src/ATen/native/cuda/DistributionBernoulli.cu b/aten/src/ATen/native/cuda/DistributionBernoulli.cu index 89a518267d25e..5a04ae9b3450f 100644 --- a/aten/src/ATen/native/cuda/DistributionBernoulli.cu +++ b/aten/src/ATen/native/cuda/DistributionBernoulli.cu @@ -23,12 +23,12 @@ namespace at::native { -void bernoulli_tensor_kernel(const TensorBase &self, const TensorBase &p_, c10::optional gen_) { +void bernoulli_tensor_kernel(const TensorBase &self, const TensorBase &p_, std::optional gen_) 
{ auto generator = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); at::native::templates::cuda::bernoulli_kernel(self, p_, generator); } -void bernoulli_scalar_kernel(const TensorBase &self, double p, c10::optional gen) { +void bernoulli_scalar_kernel(const TensorBase &self, double p, std::optional gen) { auto iter = TensorIterator::borrowing_nullary_op(self); auto generator = get_generator_or_default(gen, cuda::detail::getDefaultCUDAGenerator()); at::native::templates::cuda::bernoulli_kernel(iter, p, generator); diff --git a/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu b/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu index a66d3cf3288fd..e6a4629930659 100644 --- a/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu +++ b/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu @@ -5,7 +5,7 @@ namespace at::native { -void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma, c10::optional gen) { +void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma, std::optional gen) { auto generator = get_generator_or_default(gen, cuda::detail::getDefaultCUDAGenerator()); at::native::templates::cuda::cauchy_kernel(iter, median, sigma, generator); } diff --git a/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu b/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu index 76cb94f6fd878..78ee9e745d36b 100644 --- a/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu +++ b/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu @@ -5,7 +5,7 @@ namespace at::native { -void exponential_kernel(TensorIteratorBase& iter, double lambda, c10::optional gen) { +void exponential_kernel(TensorIteratorBase& iter, double lambda, std::optional gen) { auto generator = get_generator_or_default(gen, cuda::detail::getDefaultCUDAGenerator()); at::native::templates::cuda::exponential_kernel(iter, lambda, generator); } diff --git a/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu b/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu index 0fe49d7bbd4b5..783863f99a9aa 100644 --- a/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu +++ b/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu @@ -5,7 +5,7 @@ namespace at::native { -void geometric_kernel(TensorIteratorBase& iter, double p_, c10::optional gen) { +void geometric_kernel(TensorIteratorBase& iter, double p_, std::optional gen) { auto generator = get_generator_or_default(gen, cuda::detail::getDefaultCUDAGenerator()); at::native::templates::cuda::geometric_kernel(iter, p_, generator); } diff --git a/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu b/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu index f394d4fea39db..148e8e00dd99b 100644 --- a/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu +++ b/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu @@ -5,7 +5,7 @@ namespace at::native { -void log_normal_kernel(TensorIteratorBase& iter, double mean, double std, c10::optional gen) { +void log_normal_kernel(TensorIteratorBase& iter, double mean, double std, std::optional gen) { auto generator = get_generator_or_default(gen, cuda::detail::getDefaultCUDAGenerator()); at::native::templates::cuda::log_normal_kernel(iter, mean, std, generator); } diff --git a/aten/src/ATen/native/cuda/DistributionNormal.cu b/aten/src/ATen/native/cuda/DistributionNormal.cu index a17c3e3da0556..bd4763e269f89 100644 --- a/aten/src/ATen/native/cuda/DistributionNormal.cu +++ b/aten/src/ATen/native/cuda/DistributionNormal.cu @@ -5,7 +5,7 @@ namespace 
at::native { -void normal_kernel(const TensorBase &self, double mean, double std, c10::optional gen) { +void normal_kernel(const TensorBase &self, double mean, double std, std::optional gen) { auto generator = get_generator_or_default(gen, cuda::detail::getDefaultCUDAGenerator()); at::native::templates::cuda::normal_kernel(self, mean, std, generator); } diff --git a/aten/src/ATen/native/cuda/DistributionRandomKernel.cu b/aten/src/ATen/native/cuda/DistributionRandomKernel.cu index 034a19c512f4f..827a12b3f28be 100644 --- a/aten/src/ATen/native/cuda/DistributionRandomKernel.cu +++ b/aten/src/ATen/native/cuda/DistributionRandomKernel.cu @@ -5,17 +5,17 @@ namespace at::native { -void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, c10::optional gen_) { +void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, std::optional gen_) { auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); at::native::templates::cuda::random_from_to_kernel(iter, range, base, gen); } -void random_full_64_bits_range_kernel(TensorIteratorBase& iter, c10::optional gen_) { +void random_full_64_bits_range_kernel(TensorIteratorBase& iter, std::optional gen_) { auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); at::native::templates::cuda::random_full_64_bits_range_kernel(iter, gen); } -void random_kernel(TensorIteratorBase& iter, c10::optional gen_) { +void random_kernel(TensorIteratorBase& iter, std::optional gen_) { auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); at::native::templates::cuda::random_kernel(iter, gen); } diff --git a/aten/src/ATen/native/cuda/DistributionTemplates.h b/aten/src/ATen/native/cuda/DistributionTemplates.h index 8ac91f3114511..8f8860f04ad1b 100644 --- a/aten/src/ATen/native/cuda/DistributionTemplates.h +++ b/aten/src/ATen/native/cuda/DistributionTemplates.h @@ -352,10 +352,10 @@ void random_full_64_bits_range_kernel(TensorIteratorBase& iter, RNG gen) { template struct RandomFromToKernel { - void operator()(TensorIteratorBase& iter, uint64_t range, int64_t base, c10::optional gen) { + void operator()(TensorIteratorBase& iter, uint64_t range, int64_t base, std::optional gen) { random_from_to_kernel(iter, range, base, check_generator(gen)); } - void operator()(TensorIteratorBase& iter, c10::optional gen) { + void operator()(TensorIteratorBase& iter, std::optional gen) { random_full_64_bits_range_kernel(iter, check_generator(gen)); } }; @@ -448,7 +448,7 @@ void normal_kernel(const TensorBase &self, double mean_, double std_, RNG gen) { template struct NormalKernel { - void operator()(const TensorBase &self, double mean, double std, c10::optional gen) { + void operator()(const TensorBase &self, double mean, double std, std::optional gen) { normal_kernel(self, mean, std, check_generator(gen)); } }; @@ -481,7 +481,7 @@ void uniform_kernel(TensorIteratorBase& iter, double from_, double to_, RNG gen) template struct UniformKernel { - void operator()(TensorIteratorBase& iter, double from, double to, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double from, double to, std::optional gen) { uniform_kernel(iter, from, to, check_generator(gen)); } }; @@ -504,7 +504,7 @@ void log_normal_kernel(TensorIteratorBase& iter, double mean_, double std_, RNG template struct LogNormalKernel { - void operator()(TensorIteratorBase& iter, double mean, double std, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double mean, double std, 
std::optional gen) { log_normal_kernel(iter, mean, std, check_generator(gen)); } }; @@ -525,7 +525,7 @@ void geometric_kernel(TensorIteratorBase& iter, double p, RNG gen) { template struct GeometricKernel { - void operator()(TensorIteratorBase& iter, double p, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double p, std::optional gen) { geometric_kernel(iter, p, check_generator(gen)); } }; @@ -548,7 +548,7 @@ void exponential_kernel(TensorIteratorBase& iter, double lambda_, RNG gen) { template struct ExponentialKernel { - void operator()(TensorIteratorBase& iter, double lambda, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double lambda, std::optional gen) { exponential_kernel(iter, lambda, check_generator(gen)); } }; @@ -571,7 +571,7 @@ void cauchy_kernel(TensorIteratorBase& iter, double median_, double sigma_, RNG template struct CauchyKernel { - void operator()(TensorIteratorBase& iter, double median, double sigma, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double median, double sigma, std::optional gen) { cauchy_kernel(iter, median, sigma, check_generator(gen)); } }; @@ -661,10 +661,10 @@ void bernoulli_kernel(TensorIteratorBase& iter, double p, RNG gen) { template struct BernoulliKernel { - void operator()(TensorIteratorBase& iter, double p, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double p, std::optional gen) { bernoulli_kernel(iter, p, check_generator(gen)); } - void operator()(const TensorBase &self, const TensorBase &p_, c10::optional gen) { + void operator()(const TensorBase &self, const TensorBase &p_, std::optional gen) { bernoulli_kernel(self, p_, check_generator(gen)); } }; diff --git a/aten/src/ATen/native/cuda/DistributionUniform.cu b/aten/src/ATen/native/cuda/DistributionUniform.cu index 2ebdfa4464598..ed34b78727dbd 100644 --- a/aten/src/ATen/native/cuda/DistributionUniform.cu +++ b/aten/src/ATen/native/cuda/DistributionUniform.cu @@ -5,7 +5,7 @@ namespace at::native { -void uniform_kernel(TensorIteratorBase& iter, double from, double to, c10::optional gen) { +void uniform_kernel(TensorIteratorBase& iter, double from, double to, std::optional gen) { auto generator = get_generator_or_default(gen, cuda::detail::getDefaultCUDAGenerator()); templates::cuda::uniform_kernel(iter, from, to, generator); } diff --git a/aten/src/ATen/native/cuda/Distributions.cpp b/aten/src/ATen/native/cuda/Distributions.cpp index c0d5abb49bf6a..21ce151276fe5 100644 --- a/aten/src/ATen/native/cuda/Distributions.cpp +++ b/aten/src/ATen/native/cuda/Distributions.cpp @@ -18,14 +18,14 @@ namespace at::native { -Tensor _s_poisson_cuda(const Tensor& lambda, c10::optional gen_) { +Tensor _s_poisson_cuda(const Tensor& lambda, std::optional gen_) { auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); Tensor ret = at::empty(lambda.sizes(), lambda.options()); launch_poisson_cuda_kernel(ret, lambda, gen); return ret; } -Tensor _s_binomial_cuda(const Tensor& count, const Tensor& prob, c10::optional gen_) { +Tensor _s_binomial_cuda(const Tensor& count, const Tensor& prob, std::optional gen_) { auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); Tensor ret = at::empty(count.sizes(), count.options()); at::TensorIterator iter = at::TensorIteratorConfig() @@ -37,14 +37,14 @@ Tensor _s_binomial_cuda(const Tensor& count, const Tensor& prob, c10::optional gen_) { +Tensor _s_gamma_cuda(const Tensor& alpha, std::optional gen_) { auto gen = get_generator_or_default(gen_, 
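All of the distribution entry points above follow the same convention: the generator argument is an optional `at::Generator`, and a missing value falls back to the device default via `get_generator_or_default(..., cuda::detail::getDefaultCUDAGenerator())`. A minimal standard-library analogue of that contract is sketched below; `default_generator` and `sample_uniform` are illustrative names, not the ATen machinery.

```cpp
#include <functional>
#include <optional>
#include <random>

std::mt19937& default_generator() {
  static std::mt19937 rng{0};  // stand-in for getDefaultCUDAGenerator()
  return rng;
}

// Mirrors get_generator_or_default(gen, default): use the caller's engine if
// one was supplied, otherwise the process-wide default.
double sample_uniform(double from, double to,
                      std::optional<std::reference_wrapper<std::mt19937>> gen
                          = std::nullopt) {
  std::uniform_real_distribution<double> dist(from, to);
  std::mt19937& rng = gen ? gen->get() : default_generator();
  return dist(rng);
}
```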
cuda::detail::getDefaultCUDAGenerator()); Tensor ret = at::empty(alpha.sizes(), alpha.options()); launch_gamma_kernel(ret, alpha, gen); return ret; } -Tensor _s_dirichlet_cuda(const Tensor& alpha, c10::optional gen_) { +Tensor _s_dirichlet_cuda(const Tensor& alpha, std::optional gen_) { auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); Tensor ret = at::empty(alpha.sizes(), alpha.options()); launch_gamma_kernel(ret, alpha, gen); diff --git a/aten/src/ATen/native/cuda/Dropout.cu b/aten/src/ATen/native/cuda/Dropout.cu index a749872ba38f3..690051e679082 100644 --- a/aten/src/ATen/native/cuda/Dropout.cu +++ b/aten/src/ATen/native/cuda/Dropout.cu @@ -366,7 +366,7 @@ dropout_cuda(CUDAGeneratorImpl* gen, const Tensor& self, double p){ } std::tuple -native_dropout_cuda(const Tensor& self, double p, c10::optional train){ +native_dropout_cuda(const Tensor& self, double p, std::optional train){ // short-cut for train == false if (train.has_value() && !train.value()) { return std::make_tuple(self.clone(), at::ones_like(self, self.options().dtype(c10::CppTypeToScalarType::value))); @@ -387,7 +387,7 @@ native_dropout_cuda(const Tensor& self, double p, c10::optional train){ // TODO: _fused_dropout_cuda is to be removed, see PR #63937 std::tuple -fused_dropout_cuda(const Tensor& self, double p, c10::optional gen_){ +fused_dropout_cuda(const Tensor& self, double p, std::optional gen_){ auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); return dropout_cuda(gen, self, p); } diff --git a/aten/src/ATen/native/cuda/EmbeddingBag.cu b/aten/src/ATen/native/cuda/EmbeddingBag.cu index 64852ae79b1f9..7c9f845b7ee26 100644 --- a/aten/src/ATen/native/cuda/EmbeddingBag.cu +++ b/aten/src/ATen/native/cuda/EmbeddingBag.cu @@ -312,7 +312,7 @@ Tensor embedding_bag_backward_cuda_max(const Tensor &grad, std::tuple _embedding_bag_forward_only_cuda(const Tensor &weight, const Tensor &indices, const Tensor &offsets, const bool scale_grad_by_freq, - const int64_t mode, bool sparse, const c10::optional& per_sample_weights_opt, + const int64_t mode, bool sparse, const std::optional& per_sample_weights_opt, bool include_last_offset, int64_t padding_idx) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned per_sample_weights_maybe_owned = at::borrow_from_optional_tensor(per_sample_weights_opt); @@ -335,7 +335,7 @@ _embedding_bag_forward_only_cuda(const Tensor &weight, const Tensor &indices, std::tuple _embedding_bag_cuda(const Tensor &weight, const Tensor &indices_, const Tensor &offsets_, const bool scale_grad_by_freq, - const int64_t mode, bool sparse, const c10::optional& per_sample_weights_opt, + const int64_t mode, bool sparse, const std::optional& per_sample_weights_opt, bool include_last_offset, int64_t padding_idx) { TORCH_CHECK(indices_.dim() == 1 || indices_.dim() == 2, "input has to be a 1D or 2D Tensor, but got Tensor of dimension ", @@ -432,7 +432,7 @@ Tensor _embedding_bag_dense_backward_cuda(const Tensor &grad_, const Tensor &ind const Tensor &bag_size_, const Tensor &max_indices, int64_t num_weights, - bool scale_grad_by_freq, int64_t mode, const c10::optional& per_sample_weights_opt, + bool scale_grad_by_freq, int64_t mode, const std::optional& per_sample_weights_opt, int64_t padding_idx) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned per_sample_weights_maybe_owned = at::borrow_from_optional_tensor(per_sample_weights_opt); diff --git a/aten/src/ATen/native/cuda/FusedAdamKernel.cu 
b/aten/src/ATen/native/cuda/FusedAdamKernel.cu index 9365f9a34ea76..99120ffc2816e 100644 --- a/aten/src/ATen/native/cuda/FusedAdamKernel.cu +++ b/aten/src/ATen/native/cuda/FusedAdamKernel.cu @@ -27,8 +27,8 @@ void _fused_adam_kernel_cuda_( const double eps, const bool amsgrad, const bool maximize, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { if (amsgrad) { TORCH_CHECK( at::native::check_fast_path_restrictions( @@ -86,8 +86,8 @@ void _fused_adam_kernel_cuda_( const double eps, const bool amsgrad, const bool maximize, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { if (lr.is_cpu()) { _fused_adam_kernel_cuda_( params, diff --git a/aten/src/ATen/native/cuda/FusedAdamWKernel.cu b/aten/src/ATen/native/cuda/FusedAdamWKernel.cu index f926199ae9680..b0fcfe23dee81 100644 --- a/aten/src/ATen/native/cuda/FusedAdamWKernel.cu +++ b/aten/src/ATen/native/cuda/FusedAdamWKernel.cu @@ -28,8 +28,8 @@ void _fused_adamw_kernel_cuda_( const double eps, const bool amsgrad, const bool maximize, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { if (amsgrad) { TORCH_CHECK( at::native::check_fast_path_restrictions( @@ -87,8 +87,8 @@ void _fused_adamw_kernel_cuda_( const double eps, const bool amsgrad, const bool maximize, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { if (lr.is_cpu()) { _fused_adamw_kernel_cuda_( params, diff --git a/aten/src/ATen/native/cuda/FusedSgdKernel.cu b/aten/src/ATen/native/cuda/FusedSgdKernel.cu index 36ac7401a2d0b..61da02ce0b888 100644 --- a/aten/src/ATen/native/cuda/FusedSgdKernel.cu +++ b/aten/src/ATen/native/cuda/FusedSgdKernel.cu @@ -157,8 +157,8 @@ void _fused_sgd_with_momentum_kernel_cuda_( const bool nesterov, const bool maximize, const bool is_first_step, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { TORCH_CHECK_GT(momentum, 0); TORCH_CHECK(at::native::check_fast_path_restrictions( {params, grads, momentum_buffer_list})); @@ -203,8 +203,8 @@ void _fused_sgd_with_momentum_kernel_cuda_( const bool nesterov, const bool maximize, const bool is_first_step, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { if (lr.is_cpu()) { _fused_sgd_with_momentum_kernel_cuda_( params, @@ -279,8 +279,8 @@ void _fused_sgd_kernel_cuda_( const bool nesterov, const bool maximize, const bool is_first_step, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { if (!momentum_buffer_list.empty()) { _fused_sgd_with_momentum_kernel_cuda_( params, @@ -343,8 +343,8 @@ void _fused_sgd_kernel_cuda_( const bool nesterov, const bool maximize, const bool is_first_step, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { if (!momentum_buffer_list.empty()) { _fused_sgd_with_momentum_kernel_cuda_( params, diff --git a/aten/src/ATen/native/cuda/IndexKernel.cpp b/aten/src/ATen/native/cuda/IndexKernel.cpp index 68770bc64e0ac..4c7ee5339afe0 100644 --- a/aten/src/ATen/native/cuda/IndexKernel.cpp 
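The fused Adam/AdamW/SGD entry points above all carry two optional tensors, `grad_scale` and `found_inf`, in support of AMP gradient scaling: when a scale is supplied the gradients are unscaled inside the kernel, and a flagged `found_inf` skips the update. The scalar sketch below illustrates that contract under those assumed semantics; `fused_sgd_step` is a standalone stand-in, not the tensor-list kernels.

```cpp
#include <optional>
#include <vector>

void fused_sgd_step(std::vector<float>& params,
                    const std::vector<float>& grads,
                    float lr,
                    std::optional<float> grad_scale,
                    std::optional<float> found_inf) {
  // If the AMP scaler reported a non-finite gradient, skip this step entirely.
  if (found_inf && *found_inf != 0.0f) {
    return;
  }
  // Gradients arrive pre-multiplied by the loss scale; divide it back out here.
  const float inv_scale = grad_scale ? 1.0f / *grad_scale : 1.0f;
  for (std::size_t i = 0; i < params.size(); ++i) {
    params[i] -= lr * grads[i] * inv_scale;
  }
}
```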
+++ b/aten/src/ATen/native/cuda/IndexKernel.cpp @@ -42,7 +42,7 @@ static Tensor & masked_select_out_cuda_impl(Tensor & result, const Tensor & self auto mask_self_expanded = expand_outplace(*mask_temp, *self_temp); at::cuda::index_out( result, *std::get<1>(mask_self_expanded), - c10::List>({*std::move(std::get<0>(mask_self_expanded))})); + c10::List>({*std::move(std::get<0>(mask_self_expanded))})); return result; } diff --git a/aten/src/ATen/native/cuda/Indexing.cu b/aten/src/ATen/native/cuda/Indexing.cu index ca37b2cefd411..b0a5d0a5a6a1b 100644 --- a/aten/src/ATen/native/cuda/Indexing.cu +++ b/aten/src/ATen/native/cuda/Indexing.cu @@ -426,7 +426,7 @@ int64_t largestIndex(const Tensor &self) { return result; } -void index_put_with_sort_kernel(Tensor & self, const c10::List>& indices, const Tensor & value, bool accumulate, bool unsafe) { +void index_put_with_sort_kernel(Tensor & self, const c10::List>& indices, const Tensor & value, bool accumulate, bool unsafe) { TORCH_CHECK(!indices.empty() || is_expandable_to(value.sizes(), self.sizes()), "shape mismatch: value tensor of shape ", value.sizes(), " cannot be broadcast to indexing result of shape ", self.sizes()); if (indices.size() > (size_t)self.dim()) { @@ -561,7 +561,7 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List>& indices, const Tensor & value, double scale, int zero_point, bool unsafe) { +void index_put_with_sort_quantized(Tensor & self, const c10::List>& indices, const Tensor & value, double scale, int zero_point, bool unsafe) { if (indices.size() > (size_t)self.dim()) { TORCH_CHECK_INDEX(false, "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); } @@ -861,7 +861,7 @@ void index_add_cuda_impl(const Tensor& self, int64_t dim, const Tensor& index, c TORCH_CHECK(index.dim() <= MAX_TENSORINFO_DIMS, "tensor has too many (>", MAX_TENSORINFO_DIMS, ") dims"); if (globalContext().deterministicAlgorithms()){ - torch::List> indices; + torch::List> indices; indices.reserve(dim + 1); for (const auto i: c10::irange(dim)) { indices.emplace_back(); diff --git a/aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp b/aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp index 045bfa8d1f90b..701669bf709e5 100644 --- a/aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp +++ b/aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp @@ -98,7 +98,7 @@ void lazy_linalg_eig_kernel(Tensor& eigenvalues, Tensor& eigenvectors, Tensor& i void lazy_svd_kernel(const Tensor& A, const bool full_matrices, const bool compute_uv, - const c10::optional& driver, + const std::optional& driver, const Tensor& U, const Tensor& S, const Tensor& Vh, diff --git a/aten/src/ATen/native/cuda/Loss.cu b/aten/src/ATen/native/cuda/Loss.cu index 1691adca87253..d87f1aa97873b 100644 --- a/aten/src/ATen/native/cuda/Loss.cu +++ b/aten/src/ATen/native/cuda/Loss.cu @@ -62,7 +62,7 @@ void binary_cross_entropy_backward_out_kernel(Tensor& grad_input, const Tensor& namespace at::native { -Tensor binary_cross_entropy_cuda(const Tensor& input, const Tensor& target, const c10::optional& weight_opt, int64_t reduction) { +Tensor binary_cross_entropy_cuda(const Tensor& input, const Tensor& target, const std::optional& weight_opt, int64_t reduction) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; @@ -72,7 +72,7 @@ Tensor binary_cross_entropy_cuda(const Tensor& input, const Tensor& target, cons input, target, weight, reduction, 
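Most operators above that accept an optional Tensor open with the boilerplate marked "[Note: hacky wrapper removal for optional tensor]": the optional is resolved once into a borrowed reference, with an undefined Tensor standing in for "not provided". The self-contained imitation below shows only the shape of that pattern; `Tensorish`, `borrow_or_undefined`, and `weighted_mean` are toy names, not `c10::MaybeOwned` or `at::borrow_from_optional_tensor`.

```cpp
#include <optional>
#include <vector>

struct Tensorish {
  std::vector<float> data;
  bool defined() const { return !data.empty(); }
};

// Resolve the optional once at the top of the operator; everything below can
// work with a plain reference and a defined()/undefined() distinction.
const Tensorish& borrow_or_undefined(const std::optional<Tensorish>& opt) {
  static const Tensorish undefined{};
  return opt ? *opt : undefined;
}

float weighted_mean(const Tensorish& input,
                    const std::optional<Tensorish>& weight_opt) {
  // Assumes weight, when defined, has the same length as input.
  const Tensorish& weight = borrow_or_undefined(weight_opt);
  float acc = 0.0f;
  for (std::size_t i = 0; i < input.data.size(); ++i) {
    acc += input.data[i] * (weight.defined() ? weight.data[i] : 1.0f);
  }
  return input.data.empty() ? 0.0f : acc / input.data.size();
}
```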
loss); } -Tensor& binary_cross_entropy_out_cuda(const Tensor& input, const Tensor& target, const c10::optional& weight_opt, int64_t reduction, Tensor& loss) { +Tensor& binary_cross_entropy_out_cuda(const Tensor& input, const Tensor& target, const std::optional& weight_opt, int64_t reduction, Tensor& loss) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; @@ -121,7 +121,7 @@ Tensor& binary_cross_entropy_out_cuda(const Tensor& input, const Tensor& target, return loss; } -Tensor binary_cross_entropy_backward_cuda(const Tensor& grad, const Tensor& input, const Tensor& target, const c10::optional& weight_opt, int64_t reduction) { +Tensor binary_cross_entropy_backward_cuda(const Tensor& grad, const Tensor& input, const Tensor& target, const std::optional& weight_opt, int64_t reduction) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; @@ -131,7 +131,7 @@ Tensor binary_cross_entropy_backward_cuda(const Tensor& grad, const Tensor& inpu grad, input, target, weight, reduction, grad_input); } -Tensor& binary_cross_entropy_backward_out_cuda(const Tensor& grad, const Tensor& input, const Tensor& target, const c10::optional& weight_opt, int64_t reduction, Tensor& grad_input) { +Tensor& binary_cross_entropy_backward_out_cuda(const Tensor& grad, const Tensor& input, const Tensor& target, const std::optional& weight_opt, int64_t reduction, Tensor& grad_input) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; diff --git a/aten/src/ATen/native/cuda/MixedDtypesLinear.cu b/aten/src/ATen/native/cuda/MixedDtypesLinear.cu index 7b55c7a952442..27563c1017fbf 100644 --- a/aten/src/ATen/native/cuda/MixedDtypesLinear.cu +++ b/aten/src/ATen/native/cuda/MixedDtypesLinear.cu @@ -196,8 +196,8 @@ mixed_dtypes_linear_dispatch_bias_activation( Tensor _mixed_dtypes_linear(const Tensor& input, const Tensor& weight, const Tensor& scale, - const c10::optional& bias_opt, - const c10::optional activation_opt) { + const std::optional& bias_opt, + const std::optional activation_opt) { #if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080) AT_ERROR("_mixed_dtypes_linear: not compiled for this platform"); return Tensor{}; diff --git a/aten/src/ATen/native/cuda/MultiMarginLoss.cu b/aten/src/ATen/native/cuda/MultiMarginLoss.cu index 989a3e116ad62..0424fcc8e3d38 100644 --- a/aten/src/ATen/native/cuda/MultiMarginLoss.cu +++ b/aten/src/ATen/native/cuda/MultiMarginLoss.cu @@ -132,7 +132,7 @@ void multi_margin_loss_shape_check( const int64_t& ndims, const Tensor& input, const Tensor& target, - const c10::optional& weight) { + const std::optional& weight) { TORCH_CHECK( (ndims == 2 && input.size(1) != 0) || (ndims == 1 && input.size(0) != 0) || ndims == 0, "Expected non-empty vector or matrix with optional 0-dim batch size, but got: ", @@ -162,7 +162,7 @@ void multi_margin_loss_shape_check( Tensor& multi_margin_loss_cuda_out( const Tensor &input_, const Tensor &target_, const Scalar &p_, const Scalar &margin_, - const c10::optional &weights_, int64_t reduction, Tensor& out_) { + const std::optional &weights_, int64_t reduction, Tensor& out_) { auto p = p_.toLong(); int64_t nframe, dim; const auto ndims = 
input_.dim(); @@ -288,7 +288,7 @@ Tensor& multi_margin_loss_cuda_out( Tensor multi_margin_loss_cuda( const Tensor &input, const Tensor &target, const Scalar &p, const Scalar &margin, - const c10::optional &weights, int64_t reduction) { + const std::optional &weights, int64_t reduction) { auto out = at::empty({0}, input.options()); multi_margin_loss_cuda_out(input, target, p, margin, weights, reduction, out); return out; @@ -296,7 +296,7 @@ Tensor multi_margin_loss_cuda( Tensor& multi_margin_loss_cuda_backward_out( const Tensor &grad_output_,const Tensor &input_, const Tensor &target_, - const Scalar &p_, const Scalar &margin_, const c10::optional &weights_, + const Scalar &p_, const Scalar &margin_, const std::optional &weights_, int64_t reduction, Tensor &grad_input_) { auto p = p_.toLong(); int64_t nframe, dim; @@ -403,7 +403,7 @@ Tensor& multi_margin_loss_cuda_backward_out( Tensor multi_margin_loss_cuda_backward( const Tensor &grad_output, const Tensor &input, const Tensor &target, - const Scalar &p, const Scalar &margin, const c10::optional &weights, + const Scalar &p, const Scalar &margin, const std::optional &weights, int64_t reduction) { auto grad_input = at::empty({0}, input.options()); multi_margin_loss_cuda_backward_out( diff --git a/aten/src/ATen/native/cuda/MultinomialKernel.cu b/aten/src/ATen/native/cuda/MultinomialKernel.cu index d8f142a813f83..3e67f5ad5bfbe 100644 --- a/aten/src/ATen/native/cuda/MultinomialKernel.cu +++ b/aten/src/ATen/native/cuda/MultinomialKernel.cu @@ -328,7 +328,7 @@ void multinomial_with_replacement_kernel_impl( Tensor& result, const Tensor& self, const int64_t n_sample, - c10::optional generator) { + std::optional generator) { auto gen = get_generator_or_default(generator, cuda::detail::getDefaultCUDAGenerator()); int inputSize = self.dim(); diff --git a/aten/src/ATen/native/cuda/NLLLoss2d.cu b/aten/src/ATen/native/cuda/NLLLoss2d.cu index 94c9aeba79f51..046ea7bbc6d7f 100644 --- a/aten/src/ATen/native/cuda/NLLLoss2d.cu +++ b/aten/src/ATen/native/cuda/NLLLoss2d.cu @@ -233,7 +233,7 @@ void nll_loss2d_forward_out_cuda_template( Tensor& total_weight, const Tensor& input, const Tensor& target, - const c10::optional& weight_opt, + const std::optional& weight_opt, int64_t reduction, int64_t ignore_index) { // See Note [Writing Nondeterministic Operations] @@ -356,7 +356,7 @@ void nll_loss2d_backward_out_cuda_template( const Tensor& grad_output, const Tensor& input, const Tensor& target, - const c10::optional& weight_opt, + const std::optional& weight_opt, int64_t reduction, int64_t ignore_index, const Tensor& total_weight) { @@ -467,7 +467,7 @@ void nll_loss2d_backward_out_cuda_template( std::tuple nll_loss2d_forward_out_cuda( const Tensor& self, const Tensor& target, - const c10::optional& weight_opt, + const std::optional& weight_opt, int64_t reduction, int64_t ignore_index, Tensor& output, @@ -480,7 +480,7 @@ std::tuple nll_loss2d_forward_out_cuda( std::tuple nll_loss2d_forward_cuda( const Tensor& self, const Tensor& target, - const c10::optional& weight_opt, + const std::optional& weight_opt, int64_t reduction, int64_t ignore_index) { auto output = at::empty({0}, self.options()); @@ -494,7 +494,7 @@ Tensor& nll_loss2d_backward_out_cuda( const Tensor& grad_output, const Tensor& self, const Tensor& target, - const c10::optional& weight_opt, + const std::optional& weight_opt, int64_t reduction, int64_t ignore_index, const Tensor& total_weight, @@ -515,7 +515,7 @@ Tensor nll_loss2d_backward_cuda( const Tensor& grad_output, const Tensor& self, const Tensor& 
target, - const c10::optional& weight_opt, + const std::optional& weight_opt, int64_t reduction, int64_t ignore_index, const Tensor& total_weight) { diff --git a/aten/src/ATen/native/cuda/NaiveConvolutionTranspose3d.cu b/aten/src/ATen/native/cuda/NaiveConvolutionTranspose3d.cu index fd6e83aa24171..56b762a051fbf 100644 --- a/aten/src/ATen/native/cuda/NaiveConvolutionTranspose3d.cu +++ b/aten/src/ATen/native/cuda/NaiveConvolutionTranspose3d.cu @@ -835,7 +835,7 @@ void slow_conv_transpose3d_acc_grad_parameters_cuda( Tensor& slow_conv_transpose3d_out_cuda(const Tensor& input, const Tensor& weight, - IntArrayRef kernel_size, const c10::optional& bias_opt, + IntArrayRef kernel_size, const std::optional& bias_opt, IntArrayRef stride, IntArrayRef padding, IntArrayRef output_padding, @@ -862,7 +862,7 @@ Tensor& slow_conv_transpose3d_out_cuda(const Tensor& input, Tensor slow_conv_transpose3d_cuda( const Tensor& input, const Tensor& weight, - IntArrayRef kernel_size, const c10::optional& bias_opt, + IntArrayRef kernel_size, const std::optional& bias_opt, IntArrayRef stride, IntArrayRef padding, IntArrayRef output_padding, diff --git a/aten/src/ATen/native/cuda/NaiveDilatedConvolution.cu b/aten/src/ATen/native/cuda/NaiveDilatedConvolution.cu index e62e959fdf4a0..cd969fa9405bb 100644 --- a/aten/src/ATen/native/cuda/NaiveDilatedConvolution.cu +++ b/aten/src/ATen/native/cuda/NaiveDilatedConvolution.cu @@ -399,7 +399,7 @@ void slow_conv_dilated_all_cuda_template( Tensor slow_conv_dilated2d_cuda( const Tensor& input, const Tensor& weight, - IntArrayRef kernel_size, const c10::optional& bias_opt, + IntArrayRef kernel_size, const std::optional& bias_opt, IntArrayRef stride_size, IntArrayRef pad_size, IntArrayRef dilation_size) { @@ -505,7 +505,7 @@ std::tuple slow_conv_dilated2d_backward_cuda( Tensor slow_conv_dilated3d_cuda( const Tensor& input, const Tensor& weight, - IntArrayRef kernel_size, const c10::optional& bias_opt, + IntArrayRef kernel_size, const std::optional& bias_opt, IntArrayRef stride_size, IntArrayRef pad_size, IntArrayRef dilation_size) { diff --git a/aten/src/ATen/native/cuda/Normalization.cu b/aten/src/ATen/native/cuda/Normalization.cu index ce0a50daae145..2bfaf13390858 100644 --- a/aten/src/ATen/native/cuda/Normalization.cu +++ b/aten/src/ATen/native/cuda/Normalization.cu @@ -95,8 +95,8 @@ inline Impl batch_norm_choose_impl(const Tensor& in1, const Tensor& in2) { } void batch_norm_elementwise( - const Tensor& out, const Tensor& self, const c10::optional& weight_opt, - const c10::optional& bias_opt, const Tensor& mean_, const Tensor& invstd_) { + const Tensor& out, const Tensor& self, const std::optional& weight_opt, + const std::optional& bias_opt, const Tensor& mean_, const Tensor& invstd_) { switch (batch_norm_choose_impl(self)) { case Impl::Contiguous: { c10::MaybeOwned weight = at::borrow_from_optional_tensor(weight_opt); @@ -432,7 +432,7 @@ void batch_norm_calc_invstd(const Tensor& out_invstd, const Tensor& running_var, } } -std::tuple batch_norm_cuda_out(const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, bool train, double momentum, double epsilon, Tensor& output, Tensor& save_mean, Tensor& save_invstd) { +std::tuple batch_norm_cuda_out(const Tensor& self, const std::optional& weight_opt, const c10::optional& bias_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, bool train, double momentum, double epsilon, Tensor& output, Tensor& save_mean, 
Tensor& save_invstd) { const bool has_running_mean = (running_mean_opt.has_value() && running_mean_opt->defined()); const bool has_running_var = (running_var_opt.has_value() && running_var_opt->defined()); TORCH_CHECK(has_running_mean == has_running_var); @@ -458,7 +458,7 @@ std::tuple batch_norm_cuda_out(const Tensor& self, co return std::tuple(output, save_mean, save_invstd); } -std::tuple batch_norm_cuda(const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, bool train, double momentum, double epsilon) { +std::tuple batch_norm_cuda(const Tensor& self, const std::optional& weight_opt, const c10::optional& bias_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, bool train, double momentum, double epsilon) { auto output = at::empty_like(self); int64_t n_input = self.size(1); auto options = self.options().dtype( @@ -482,7 +482,7 @@ std::tuple batch_norm_cuda(const Tensor& self, const c10 } std::tuple _batch_norm_with_update_cuda( - const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, + const Tensor& input, const std::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, double momentum, double eps) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); @@ -507,7 +507,7 @@ std::tuple _batch_norm_with_update_cuda( } std::tuple _batch_norm_with_update_cuda_out( - const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, + const Tensor& input, const std::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, double momentum, double eps, Tensor& out, Tensor& save_mean, Tensor& save_var, Tensor& reserve) { // See [Note: hacky wrapper removal for optional tensor] @@ -529,26 +529,26 @@ std::tuple _batch_norm_with_update_cuda_out( return std::tuple(out, save_mean, save_var, reserve); } -std::tuple _batch_norm_legit_cuda(const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, bool train, double momentum, double epsilon) { +std::tuple _batch_norm_legit_cuda(const Tensor& self, const std::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, bool train, double momentum, double epsilon) { return batch_norm_cuda(self, weight_opt, bias_opt, running_mean, running_var, train, momentum, epsilon); } -std::tuple _batch_norm_legit_no_stats_cuda(const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, bool train, double momentum, double epsilon) { +std::tuple _batch_norm_legit_no_stats_cuda(const Tensor& self, const std::optional& weight_opt, const c10::optional& bias_opt, bool train, double momentum, double epsilon) { return batch_norm_cuda(self, weight_opt, bias_opt, Tensor(), Tensor(), train, momentum, epsilon); } -std::tuple _batch_norm_legit_cuda_out(const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, bool train, double momentum, double epsilon, Tensor& output, Tensor& save_mean, Tensor& save_invstd) { +std::tuple _batch_norm_legit_cuda_out(const Tensor& self, const std::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, bool train, double momentum, double epsilon, Tensor& output, Tensor& 
save_mean, Tensor& save_invstd) { return batch_norm_cuda_out(self, weight_opt, bias_opt, running_mean, running_var, train, momentum, epsilon, output, save_mean, save_invstd); } -std::tuple _batch_norm_legit_no_stats_cuda_out(const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, bool train, double momentum, double epsilon, Tensor& output, Tensor& save_mean, Tensor& save_invstd) { +std::tuple _batch_norm_legit_no_stats_cuda_out(const Tensor& self, const std::optional& weight_opt, const c10::optional& bias_opt, bool train, double momentum, double epsilon, Tensor& output, Tensor& save_mean, Tensor& save_invstd) { return batch_norm_cuda_out(self, weight_opt, bias_opt, Tensor(), Tensor(), train, momentum, epsilon, output, save_mean, save_invstd); } std::tuple _new_batch_norm_backward_cuda( const Tensor& grad_output, const Tensor& input, const Tensor& weight, - const c10::optional& running_mean_opt, const c10::optional& running_var_opt, - const c10::optional& save_mean_opt, const c10::optional& save_var_opt, + const std::optional& running_mean_opt, const c10::optional& running_var_opt, + const std::optional& save_mean_opt, const c10::optional& save_var_opt, bool update, double eps, std::array grad_input_mask, const Tensor& reserve) { const Tensor& dummy_bias = at::empty(1); const Tensor& running_mean = c10::value_or_else(running_mean_opt, [] {return Tensor();}); @@ -567,7 +567,7 @@ std::tuple _new_batch_norm_backward_cuda( } } -std::tuple batch_norm_backward_cuda(const Tensor& grad_out, const Tensor& input, const c10::optional& weight_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, const c10::optional& save_mean_opt, const c10::optional& save_invstd_opt, bool train, double epsilon, std::array grad_input_mask) { +std::tuple batch_norm_backward_cuda(const Tensor& grad_out, const Tensor& input, const std::optional& weight_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, const c10::optional& save_mean_opt, const c10::optional& save_invstd_opt, bool train, double epsilon, std::array grad_input_mask) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight = at::borrow_from_optional_tensor(weight_opt); c10::MaybeOwned save_mean = at::borrow_from_optional_tensor(save_mean_opt); @@ -673,8 +673,8 @@ std::tuple batch_norm_stats_cuda(const Tensor& self, double epsi } Tensor batch_norm_elemt_cuda( - const Tensor& self, const c10::optional& weight_opt, - const c10::optional& bias_opt, const Tensor& mean, + const Tensor& self, const std::optional& weight_opt, + const std::optional& bias_opt, const Tensor& mean, const Tensor& invstd, double epsilon) { auto output = at::empty_like(self); // FIXME: Epsilon parameter isn't required, we don't take the reciprocal @@ -682,7 +682,7 @@ Tensor batch_norm_elemt_cuda( return output; } -Tensor& batch_norm_elemt_cuda_out(const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, +Tensor& batch_norm_elemt_cuda_out(const Tensor& self, const std::optional& weight_opt, const c10::optional& bias_opt, const Tensor& mean, const Tensor& invstd, double epsilon, Tensor& output) { // FIXME: Epsilon parameter isn't required, we don't take the reciprocal batch_norm_elementwise(output, self, weight_opt, bias_opt, mean, invstd); @@ -690,7 +690,7 @@ Tensor& batch_norm_elemt_cuda_out(const Tensor& self, const c10::optional batch_norm_gather_stats_cuda(const Tensor& self, const Tensor& mean, const Tensor& invstd, const c10::optional& 
running_mean_opt, const c10::optional& running_var_opt, double momentum, double epsilon, int64_t count) { +std::tuple batch_norm_gather_stats_cuda(const Tensor& self, const Tensor& mean, const Tensor& invstd, const std::optional& running_mean_opt, const c10::optional& running_var_opt, double momentum, double epsilon, int64_t count) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned running_mean_maybe_owned = at::borrow_from_optional_tensor(running_mean_opt); const Tensor& running_mean = *running_mean_maybe_owned; @@ -704,7 +704,7 @@ std::tuple batch_norm_gather_stats_cuda(const Tensor& self, cons std::tuple batch_norm_gather_stats_with_counts_cuda( - const Tensor& self, const Tensor& mean, const Tensor& invstd, const c10::optional& running_mean_opt /* optional */, const c10::optional& running_var_opt /* optional */, double momentum, double epsilon, const Tensor& counts) { + const Tensor& self, const Tensor& mean, const Tensor& invstd, const std::optional& running_mean_opt /* optional */, const c10::optional& running_var_opt /* optional */, double momentum, double epsilon, const Tensor& counts) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned running_mean_maybe_owned = at::borrow_from_optional_tensor(running_mean_opt); const Tensor& running_mean = *running_mean_maybe_owned; @@ -722,7 +722,7 @@ std::tuple batch_norm_gather_stats_with_counts_cuda( }); } -std::tuple batch_norm_backward_reduce_cuda(const Tensor& grad_output, const Tensor& input, const Tensor& mean, const Tensor& invstd, const c10::optional& weight_opt, bool input_g, bool weight_g, bool bias_g) { +std::tuple batch_norm_backward_reduce_cuda(const Tensor& grad_output, const Tensor& input, const Tensor& mean, const Tensor& invstd, const std::optional& weight_opt, bool input_g, bool weight_g, bool bias_g) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; @@ -759,7 +759,7 @@ std::tuple batch_norm_backward_reduce_cuda(const }); } -Tensor batch_norm_backward_elemt_cuda(const Tensor& self, const Tensor& input, const Tensor& mean, const Tensor& invstd, const c10::optional& weight_opt, const Tensor& sum_dy, const Tensor& sum_dy_xmu, const Tensor& count) { +Tensor batch_norm_backward_elemt_cuda(const Tensor& self, const Tensor& input, const Tensor& mean, const Tensor& invstd, const std::optional& weight_opt, const Tensor& sum_dy, const Tensor& sum_dy_xmu, const Tensor& count) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; @@ -794,8 +794,8 @@ Tensor batch_norm_backward_elemt_cuda(const Tensor& self, const Tensor& input, c } std::tuple batch_norm_update_stats_cuda( - const Tensor& self, const c10::optional& running_mean_opt, - const c10::optional& running_var_opt, double momentum) { + const Tensor& self, const std::optional& running_mean_opt, + const std::optional& running_var_opt, double momentum) { c10::MaybeOwned running_mean = at::borrow_from_optional_tensor(running_mean_opt); c10::MaybeOwned running_var = at::borrow_from_optional_tensor(running_var_opt); diff --git a/aten/src/ATen/native/cuda/RNN.cu b/aten/src/ATen/native/cuda/RNN.cu index a997777fe0c3a..c448ba592e4af 100644 --- a/aten/src/ATen/native/cuda/RNN.cu +++ b/aten/src/ATen/native/cuda/RNN.cu @@ -516,7 +516,7 @@ void gru_backward_impl(const Tensor& grad_hy, 
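`_new_batch_norm_backward_cuda` above resolves its optional running statistics with `c10::value_or_else(opt, [] { return Tensor(); })` rather than `value_or`. The point of passing a callable is laziness: `std::optional::value_or` evaluates its argument even when the optional is engaged, while the callable is invoked only for the empty case. A generic stand-in for that helper is sketched below; it shows the idea, not c10's declaration.

```cpp
#include <optional>
#include <utility>

template <typename T, typename F>
T value_or_else(const std::optional<T>& opt, F&& make_default) {
  // The default is constructed only when the optional is empty.
  return opt.has_value() ? *opt : std::forward<F>(make_default)();
}

// Usage: an empty optional falls back to a freshly built default.
// int x = value_or_else(std::optional<int>{}, [] { return 42; });  // x == 42
```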
const Tensor& workspace, std::tuple _thnn_fused_lstm_cell_cuda( const Tensor& input_gates, const Tensor& hidden_gates, - const Tensor& cx, const c10::optional& input_bias_opt, const c10::optional& hidden_bias_opt) { + const Tensor& cx, const std::optional& input_bias_opt, const c10::optional& hidden_bias_opt) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned input_bias_maybe_owned = at::borrow_from_optional_tensor(input_bias_opt); const Tensor& input_bias = *input_bias_maybe_owned; @@ -564,7 +564,7 @@ void checkLSTMBackwardSizes(const TensorArg& grad_hy, const TensorArg& grad_cy, checkNumel(c, workspace, exp_size[0] * exp_size[1] * 4); } -std::tuple _thnn_fused_lstm_cell_backward_impl_cuda( const c10::optional& grad_hy_opt, const c10::optional& grad_cy_opt, +std::tuple _thnn_fused_lstm_cell_backward_impl_cuda( const std::optional& grad_hy_opt, const c10::optional& grad_cy_opt, const Tensor& cx, const Tensor& cy, const Tensor& workspace, bool has_bias) { // See [Note: hacky wrapper removal for optional tensor] @@ -602,7 +602,7 @@ static constexpr int64_t GRU_WORKSPACE_MULTIPLIER = 5; std::tuple _thnn_fused_gru_cell_cuda( const Tensor& input_gates, const Tensor& hidden_gates, - const Tensor& hx, const c10::optional& input_bias_opt, const c10::optional& hidden_bias_opt) { + const Tensor& hx, const std::optional& input_bias_opt, const c10::optional& hidden_bias_opt) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned input_bias_maybe_owned = at::borrow_from_optional_tensor(input_bias_opt); const Tensor& input_bias = *input_bias_maybe_owned; diff --git a/aten/src/ATen/native/cuda/Randperm.cu b/aten/src/ATen/native/cuda/Randperm.cu index c22c99dfe6a71..bde5457e8cdd8 100644 --- a/aten/src/ATen/native/cuda/Randperm.cu +++ b/aten/src/ATen/native/cuda/Randperm.cu @@ -55,7 +55,7 @@ namespace { template struct alignas(N) OpaqueType { char data[N]; }; } -Tensor& randperm_out_cuda(int64_t n, c10::optional generator, Tensor& result) { +Tensor& randperm_out_cuda(int64_t n, std::optional generator, Tensor& result) { TORCH_CHECK(n >= 0, "n must be non-negative, got", n); check_supported_max_int_with_precision(n, result); diff --git a/aten/src/ATen/native/cuda/Repeat.cu b/aten/src/ATen/native/cuda/Repeat.cu index 0a39a0445dbe2..57a879d6f61ac 100644 --- a/aten/src/ATen/native/cuda/Repeat.cu +++ b/aten/src/ATen/native/cuda/Repeat.cu @@ -54,7 +54,7 @@ namespace at::native { Tensor repeat_interleave_cuda( const Tensor& repeat, - c10::optional output_size) { + std::optional output_size) { Tensor output; AT_DISPATCH_INDEX_TYPES( repeat.scalar_type(), "repeat_interleave_cuda", [&]() { diff --git a/aten/src/ATen/native/cuda/Resize.cpp b/aten/src/ATen/native/cuda/Resize.cpp index 2bf6266d678b9..fe844f55d2333 100644 --- a/aten/src/ATen/native/cuda/Resize.cpp +++ b/aten/src/ATen/native/cuda/Resize.cpp @@ -49,7 +49,7 @@ void resize_bytes_cuda(StorageImpl* storage, size_t size_bytes) { const Tensor& resize_cuda_( const Tensor& self, IntArrayRef size, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { if (self.has_names()) { return resize_named_tensor_(self, size, optional_memory_format); } diff --git a/aten/src/ATen/native/cuda/RreluWithNoise.cu b/aten/src/ATen/native/cuda/RreluWithNoise.cu index 463a5ce00c813..7133a4920c327 100644 --- a/aten/src/ATen/native/cuda/RreluWithNoise.cu +++ b/aten/src/ATen/native/cuda/RreluWithNoise.cu @@ -74,7 +74,7 @@ inline void _rrelu_with_noise_cuda_train( const Tensor& noise_, const Scalar& lower_, const 
Scalar& upper_, - c10::optional generator) { + std::optional generator) { auto input = input_.contiguous(); auto noise = noise_.contiguous(); Tensor tmp_output = output.contiguous(); @@ -142,7 +142,7 @@ Tensor& rrelu_with_noise_out_cuda(const Tensor& self, const Scalar& lower, const Scalar& upper, bool training, - c10::optional generator, + std::optional generator, Tensor& output) { at::native::resize_output(output, self.sizes()); @@ -176,7 +176,7 @@ Tensor rrelu_with_noise_cuda( const Scalar& lower, const Scalar& upper, bool training, - c10::optional generator) { + std::optional generator) { Tensor output = at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT); return at::native::rrelu_with_noise_out_cuda(self, noise, lower, upper, training, generator, output); } @@ -187,7 +187,7 @@ Tensor& rrelu_with_noise_cuda_( const Scalar& lower, const Scalar& upper, bool training, - c10::optional generator) { + std::optional generator) { return at::native::rrelu_with_noise_out_cuda( self, noise, lower, upper, training, generator, self); } diff --git a/aten/src/ATen/native/cuda/SegmentReduce.cu b/aten/src/ATen/native/cuda/SegmentReduce.cu index d4af81db771d3..cbdbb020d634a 100644 --- a/aten/src/ATen/native/cuda/SegmentReduce.cu +++ b/aten/src/ATen/native/cuda/SegmentReduce.cu @@ -266,7 +266,7 @@ Tensor _segment_reduce_lengths_offsets_backward_cuda_kernel( ReductionType reduction, const Tensor& lengths_or_offsets_contig, int64_t axis, - const c10::optional& initial, + const std::optional& initial, bool is_offsets_like) { axis = lengths_or_offsets_contig.dim() - 1; int64_t segment_count = is_offsets_like ? @@ -368,7 +368,7 @@ Tensor _segment_reduce_lengths_backward_cuda_kernel( ReductionType reduction, const Tensor& lengths_contig, int64_t axis, - const c10::optional& initial) { + const std::optional& initial) { return _segment_reduce_lengths_offsets_backward_cuda_kernel( grad_contig, output_contig, data_contig, reduction, lengths_contig, axis, initial, /*is_offsets_like=*/false); } @@ -380,7 +380,7 @@ Tensor _segment_reduce_offsets_backward_cuda_kernel( ReductionType reduction, const Tensor& offsets_contig, int64_t axis, - const c10::optional& initial) { + const std::optional& initial) { return _segment_reduce_lengths_offsets_backward_cuda_kernel( grad_contig, output_contig, data_contig, reduction, offsets_contig, axis, initial, /*is_offsets_like=*/true); } @@ -390,7 +390,7 @@ Tensor _segment_reduce_lengths_offsets_cuda_kernel( const Tensor& data, const Tensor& lengths_or_offsets, int64_t axis, - const c10::optional& initial, + const std::optional& initial, bool is_offsets_like) { // data and lengths_or_offsets should be contiguous from the call to .contiguous in segment_reduce_kernel TORCH_CHECK(data.is_contiguous()); @@ -575,7 +575,7 @@ Tensor _segment_reduce_lengths_cuda_kernel( const Tensor& data, const Tensor& lengths, int64_t axis, - const c10::optional& initial) { + const std::optional& initial) { return _segment_reduce_lengths_offsets_cuda_kernel( reduction, data, lengths, axis, initial, /*is_offsets_like=*/false); } @@ -585,7 +585,7 @@ Tensor _segment_reduce_offsets_cuda_kernel( const Tensor& data, const Tensor& offsets, int64_t axis, - const c10::optional& initial) { + const std::optional& initial) { return _segment_reduce_lengths_offsets_cuda_kernel( reduction, data, offsets, axis, initial, /*is_offsets_like=*/true); } diff --git a/aten/src/ATen/native/cuda/SoftMax.cu b/aten/src/ATen/native/cuda/SoftMax.cu index cffd52624f9e3..97528b48d8cb0 100644 --- a/aten/src/ATen/native/cuda/SoftMax.cu 
+++ b/aten/src/ATen/native/cuda/SoftMax.cu @@ -1113,7 +1113,7 @@ TORCH_IMPL_FUNC(softmax_backward_cuda_out) host_softmax_backward(tmp, output, dim, half_to_float, grad_input); } -Tensor masked_softmax_cuda(const Tensor& input_, const Tensor& mask_, const c10::optional dim_, const c10::optional mask_type_) { +Tensor masked_softmax_cuda(const Tensor& input_, const Tensor& mask_, const std::optional dim_, const c10::optional mask_type_) { Tensor output = at::empty_like(input_, input_.options()); TORCH_CHECK(mask_.scalar_type() == ScalarType::Bool, "Mask should be a boolean tensor"); @@ -1211,7 +1211,7 @@ Tensor masked_softmax_backward_cuda( const Tensor& grad_, const Tensor& output_, const Tensor& mask_, - const c10::optional dim_) { + const std::optional dim_) { Tensor grad_input = at::empty_like(grad_, grad_.options()); if (grad_.numel() == 0) { return grad_input; diff --git a/aten/src/ATen/native/cuda/SparseBinaryOpIntersectionKernel.cu b/aten/src/ATen/native/cuda/SparseBinaryOpIntersectionKernel.cu index 62282659f6e8b..2cd1dd893a447 100644 --- a/aten/src/ATen/native/cuda/SparseBinaryOpIntersectionKernel.cu +++ b/aten/src/ATen/native/cuda/SparseBinaryOpIntersectionKernel.cu @@ -167,7 +167,7 @@ struct CUDAValueSelectionIntersectionKernel { } }; -using OptTensor = c10::optional; +using OptTensor = std::optional; void mul_sparse_sparse_out_cuda_kernel( Tensor& result, diff --git a/aten/src/ATen/native/cuda/SpectralOps.cpp b/aten/src/ATen/native/cuda/SpectralOps.cpp index 1032fb28d799c..5d93797c5bd21 100644 --- a/aten/src/ATen/native/cuda/SpectralOps.cpp +++ b/aten/src/ATen/native/cuda/SpectralOps.cpp @@ -218,7 +218,7 @@ static const Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_ CuFFTParams Params(input.strides(), out.strides(), signal_size, fft_type, value_type); CuFFTParamsLRUCache& plan_cache = cufft_get_plan_cache(input.device().index()); std::unique_lock guard(plan_cache.mutex, std::defer_lock); - c10::optional uncached_plan; + std::optional uncached_plan; const CuFFTConfig * config = nullptr; // Workaround for gh-63152, gh-58724 diff --git a/aten/src/ATen/native/cuda/SummaryOps.cu b/aten/src/ATen/native/cuda/SummaryOps.cu index f2626ccff4db7..30adb0b3e5c1a 100644 --- a/aten/src/ATen/native/cuda/SummaryOps.cu +++ b/aten/src/ATen/native/cuda/SummaryOps.cu @@ -360,7 +360,7 @@ Tensor _histc_cuda_template( namespace native { Tensor _bincount_cuda( - const Tensor& self, const c10::optional& weights_opt, + const Tensor& self, const std::optional& weights_opt, int64_t minlength) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weights_maybe_owned = at::borrow_from_optional_tensor(weights_opt); diff --git a/aten/src/ATen/native/cuda/TensorFactories.cu b/aten/src/ATen/native/cuda/TensorFactories.cu index 42ea83a4b8bf0..87daceacdfba0 100644 --- a/aten/src/ATen/native/cuda/TensorFactories.cu +++ b/aten/src/ATen/native/cuda/TensorFactories.cu @@ -51,7 +51,7 @@ Tensor& eye_out_cuda(int64_t n, int64_t m, Tensor& result) { return result; } -Tensor empty_cuda(IntArrayRef size, c10::optional dtype_opt, c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt, c10::optional memory_format_opt) { +Tensor empty_cuda(IntArrayRef size, std::optional dtype_opt, c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt, c10::optional memory_format_opt) { Tensor result = at::detail::empty_cuda(size, dtype_opt, layout_opt, device_opt, pin_memory_opt, memory_format_opt); // See Note [Enabling Deterministic Operations] if 
(C10_UNLIKELY(at::globalContext().deterministicAlgorithms() && at::globalContext().deterministicFillUninitializedMemory())) { @@ -61,10 +61,10 @@ Tensor empty_cuda(IntArrayRef size, c10::optional dtype_opt, c10::op } Tensor _efficientzerotensor_cuda(IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { auto device_ = device_or_default(device); if (!device_.has_index()) { device_.set_index(at::cuda::current_device()); @@ -77,7 +77,7 @@ Tensor _efficientzerotensor_cuda(IntArrayRef size, } -Tensor empty_strided_cuda(IntArrayRef size, IntArrayRef stride, c10::optional dtype_opt, c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) { +Tensor empty_strided_cuda(IntArrayRef size, IntArrayRef stride, std::optional dtype_opt, c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) { Tensor result = at::detail::empty_strided_cuda(size, stride, dtype_opt, layout_opt, device_opt, pin_memory_opt); // See Note [Enabling Deterministic Operations] if (C10_UNLIKELY(at::globalContext().deterministicAlgorithms() && at::globalContext().deterministicFillUninitializedMemory())) { @@ -274,8 +274,8 @@ void tril_indices_kernel(scalar_t * tensor, // implementation, please enable them in test/test_cuda.py and make sure they // pass on your local server. Tensor tril_indices_cuda( - int64_t row, int64_t col, int64_t offset, c10::optional dtype_opt, - c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) { + int64_t row, int64_t col, int64_t offset, std::optional dtype_opt, + std::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) { check_args(row, col, layout_opt); auto tril_size = get_tril_size(row, col, offset); @@ -350,8 +350,8 @@ void triu_indices_kernel(scalar_t * tensor, // implementation, please enable them in test/test_cuda.py and make sure they // pass on your local server. 
Tensor triu_indices_cuda( - int64_t row, int64_t col, int64_t offset, c10::optional dtype_opt, - c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) { + int64_t row, int64_t col, int64_t offset, std::optional dtype_opt, + std::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) { check_args(row, col, layout_opt); auto triu_size = row * col - get_tril_size(row, col, offset - 1); diff --git a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu index 451c15443fa8e..1dd47c93fae94 100644 --- a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu +++ b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu @@ -221,9 +221,9 @@ C10_HOST_DEVICE static inline scalar_t _nan_to_num_replace(scalar_t a, scalar_t void nan_to_num_kernel_cuda( TensorIteratorBase& iter, - c10::optional nan, - c10::optional pos_inf, - c10::optional neg_inf) { + std::optional nan, + std::optional pos_inf, + std::optional neg_inf) { if (isComplexType(iter.dtype())) { AT_DISPATCH_COMPLEX_TYPES(iter.dtype(), "nan_to_num", [&]() { using value_t = scalar_t::value_type; diff --git a/aten/src/ATen/native/cuda/Unique.cu b/aten/src/ATen/native/cuda/Unique.cu index e2654be0135f8..39e80e0a68c3c 100644 --- a/aten/src/ATen/native/cuda/Unique.cu +++ b/aten/src/ATen/native/cuda/Unique.cu @@ -218,7 +218,7 @@ unique_dim_consecutive_cuda(const Tensor& self, const int64_t dim, const bool re } std::tuple -unique_consecutive_cuda(const Tensor& self, const bool return_inverse, const bool return_counts, c10::optional dim) { +unique_consecutive_cuda(const Tensor& self, const bool return_inverse, const bool return_counts, std::optional dim) { if (!dim.has_value()) { return AT_DISPATCH_V2(self.scalar_type(), "unique", AT_WRAP([&] { // The current CUDA implementation of unique always sort due to the diff --git a/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu b/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu index 6673fe4993f39..31cdf0a5688b7 100644 --- a/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu +++ b/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu @@ -170,8 +170,8 @@ static void upsample_bicubic2d_out_cuda_template( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { TensorArg input_arg{input, "input", 1}, output_arg{output, "output", 2}; checkAllSameGPU(__func__, {input_arg, output_arg}); @@ -225,8 +225,8 @@ static void upsample_bicubic2d_backward_out_cuda_template( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { TensorArg grad_input_arg{grad_input, "grad_input", 1}, grad_output_arg{grad_output_, "grad_output_", 2}; checkAllSameGPU(__func__, {grad_output_arg, grad_input_arg}); @@ -275,8 +275,8 @@ TORCH_IMPL_FUNC(upsample_bicubic2d_out_cuda) ( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& output) { upsample_bicubic2d_out_cuda_template(output, input, output_size, align_corners, scales_h, scales_w); } @@ -286,8 +286,8 @@ TORCH_IMPL_FUNC(upsample_bicubic2d_backward_out_cuda) ( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input) { // See Note [Writing 
Nondeterministic Operations] // Nondeterministic because of atomicAdd usage diff --git a/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu b/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu index 3c80cb7877a5c..4bd230ab8fe76 100644 --- a/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu +++ b/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu @@ -264,8 +264,8 @@ static void upsample_bilinear2d_out_cuda_template( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { TensorArg input_arg{input, "input", 1}, output_arg{output, "output", 2}; checkAllSameGPU(__func__, {input_arg, output_arg}); @@ -362,8 +362,8 @@ static void upsample_bilinear2d_backward_out_cuda_template( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { TensorArg grad_input_arg{grad_input, "grad_input", 1}, grad_output_arg{grad_output_, "grad_output_", 2}; checkAllSameGPU(__func__, {grad_output_arg, grad_input_arg}); @@ -674,8 +674,8 @@ static void upsample_gen2d_aa_out_cuda_template( const Tensor& input_, IntArrayRef output_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { TensorArg input_arg{input_, "input_", 1}, output_arg{output, "output", 2}; checkAllSameGPU("upsample_gen2d_aa_out_cuda", {input_arg, output_arg}); @@ -769,8 +769,8 @@ static void upsample_gen2d_aa_backward_out_cuda_template( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { // Inspired from UpSampleBicubic2d.cu::upsample_bicubic2d_backward_out_cuda_template TensorArg grad_input_arg{grad_input, "grad_input", 1}, @@ -844,8 +844,8 @@ TORCH_IMPL_FUNC(upsample_bilinear2d_out_cuda) ( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& output) { upsample_bilinear2d_out_cuda_template(output, input, output_size, align_corners, scales_h, scales_w); } @@ -855,8 +855,8 @@ TORCH_IMPL_FUNC(upsample_bilinear2d_backward_out_cuda) ( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input) { // See Note [Writing Nondeterministic Operations] // Nondeterministic because of atomicAdd usage @@ -869,8 +869,8 @@ TORCH_IMPL_FUNC(_upsample_bilinear2d_aa_out_cuda) ( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& output) { upsample_gen2d_aa_out_cuda_template( @@ -882,8 +882,8 @@ TORCH_IMPL_FUNC(_upsample_bilinear2d_aa_backward_out_cuda) ( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input) { // See Note [Writing Nondeterministic Operations] // Nondeterministic because of atomicAdd usage @@ -898,8 +898,8 @@ TORCH_IMPL_FUNC(_upsample_bicubic2d_aa_out_cuda) ( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, 
+ std::optional scales_w, const Tensor& output) { upsample_gen2d_aa_out_cuda_template( output, input, output_size, align_corners, scales_h, scales_w); @@ -910,8 +910,8 @@ TORCH_IMPL_FUNC(_upsample_bicubic2d_aa_backward_out_cuda) ( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input) { // See Note [Writing Nondeterministic Operations] // Nondeterministic because of atomicAdd usage diff --git a/aten/src/ATen/native/cuda/UpSampleLinear1d.cu b/aten/src/ATen/native/cuda/UpSampleLinear1d.cu index dfba2f5479071..ebd11e234d7b3 100644 --- a/aten/src/ATen/native/cuda/UpSampleLinear1d.cu +++ b/aten/src/ATen/native/cuda/UpSampleLinear1d.cu @@ -121,7 +121,7 @@ static void upsample_linear1d_out_cuda_template( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales) { + std::optional scales) { TensorArg input_arg{input, "input", 1}, output_arg{output, "output", 2}; checkAllSameGPU(__func__, {input_arg, output_arg}); @@ -164,7 +164,7 @@ static void upsample_linear1d_backward_out_cuda_template( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales) { + std::optional scales) { TensorArg grad_output_arg{grad_output_, "grad_output_", 1}, grad_input_arg{grad_input, "grad_input", 2}; checkAllSameGPU(__func__, {grad_output_arg, grad_input_arg}); @@ -208,7 +208,7 @@ TORCH_IMPL_FUNC(upsample_linear1d_out_cuda) ( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales, + std::optional scales, const Tensor& output ) { upsample_linear1d_out_cuda_template(output, input, output_size, align_corners, scales); @@ -219,7 +219,7 @@ TORCH_IMPL_FUNC(upsample_linear1d_backward_out_cuda) ( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales, + std::optional scales, const Tensor& grad_input ) { // See Note [Writing Nondeterministic Operations] diff --git a/aten/src/ATen/native/cuda/UpSampleNearest1d.cu b/aten/src/ATen/native/cuda/UpSampleNearest1d.cu index 3085cba0a1d16..1073f8d9dbb51 100644 --- a/aten/src/ATen/native/cuda/UpSampleNearest1d.cu +++ b/aten/src/ATen/native/cuda/UpSampleNearest1d.cu @@ -104,7 +104,7 @@ static void upsample_nearest1d_out_cuda_template( const Tensor& output, const Tensor& input_, IntArrayRef output_size, - c10::optional scales) { + std::optional scales) { TensorArg input_arg{input_, "input_", 1}, output_arg{output, "output", 2}; checkAllSameGPU("upsample_nearest1d_out_cuda", {input_arg, output_arg}); @@ -149,7 +149,7 @@ static void upsample_nearest1d_backward_out_cuda_template( const Tensor& grad_output_, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales) { + std::optional scales) { TensorArg grad_input_arg{grad_input, "grad_input", 1}, grad_output_arg{grad_output_, "grad_output_", 2}; checkAllSameGPU( @@ -198,7 +198,7 @@ static void upsample_nearest1d_backward_out_cuda_template( TORCH_IMPL_FUNC(upsample_nearest1d_out_cuda) ( const Tensor& input, IntArrayRef output_size, - c10::optional scales, + std::optional scales, const Tensor& output ) { upsample_nearest1d_out_cuda_template( @@ -208,7 +208,7 @@ TORCH_IMPL_FUNC(upsample_nearest1d_out_cuda) ( TORCH_IMPL_FUNC(_upsample_nearest_exact1d_out_cuda) ( const Tensor& input, IntArrayRef output_size, - c10::optional scales, + std::optional scales, const Tensor& output ) { upsample_nearest1d_out_cuda_template(output, input, output_size, scales); @@ 
-218,7 +218,7 @@ TORCH_IMPL_FUNC(upsample_nearest1d_backward_out_cuda) ( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales, + std::optional scales, const Tensor& grad_input ) { upsample_nearest1d_backward_out_cuda_template( @@ -229,7 +229,7 @@ TORCH_IMPL_FUNC(_upsample_nearest_exact1d_backward_out_cuda) ( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales, + std::optional scales, const Tensor& grad_input ) { upsample_nearest1d_backward_out_cuda_template( diff --git a/aten/src/ATen/native/cuda/UpSampleNearest2d.cu b/aten/src/ATen/native/cuda/UpSampleNearest2d.cu index 197fc9d60bef7..36db81cd277aa 100644 --- a/aten/src/ATen/native/cuda/UpSampleNearest2d.cu +++ b/aten/src/ATen/native/cuda/UpSampleNearest2d.cu @@ -207,8 +207,8 @@ static void upsample_nearest2d_out_cuda_template( const Tensor& output, const Tensor& input_, IntArrayRef output_size, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { TensorArg input_arg{input_, "input_", 1}, output_arg{output, "output", 2}; checkAllSameGPU(__func__, {input_arg, output_arg}); @@ -337,8 +337,8 @@ static void upsample_nearest2d_backward_out_cuda_template( const Tensor& grad_output_, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { TensorArg grad_input_arg{grad_input, "grad_input", 1}, grad_output_arg{grad_output_, "grad_output_", 2}; checkAllSameGPU(__func__, {grad_output_arg, grad_input_arg}); @@ -446,8 +446,8 @@ static void upsample_nearest2d_backward_out_cuda_template( TORCH_IMPL_FUNC(upsample_nearest2d_out_cuda) ( const Tensor& input, IntArrayRef output_size, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& output) { upsample_nearest2d_out_cuda_template( output, input, output_size, scales_h, scales_w); @@ -456,8 +456,8 @@ TORCH_IMPL_FUNC(upsample_nearest2d_out_cuda) ( TORCH_IMPL_FUNC(_upsample_nearest_exact2d_out_cuda) ( const Tensor& input, IntArrayRef output_size, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& output) { upsample_nearest2d_out_cuda_template( output, input, output_size, scales_h, scales_w); @@ -467,8 +467,8 @@ TORCH_IMPL_FUNC(upsample_nearest2d_backward_out_cuda) ( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input) { upsample_nearest2d_backward_out_cuda_template( grad_input, grad_output, output_size, input_size, scales_h, scales_w); @@ -478,8 +478,8 @@ TORCH_IMPL_FUNC(_upsample_nearest_exact2d_backward_out_cuda) ( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input) { upsample_nearest2d_backward_out_cuda_template( grad_input, grad_output, output_size, input_size, scales_h, scales_w); diff --git a/aten/src/ATen/native/cuda/UpSampleNearest3d.cu b/aten/src/ATen/native/cuda/UpSampleNearest3d.cu index 31a7ee92e7488..53e8d71e79a79 100644 --- a/aten/src/ATen/native/cuda/UpSampleNearest3d.cu +++ b/aten/src/ATen/native/cuda/UpSampleNearest3d.cu @@ -148,9 +148,9 @@ static void upsample_nearest3d_out_cuda_template( const Tensor& output, const 
Tensor& input_, IntArrayRef output_size, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_d, + std::optional scales_h, + std::optional scales_w) { TensorArg input_arg{input_, "input_", 1}, output_arg{output, "output", 2}; checkAllSameGPU(__func__, {input_arg, output_arg}); @@ -223,9 +223,9 @@ static void upsample_nearest3d_backward_out_cuda_template( const Tensor& grad_output_, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_d, + std::optional scales_h, + std::optional scales_w) { TensorArg grad_input_arg{grad_input, "grad_input", 1}, grad_output_arg{grad_output_, "grad_output_", 2}; checkAllSameGPU( @@ -292,9 +292,9 @@ static void upsample_nearest3d_backward_out_cuda_template( TORCH_IMPL_FUNC(upsample_nearest3d_out_cuda) ( const Tensor& input, IntArrayRef output_size, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_d, + std::optional scales_h, + std::optional scales_w, const Tensor& output) { upsample_nearest3d_out_cuda_template( output, input, output_size, scales_d, scales_h, scales_w); @@ -303,9 +303,9 @@ TORCH_IMPL_FUNC(upsample_nearest3d_out_cuda) ( TORCH_IMPL_FUNC(_upsample_nearest_exact3d_out_cuda) ( const Tensor& input, IntArrayRef output_size, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_d, + std::optional scales_h, + std::optional scales_w, const Tensor& output) { upsample_nearest3d_out_cuda_template(output, input, output_size, scales_d, scales_h, scales_w); } @@ -314,9 +314,9 @@ TORCH_IMPL_FUNC(upsample_nearest3d_backward_out_cuda) ( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_d, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input) { upsample_nearest3d_backward_out_cuda_template( grad_input, grad_output, output_size, input_size, scales_d, scales_h, scales_w); @@ -326,9 +326,9 @@ TORCH_IMPL_FUNC(_upsample_nearest_exact3d_backward_out_cuda) ( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_d, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input) { upsample_nearest3d_backward_out_cuda_template( grad_input, grad_output, output_size, input_size, scales_d, scales_h, scales_w); diff --git a/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu b/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu index 43cc09d34b677..0abe0b6bcb4d2 100644 --- a/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu +++ b/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu @@ -245,9 +245,9 @@ static void upsample_trilinear3d_out_cuda_template( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_d, + std::optional scales_h, + std::optional scales_w) { TensorArg input_arg{input, "input", 1}, output_arg{output, "output", 2}; checkAllSameGPU("upsample_trilinear3d_out_cuda", {input_arg, output_arg}); @@ -301,9 +301,9 @@ static void upsample_trilinear3d_backward_out_cuda_template( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w) { + std::optional 
scales_d, + std::optional scales_h, + std::optional scales_w) { TensorArg grad_input_arg{grad_input_, "grad_input_", 1}, grad_output_arg{grad_output_, "grad_output_", 2}; checkAllSameGPU( @@ -377,9 +377,9 @@ TORCH_IMPL_FUNC(upsample_trilinear3d_out_cuda) ( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_d, + std::optional scales_h, + std::optional scales_w, const Tensor& output) { upsample_trilinear3d_out_cuda_template(output, input, output_size, align_corners, scales_d, scales_h, scales_w); } @@ -389,9 +389,9 @@ TORCH_IMPL_FUNC(upsample_trilinear3d_backward_out_cuda) ( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_d, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input) { // See Note [Writing Nondeterministic Operations] // Nondeterministic because of atomicAdd usage diff --git a/aten/src/ATen/native/cuda/fused_adam_amsgrad_impl.cu b/aten/src/ATen/native/cuda/fused_adam_amsgrad_impl.cu index 9cebb82e512a8..cef07de1b41f9 100644 --- a/aten/src/ATen/native/cuda/fused_adam_amsgrad_impl.cu +++ b/aten/src/ATen/native/cuda/fused_adam_amsgrad_impl.cu @@ -21,8 +21,8 @@ void _fused_adam_amsgrad_cuda_impl_( const double weight_decay, const double eps, const bool maximize, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { std::vector> tensor_lists{ params.vec(), grads.vec(), @@ -72,8 +72,8 @@ void _fused_adam_amsgrad_cuda_impl_( const double weight_decay, const double eps, const bool maximize, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { std::vector> tensor_lists{ params.vec(), grads.vec(), diff --git a/aten/src/ATen/native/cuda/fused_adam_impl.cu b/aten/src/ATen/native/cuda/fused_adam_impl.cu index 7f2843b3b4ee4..2c1f5ce0d6d57 100644 --- a/aten/src/ATen/native/cuda/fused_adam_impl.cu +++ b/aten/src/ATen/native/cuda/fused_adam_impl.cu @@ -20,8 +20,8 @@ void _fused_adam_cuda_impl_( const double weight_decay, const double eps, const bool maximize, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { std::vector> tensor_lists{ params.vec(), grads.vec(), exp_avgs.vec(), exp_avg_sqs.vec()}; @@ -66,8 +66,8 @@ void _fused_adam_cuda_impl_( const double weight_decay, const double eps, const bool maximize, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { std::vector> tensor_lists{ params.vec(), grads.vec(), exp_avgs.vec(), exp_avg_sqs.vec()}; diff --git a/aten/src/ATen/native/cuda/fused_adamw_amsgrad_impl.cu b/aten/src/ATen/native/cuda/fused_adamw_amsgrad_impl.cu index 376711c39db6d..8a22b57a47e8b 100644 --- a/aten/src/ATen/native/cuda/fused_adamw_amsgrad_impl.cu +++ b/aten/src/ATen/native/cuda/fused_adamw_amsgrad_impl.cu @@ -22,8 +22,8 @@ void _fused_adamw_amsgrad_cuda_impl_( const double weight_decay, const double eps, const bool maximize, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { std::vector> tensor_lists{ params.vec(), grads.vec(), @@ -73,8 +73,8 @@ void 
_fused_adamw_amsgrad_cuda_impl_( const double weight_decay, const double eps, const bool maximize, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { std::vector> tensor_lists{ params.vec(), grads.vec(), diff --git a/aten/src/ATen/native/cuda/fused_adamw_impl.cu b/aten/src/ATen/native/cuda/fused_adamw_impl.cu index cc4feaa145122..b0f9dc6db6aff 100644 --- a/aten/src/ATen/native/cuda/fused_adamw_impl.cu +++ b/aten/src/ATen/native/cuda/fused_adamw_impl.cu @@ -21,8 +21,8 @@ void _fused_adamw_cuda_impl_( const double weight_decay, const double eps, const bool maximize, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { std::vector> tensor_lists{ params.vec(), grads.vec(), exp_avgs.vec(), exp_avg_sqs.vec()}; @@ -67,8 +67,8 @@ void _fused_adamw_cuda_impl_( const double weight_decay, const double eps, const bool maximize, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { std::vector> tensor_lists{ params.vec(), grads.vec(), exp_avgs.vec(), exp_avg_sqs.vec()}; diff --git a/aten/src/ATen/native/cuda/jit_utils.cpp b/aten/src/ATen/native/cuda/jit_utils.cpp index 6e804efe5f847..0d870cef58708 100644 --- a/aten/src/ATen/native/cuda/jit_utils.cpp +++ b/aten/src/ATen/native/cuda/jit_utils.cpp @@ -1393,7 +1393,7 @@ std::string generate_reduction_code( } // Acquires (possibly creating) the kernel cache directory -c10::optional get_cache_dir() { +std::optional get_cache_dir() { // If the environment variable USE_TORCH_KERNEL_CACHE is set to "0" then no persistent cache is used const char* uptkc = std::getenv("USE_PYTORCH_KERNEL_CACHE"); const bool use_kernel_cache = (uptkc == nullptr) ? 
true : std::strcmp(uptkc, "0"); @@ -1483,7 +1483,7 @@ NvrtcFunction jit_pwise_function( NvrtcFunction compiled_kernel_; std::string name = kernel_name + "_kernel"; - static const c10::optional cache_dir = get_cache_dir(); + static const std::optional cache_dir = get_cache_dir(); std::string file_path; if (cache_dir.has_value()) { diff --git a/aten/src/ATen/native/cuda/layer_norm_kernel.cu b/aten/src/ATen/native/cuda/layer_norm_kernel.cu index 6423dddbb2995..f06b247ef32be 100644 --- a/aten/src/ATen/native/cuda/layer_norm_kernel.cu +++ b/aten/src/ATen/native/cuda/layer_norm_kernel.cu @@ -1334,8 +1334,8 @@ void LayerNormBackwardKernelImpl( std::tuple layer_norm_cuda( const Tensor& input, IntArrayRef normalized_shape, - const c10::optional& weight_opt /* optional */, - const c10::optional& bias_opt /* optional */, + const std::optional& weight_opt /* optional */, + const std::optional& bias_opt /* optional */, double eps) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = @@ -1390,8 +1390,8 @@ std::tuple layer_norm_backward_cuda( IntArrayRef normalized_shape, const Tensor& mean, const Tensor& rstd, - const c10::optional& weight_opt /* optional */, - const c10::optional& bias_opt /* optional */, + const std::optional& weight_opt /* optional */, + const std::optional& bias_opt /* optional */, std::array grad_input_mask) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = diff --git a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp index 5471c57ec30ed..04b12695dd0a7 100644 --- a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp @@ -2210,7 +2210,7 @@ void svd_magma(const Tensor& A, void svd_kernel(const Tensor& A, const bool full_matrices, const bool compute_uv, - const c10::optional& driver, + const std::optional& driver, const Tensor& U, const Tensor& S, const Tensor& Vh, diff --git a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp index 643501f0cbccd..bc06f118ae9a0 100644 --- a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp +++ b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp @@ -648,7 +648,7 @@ std::string _format_non_converging_batches(const std::vector& batches) void svd_cusolver(const Tensor& A, const bool full_matrices, const bool compute_uv, - const c10::optional& driver, + const std::optional& driver, const Tensor& U, const Tensor& S, const Tensor& V, diff --git a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.h b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.h index cca2e04941a54..75732ec315a45 100644 --- a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.h +++ b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.h @@ -61,7 +61,7 @@ void lu_solve_batched_cublas(const Tensor& LU, const Tensor& pivots, const Tenso // entrance of calculations of `svd` using cusolver gesvdj and gesvdjBatched void svd_cusolver(const Tensor& A, const bool full_matrices, const bool compute_uv, - const c10::optional& driver, const Tensor& U, const Tensor& S, const Tensor& V, const Tensor& info); + const std::optional& driver, const Tensor& U, const Tensor& S, const Tensor& V, const Tensor& info); // entrance of calculations of `cholesky` using cusolver potrf and potrfBatched void cholesky_helper_cusolver(const Tensor& input, bool upper, const Tensor& info); diff --git 
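The `get_cache_dir()` change above only touches the return type, but the surrounding pattern is the interesting part: the function returns an empty optional when the `USE_PYTORCH_KERNEL_CACHE` environment variable disables caching or no usable directory exists, and `jit_pwise_function` stores the result in a `static const std::optional` and branches on `has_value()`. Below is a minimal standalone sketch of that shape; the fallback path and file layout are invented for illustration, not PyTorch's.

```cpp
// Minimal sketch of an env-gated optional cache directory. The gate variable
// name mirrors the hunk above; the cache path is made up.
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <optional>
#include <string>

std::optional<std::string> get_cache_dir() {
  const char* gate = std::getenv("USE_PYTORCH_KERNEL_CACHE");
  if (gate != nullptr && std::strcmp(gate, "0") == 0) {
    return std::nullopt;  // caching explicitly disabled
  }
  const char* home = std::getenv("HOME");
  if (home == nullptr) {
    return std::nullopt;  // nowhere sensible to put the cache
  }
  return std::string(home) + "/.cache/fake_kernel_cache";
}

int main() {
  static const std::optional<std::string> cache_dir = get_cache_dir();
  if (cache_dir.has_value()) {
    std::cout << "caching kernels under " << *cache_dir << "\n";
  } else {
    std::cout << "persistent kernel cache disabled\n";
  }
}
```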
a/aten/src/ATen/native/cudnn/BatchNorm.cpp b/aten/src/ATen/native/cudnn/BatchNorm.cpp index 44b004dff0007..460a9b73dd2c5 100644 --- a/aten/src/ATen/native/cudnn/BatchNorm.cpp +++ b/aten/src/ATen/native/cudnn/BatchNorm.cpp @@ -19,9 +19,9 @@ namespace native { std::tuple cudnn_batch_norm( const Tensor& input, const Tensor& weight, - const c10::optional& bias_opt, - const c10::optional& running_mean_opt, - const c10::optional& running_var_opt, + const std::optional& bias_opt, + const std::optional& running_mean_opt, + const std::optional& running_var_opt, bool training, double exponential_average_factor, double epsilon) { @@ -32,10 +32,10 @@ std::tuple cudnn_batch_norm_backward( const Tensor& input, const Tensor& grad_output, const Tensor& weight, - const c10::optional& running_mean_opt, - const c10::optional& running_var_opt, - const c10::optional& save_mean_opt, - const c10::optional& save_var_opt, + const std::optional& running_mean_opt, + const std::optional& running_var_opt, + const std::optional& save_mean_opt, + const std::optional& save_var_opt, double epsilon, const Tensor& reservedSpace) { AT_ERROR("cudnn_batch_norm_backward: ATen not compiled with cuDNN support"); @@ -121,9 +121,9 @@ size_t _get_cudnn_batch_norm_reserve_space_size( std::tuple cudnn_batch_norm( const Tensor& input_t, const Tensor& weight_t, - const c10::optional& bias_t_opt, - const c10::optional& running_mean_t_opt, - const c10::optional& running_var_t_opt, + const std::optional& bias_t_opt, + const std::optional& running_mean_t_opt, + const std::optional& running_var_t_opt, bool training, double exponential_average_factor, double epsilon) { @@ -274,10 +274,10 @@ std::tuple cudnn_batch_norm_backward( const Tensor& weight_t, // Unused: but we require them to be passed so that double backwards // has access - const c10::optional& running_mean_opt, - const c10::optional& running_var_opt, - const c10::optional& save_mean_t_opt, - const c10::optional& save_var_t_opt, + const std::optional& running_mean_opt, + const std::optional& running_var_opt, + const std::optional& save_mean_t_opt, + const std::optional& save_var_t_opt, double epsilon, const Tensor& reserveSpace) { // See [Note: hacky wrapper removal for optional tensor] diff --git a/aten/src/ATen/native/cudnn/ConvPlaceholders.cpp b/aten/src/ATen/native/cudnn/ConvPlaceholders.cpp index 8475a143f466c..349999e4544f9 100644 --- a/aten/src/ATen/native/cudnn/ConvPlaceholders.cpp +++ b/aten/src/ATen/native/cudnn/ConvPlaceholders.cpp @@ -205,7 +205,7 @@ void raw_cudnn_convolution_backward_weight_out( Tensor cudnn_convolution_relu( const Tensor& input_t, const Tensor& weight_t, - const c10::optional& bias_t, + const std::optional& bias_t, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, @@ -217,8 +217,8 @@ Tensor cudnn_convolution_add_relu( const Tensor& input_t, const Tensor& weight_t, const Tensor& z_t, - const c10::optional& alpha, - const c10::optional& bias_t, + const std::optional& alpha, + const std::optional& bias_t, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, diff --git a/aten/src/ATen/native/cudnn/ConvShared.cpp b/aten/src/ATen/native/cudnn/ConvShared.cpp index 104ae8c70803d..09a10581ab142 100644 --- a/aten/src/ATen/native/cudnn/ConvShared.cpp +++ b/aten/src/ATen/native/cudnn/ConvShared.cpp @@ -705,7 +705,7 @@ std::tuple cudnn_convolution_transpose_backward( Tensor cudnn_convolution_relu( const Tensor& input_t, const Tensor& weight_t, - const c10::optional& bias_t, + const std::optional& bias_t, IntArrayRef stride, IntArrayRef 
padding, IntArrayRef dilation, @@ -758,8 +758,8 @@ Tensor cudnn_convolution_add_relu( const Tensor& input_t, const Tensor& weight_t, const Tensor& z_t, - const c10::optional& alpha, - const c10::optional& bias_t, + const std::optional& alpha, + const std::optional& bias_t, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp index 05b1df3114f85..55c666eeca83c 100644 --- a/aten/src/ATen/native/cudnn/RNN.cpp +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -51,9 +51,9 @@ std::tuple _cudnn_rnn( const Tensor& input_r, TensorList weight, int64_t weight_stride0, - const c10::optional& weight_buf_r_opt, + const std::optional& weight_buf_r_opt, const Tensor& hx, - const c10::optional& cx_opt, + const std::optional& cx_opt, int64_t fn_mode, int64_t fn_hidden_size, int64_t fn_proj_size, @@ -63,7 +63,7 @@ std::tuple _cudnn_rnn( bool fn_train, bool fn_bidirectional, IntArrayRef fn_batch_sizes, - const c10::optional& fn_dropout_state_opt) { + const std::optional& fn_dropout_state_opt) { AT_ERROR("_cudnn_rnn: ATen not compiled with cuDNN support"); } @@ -73,11 +73,11 @@ std::tuple> _cudnn_rnn_backward( int64_t weight_stride0, const Tensor& weight_buf, const Tensor& hx, - const c10::optional& cx_opt, + const std::optional& cx_opt, const Tensor& output, - const c10::optional& grad_output_r_opt, - const c10::optional& grad_hy_r_opt, - const c10::optional& grad_cy_r_opt, + const std::optional& grad_output_r_opt, + const std::optional& grad_hy_r_opt, + const std::optional& grad_cy_r_opt, int64_t mode, int64_t hidden_size, int64_t proj_size, @@ -87,7 +87,7 @@ std::tuple> _cudnn_rnn_backward( bool train, bool bidirectional, IntArrayRef batch_sizes, - const c10::optional& dropout_state_opt, + const std::optional& dropout_state_opt, const Tensor& reserve, std::array output_mask) { AT_ERROR("_cudnn_rnn_backward: ATen not compiled with cuDNN support"); @@ -97,10 +97,10 @@ Tensor _cudnn_init_dropout_state( double dropout, bool train, int64_t dropout_seed, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory( pin_memory); @@ -1396,9 +1396,9 @@ std::tuple _cudnn_rnn( const Tensor& input_r, TensorList weight, int64_t weight_stride0, - const c10::optional& weight_buf_r_opt, + const std::optional& weight_buf_r_opt, const Tensor& hx, - const c10::optional& cx_opt, + const std::optional& cx_opt, int64_t fn_mode, int64_t fn_hidden_size, int64_t fn_proj_size, @@ -1408,7 +1408,7 @@ std::tuple _cudnn_rnn( bool fn_train, bool fn_bidirectional, IntArrayRef fn_batch_sizes, - const c10::optional& fn_dropout_state_opt) { + const std::optional& fn_dropout_state_opt) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_buf_r_maybe_owned = at::borrow_from_optional_tensor(weight_buf_r_opt); @@ -2105,11 +2105,11 @@ std::tuple> _cudnn_rnn_backward( int64_t weight_stride0, const Tensor& weight_buf, const Tensor& hx, - const c10::optional& cx_opt, + const std::optional& cx_opt, const Tensor& output, - const c10::optional& grad_output_r_opt, - const c10::optional& grad_hy_r_opt, - const c10::optional& grad_cy_r_opt, + const std::optional& grad_output_r_opt, + const std::optional& grad_hy_r_opt, + const std::optional& grad_cy_r_opt, int64_t mode, 
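Several of the signatures above sit behind the `[Note: hacky wrapper removal for optional tensor]` idiom: the optional argument is immediately turned into a borrowed reference via `at::borrow_from_optional_tensor`, so the body works with a plain tensor reference whether or not a value was passed. The following is a simplified standalone analog of that idiom; it returns a reference to a static default instead of a `c10::MaybeOwned`, and the types are placeholders rather than ATen's.

```cpp
// Simplified analog of borrow_from_optional_tensor: hand back a reference to
// the provided value when present, otherwise to a shared default object.
// std::string stands in for at::Tensor purely for illustration.
#include <iostream>
#include <optional>
#include <string>

const std::string& borrow_or_default(const std::optional<std::string>& opt) {
  static const std::string kUndefined;  // plays the role of an undefined Tensor
  return opt.has_value() ? *opt : kUndefined;
}

int main() {
  std::optional<std::string> cx;  // e.g. an absent cell state for a plain RNN/GRU
  std::cout << "'" << borrow_or_default(cx) << "'\n";  // ''
  cx = "cell-state";
  std::cout << "'" << borrow_or_default(cx) << "'\n";  // 'cell-state'
}
```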
int64_t hidden_size, int64_t proj_size, @@ -2119,7 +2119,7 @@ std::tuple> _cudnn_rnn_backward( bool train, bool bidirectional, IntArrayRef batch_sizes, - const c10::optional& dropout_state_opt, + const std::optional& dropout_state_opt, const Tensor& reserve, std::array output_mask) { // See [Note: hacky wrapper removal for optional tensor] @@ -2214,10 +2214,10 @@ Tensor _cudnn_init_dropout_state( double dropout, bool train, int64_t dropout_seed, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory( @@ -2304,7 +2304,7 @@ struct DropoutState { // needed for the first time. Note that in this case needed != used, as we // don't need a buffer to e.g. run RNNs in test mode. at::Tensor buffer; - c10::optional event; + std::optional event; std::mutex mutex; #if !defined(USE_ROCM) // cudaStreamGetCaptureInfo will never give back a capture id of 0, so 0 can @@ -2531,8 +2531,8 @@ std::pair _cudnn_impl( } // TODO: try_get_weight_buf returns a Tensor, but _cudnn_rnn below takes a - // c10::optional in weight_buf's slot. Do we want try_get_weight_buf - // to return a c10::optional instead of a defined or undefined Tensor? + // std::optional in weight_buf's slot. Do we want try_get_weight_buf + // to return a std::optional instead of a defined or undefined Tensor? at::cuda::OptionalCUDAGuard guard(input.get_device()); auto weight_buf = try_get_weight_buf( input, diff --git a/aten/src/ATen/native/group_norm.cpp b/aten/src/ATen/native/group_norm.cpp index 1babf82b90e05..85767b7502dc3 100644 --- a/aten/src/ATen/native/group_norm.cpp +++ b/aten/src/ATen/native/group_norm.cpp @@ -61,8 +61,8 @@ void check_group_norm_inputs( std::tuple native_group_norm( const Tensor& X, - const c10::optional& gamma_opt /* optional */, - const c10::optional& beta_opt /* optional */, + const std::optional& gamma_opt /* optional */, + const std::optional& beta_opt /* optional */, int64_t N, int64_t C, int64_t HxW, @@ -107,7 +107,7 @@ std::tuple native_group_norm_backward( const Tensor& X, const Tensor& mean, const Tensor& rstd, - const c10::optional& gamma_opt, + const std::optional& gamma_opt, int64_t N, int64_t C, int64_t HxW, @@ -177,8 +177,8 @@ std::tuple native_group_norm_backward( Tensor group_norm( const Tensor& input, int64_t num_groups, - const c10::optional& weight_opt /* optional */, - const c10::optional& bias_opt /* optional */, + const std::optional& weight_opt /* optional */, + const std::optional& bias_opt /* optional */, double eps, bool /* cudnn_enabled, deprecated */) { // See [Note: hacky wrapper removal for optional tensor] @@ -213,8 +213,8 @@ DEFINE_DISPATCH(GroupNormBackwardKernel); // Ported from pytorch/xla repo std::tuple math_group_norm( const Tensor& input, - const c10::optional& weight_opt, - const c10::optional& bias_opt, + const std::optional& weight_opt, + const std::optional& bias_opt, int64_t N, int64_t C, int64_t HxW, diff --git a/aten/src/ATen/native/layer_norm.cpp b/aten/src/ATen/native/layer_norm.cpp index 27a701dd2eb49..9858840f95223 100644 --- a/aten/src/ATen/native/layer_norm.cpp +++ b/aten/src/ATen/native/layer_norm.cpp @@ -74,7 +74,7 @@ void layer_norm_cpu_out( std::tuple layer_norm_cpu( const Tensor& input, - IntArrayRef normalized_shape, const c10::optional& weight_opt /* optional */, const 
c10::optional& bias_opt /* optional */, + IntArrayRef normalized_shape, const std::optional& weight_opt /* optional */, const c10::optional& bias_opt /* optional */, double eps) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); @@ -115,8 +115,8 @@ std::tuple layer_norm_backward_cpu( IntArrayRef normalized_shape, const Tensor& mean, const Tensor& rstd, - const c10::optional& weight_opt /* optional */, - const c10::optional& bias_opt /* optional */, + const std::optional& weight_opt /* optional */, + const std::optional& bias_opt /* optional */, std::array grad_input_mask) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = @@ -186,7 +186,7 @@ std::tuple layer_norm_backward_cpu( Tensor layer_norm_symint( const Tensor& input, - c10::SymIntArrayRef normalized_shape, const c10::optional& weight_opt /* optional */, const c10::optional& bias_opt /* optional */, + c10::SymIntArrayRef normalized_shape, const std::optional& weight_opt /* optional */, const c10::optional& bias_opt /* optional */, double eps, bool /* cudnn_enable, deprecated */) { // See [Note: hacky wrapper removal for optional tensor] @@ -204,7 +204,7 @@ DEFINE_DISPATCH(LayerNormBackwardKernel); // Ported from pytorch/xla repo std::tuple math_native_layer_norm( const Tensor& input, - IntArrayRef normalized_shape, const c10::optional& weight_opt, const c10::optional& bias_opt, + IntArrayRef normalized_shape, const std::optional& weight_opt, const c10::optional& bias_opt, double eps) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); @@ -266,8 +266,8 @@ std::tuple math_native_layer_norm( Tensor rms_norm( const Tensor& input, IntArrayRef normalized_shape, - const c10::optional& weight_opt /* optional */, - c10::optional eps) { + const std::optional& weight_opt /* optional */, + std::optional eps) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); diff --git a/aten/src/ATen/native/layer_norm.h b/aten/src/ATen/native/layer_norm.h index 38e63569586e3..e35ccf8634bcc 100644 --- a/aten/src/ATen/native/layer_norm.h +++ b/aten/src/ATen/native/layer_norm.h @@ -74,8 +74,8 @@ void layer_norm_cpu_out( Tensor rms_norm( const Tensor& input, IntArrayRef normalized_shape, - const c10::optional& weight_opt /* optional */, - c10::optional eps); + const std::optional& weight_opt /* optional */, + std::optional eps); using forward_fn = void (*)( const Tensor& /* X */, diff --git a/aten/src/ATen/native/metal/MetalNeuronType.h b/aten/src/ATen/native/metal/MetalNeuronType.h index b59d163c4ae88..c5cb0b99502c6 100644 --- a/aten/src/ATen/native/metal/MetalNeuronType.h +++ b/aten/src/ATen/native/metal/MetalNeuronType.h @@ -20,8 +20,8 @@ enum class NeuronType { }; static inline NeuronType neuronType( - c10::optional output_min, - c10::optional output_max) { + std::optional output_min, + std::optional output_max) { float inf_max = std::numeric_limits::infinity(); float inf_min = -std::numeric_limits::infinity(); float output_max_ = diff --git a/aten/src/ATen/native/metal/MetalPrepackOpContext.h b/aten/src/ATen/native/metal/MetalPrepackOpContext.h index 02f474ece8da2..4481c879eec29 100644 --- a/aten/src/ATen/native/metal/MetalPrepackOpContext.h +++ b/aten/src/ATen/native/metal/MetalPrepackOpContext.h @@ -9,13 +9,13 @@ namespace metal { using 
SerializationTypeConv2dPrePack = std::tuple< Tensor, - c10::optional, + std::optional, std::vector, std::vector, std::vector, int64_t, - c10::optional, - c10::optional>; + std::optional, + std::optional>; class Conv2dOpContext : public torch::jit::CustomClassHolder { public: @@ -33,13 +33,13 @@ class Conv2dOpContext : public torch::jit::CustomClassHolder { Conv2dOpContext() = delete; Conv2dOpContext( at::Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector stride, std::vector padding, std::vector dilation, int64_t groups, - c10::optional output_min, - c10::optional output_max) + std::optional output_min, + std::optional output_max) : weight_(std::move(weight)), bias_(std::move(bias)), stride_(std::move(stride)), @@ -65,7 +65,7 @@ class Conv2dOpContext : public torch::jit::CustomClassHolder { return weight_; } - const c10::optional& get_bias() const { + const std::optional& get_bias() const { return bias_; } @@ -85,11 +85,11 @@ class Conv2dOpContext : public torch::jit::CustomClassHolder { return groups_; } - const c10::optional& get_output_min() const { + const std::optional& get_output_min() const { return output_min_; } - const c10::optional& get_output_max() const { + const std::optional& get_output_max() const { return output_max_; } @@ -111,22 +111,22 @@ class Conv2dOpContext : public torch::jit::CustomClassHolder { private: Tensor weight_; - c10::optional bias_; + std::optional bias_; std::vector stride_; std::vector padding_; std::vector dilation_; int64_t groups_; - c10::optional output_min_; - c10::optional output_max_; + std::optional output_min_; + std::optional output_max_; std::function releaseCallback_ = nullptr; void* conv2dOp_ = nullptr; // reserved to hold MPSCNNConv2dOp objects }; using SerializationTypeLinearPrePack = std::tuple< Tensor, - c10::optional, - c10::optional, - c10::optional>; + std::optional, + std::optional, + std::optional>; class LinearOpContext : public torch::jit::CustomClassHolder { public: @@ -136,9 +136,9 @@ class LinearOpContext : public torch::jit::CustomClassHolder { LinearOpContext() = delete; LinearOpContext( at::Tensor&& weight, - c10::optional&& bias, - c10::optional output_min, - c10::optional output_max) + std::optional&& bias, + std::optional output_min, + std::optional output_max) : weight_(std::move(weight)), bias_(std::move(bias)), output_min_(std::move(output_min)), @@ -160,15 +160,15 @@ class LinearOpContext : public torch::jit::CustomClassHolder { return weight_; } - const c10::optional& get_bias() const { + const std::optional& get_bias() const { return bias_; } - const c10::optional& get_output_min() const { + const std::optional& get_output_min() const { return output_min_; } - const c10::optional& get_output_max() const { + const std::optional& get_output_max() const { return output_max_; } @@ -190,9 +190,9 @@ class LinearOpContext : public torch::jit::CustomClassHolder { private: Tensor weight_; - c10::optional bias_; - c10::optional output_min_; - c10::optional output_max_; + std::optional bias_; + std::optional output_min_; + std::optional output_max_; void* opaqueOpPtr_ = nullptr; // reserved to hold MPSCNNFullyConnected objects std::function releaseCallback_ = nullptr; }; diff --git a/aten/src/ATen/native/metal/MetalPrepackOpRegister.cpp b/aten/src/ATen/native/metal/MetalPrepackOpRegister.cpp index bbdf713801860..ebf9b9daf6263 100644 --- a/aten/src/ATen/native/metal/MetalPrepackOpRegister.cpp +++ b/aten/src/ATen/native/metal/MetalPrepackOpRegister.cpp @@ -9,13 +9,13 @@ namespace metal { 
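The Metal prepack contexts above (`Conv2dOpContext`, `LinearOpContext`) keep their clamp bounds and bias as optional members that are moved in once and exposed through `const` reference getters; the rename leaves that layout untouched. Here is a standalone sketch of the same shape, with invented names and `float` standing in for the real payload types.

```cpp
// Standalone sketch of an op-context holding optional members, mirroring the
// Conv2dOpContext/LinearOpContext shape above. ClampContext is an invented name.
#include <iostream>
#include <optional>
#include <utility>

class ClampContext {
 public:
  ClampContext(std::optional<float>&& output_min, std::optional<float>&& output_max)
      : output_min_(std::move(output_min)), output_max_(std::move(output_max)) {}

  const std::optional<float>& get_output_min() const { return output_min_; }
  const std::optional<float>& get_output_max() const { return output_max_; }

 private:
  std::optional<float> output_min_;
  std::optional<float> output_max_;
};

int main() {
  ClampContext relu_like{std::optional<float>(0.0f), std::nullopt};  // min only
  std::cout << relu_like.get_output_min().value_or(-1.0f) << " "
            << relu_like.get_output_max().value_or(-1.0f) << "\n";   // 0 -1
}
```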
c10::intrusive_ptr unpack( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& stride, std::vector&& padding, std::vector&& dilation, int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max) { + const std::optional& output_min, + const std::optional& output_max) { auto packedWeight = weight.contiguous(MemoryFormat::ChannelsLast); return c10::make_intrusive( std::move(packedWeight), @@ -30,9 +30,9 @@ c10::intrusive_ptr unpack( c10::intrusive_ptr unpack( Tensor&& weight, - c10::optional&& bias, - const c10::optional& output_min, - const c10::optional& output_max) { + std::optional&& bias, + const std::optional& output_min, + const std::optional& output_max) { TORCH_CHECK(weight.dim() == 2); // Don't need to do `weight.t()` auto packedWeight = weight.view({weight.size(0), weight.size(1), 1, 1}) @@ -96,13 +96,13 @@ TORCH_LIBRARY(metal_prepack, m) { c10::intrusive_ptr conv2d_prepack( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& stride, std::vector&& padding, std::vector&& dilation, int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max) { + const std::optional& output_min, + const std::optional& output_max) { TORCH_CHECK(weight.dim() == 4); return c10::make_intrusive( std::move(weight), @@ -117,9 +117,9 @@ c10::intrusive_ptr conv2d_prepack( c10::intrusive_ptr linear_prepack( Tensor&& weight, - c10::optional&& bias, - const c10::optional& output_min, - const c10::optional& output_max) { + std::optional&& bias, + const std::optional& output_min, + const std::optional& output_max) { return c10::make_intrusive( std::move(weight), std::move(bias), output_min, output_max); } diff --git a/aten/src/ATen/native/metal/ops/MetalConvolution.h b/aten/src/ATen/native/metal/ops/MetalConvolution.h index e5a68e45cd929..77053448cbcb4 100644 --- a/aten/src/ATen/native/metal/ops/MetalConvolution.h +++ b/aten/src/ATen/native/metal/ops/MetalConvolution.h @@ -9,7 +9,7 @@ namespace metal { Tensor conv2d( const Tensor& input, const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, diff --git a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp index 7c641b3fadd89..5a89c01bc0394 100644 --- a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp +++ b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp @@ -22,13 +22,13 @@ namespace at { namespace native { // See Note [ATen preprocessor philosophy] std::tuple miopen_batch_norm( - const Tensor& input, const Tensor& weight, const c10::optional& bias_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, + const Tensor& input, const Tensor& weight, const std::optional& bias_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, bool training, double exponential_average_factor, double epsilon) { AT_ERROR("miopen_batch_norm: ATen not compiled with MIOpen support"); } std::tuple miopen_batch_norm_backward( - const Tensor& input, const Tensor& grad_output, const Tensor& weight, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, const c10::optional& save_mean_opt, const c10::optional& save_var_opt, + const Tensor& input, const Tensor& grad_output, const Tensor& weight, const std::optional& running_mean_opt, const c10::optional& running_var_opt, const c10::optional& save_mean_opt, const c10::optional& save_var_opt, double epsilon) { 
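Beyond the type rename, the MIOpen batch-norm hunk below makes a small behavioural change: `input` is dropped from the blanket `checkAllContiguous` call and is instead checked with `input->is_contiguous(input->suggest_memory_format())`, which accepts a tensor that is dense in its own (for example channels-last) layout rather than requiring default NCHW contiguity. The sketch below, which assumes a libtorch build, shows the difference between the two predicates.

```cpp
// Sketch (requires libtorch to build): a channels-last tensor fails the plain
// is_contiguous() check but passes when queried with its suggested memory
// format, which is what the relaxed TORCH_CHECK in the hunk below accepts.
#include <ATen/ATen.h>
#include <iostream>

int main() {
  at::Tensor nhwc = at::rand({2, 3, 4, 5}).contiguous(at::MemoryFormat::ChannelsLast);
  std::cout << nhwc.is_contiguous() << "\n";                              // 0
  std::cout << nhwc.is_contiguous(nhwc.suggest_memory_format()) << "\n";  // 1
}
```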
AT_ERROR("miopen_batch_norm_backward: ATen not compiled with MIOpen support"); } @@ -58,7 +58,7 @@ Tensor expandScale(const Tensor& t, int64_t dim) { } // namespace std::tuple miopen_batch_norm( - const Tensor& input_t, const Tensor& weight_t, const c10::optional& bias_t_opt, const c10::optional& running_mean_t_opt, const c10::optional& running_var_t_opt, + const Tensor& input_t, const Tensor& weight_t, const std::optional& bias_t_opt, const c10::optional& running_mean_t_opt, const c10::optional& running_var_t_opt, bool training, double exponential_average_factor, double epsilon) { // See [Note: hacky wrapper removal for optional tensor] @@ -83,7 +83,8 @@ std::tuple miopen_batch_norm( checkAllSameType(c, {input, weight}); } checkAllSameType(c, {weight, bias, running_mean, running_var}); - checkAllContiguous(c, {input, weight, bias, running_mean, running_var}); + checkAllContiguous(c, {weight, bias, running_mean, running_var}); + TORCH_CHECK(input->is_contiguous(input->suggest_memory_format())); checkDimRange(c, input, 2, 6 /* exclusive */); auto num_features = input->size(1); for (auto t : {weight, bias, running_mean, running_var}) { diff --git a/aten/src/ATen/native/miopen/Conv_miopen.cpp b/aten/src/ATen/native/miopen/Conv_miopen.cpp index 88f889c2cc1fa..71b4620ecfdf0 100644 --- a/aten/src/ATen/native/miopen/Conv_miopen.cpp +++ b/aten/src/ATen/native/miopen/Conv_miopen.cpp @@ -31,7 +31,7 @@ namespace at { namespace native { // See Note [ATen preprocessor philosophy] at::Tensor miopen_convolution( - const Tensor& input, const Tensor& weight, const c10::optional& bias_opt /* optional */, + const Tensor& input, const Tensor& weight, const std::optional& bias_opt /* optional */, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic) { AT_ERROR("miopen_convolution: ATen not compiled with MIOpen support"); @@ -64,7 +64,7 @@ std::tuple miopen_convolution_backward( } at::Tensor miopen_convolution_transpose( - const Tensor& input, const Tensor& weight, const c10::optional& bias_opt /* optional */, + const Tensor& input, const Tensor& weight, const std::optional& bias_opt /* optional */, IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic) { AT_ERROR("miopen_convolution_transpose: ATen not compiled with MIOpen support"); @@ -92,7 +92,7 @@ std::tuple miopen_convolution_transpose_backwa } at::Tensor miopen_depthwise_convolution( - const Tensor& input, const Tensor& weight, const c10::optional& bias_opt /* optional */, + const Tensor& input, const Tensor& weight, const std::optional& bias_opt /* optional */, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic) { AT_ERROR("miopen_depthwise_convolution: ATen not compiled with MIOpen support"); @@ -122,13 +122,13 @@ std::tuple miopen_depthwise_convolution_backwa at::Tensor miopen_convolution_add_relu( const at::Tensor& input, const at::Tensor& weight, const at::Tensor& z, - const c10::optional& alpha, const c10::optional& bias, IntArrayRef stride, + const std::optional& alpha, const c10::optional& bias, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, int64_t groups) { AT_ERROR("miopen_convolution_add_relu: ATen not compiled with MIOpen support"); } at::Tensor miopen_convolution_relu( - const at::Tensor& input, const at::Tensor& weight, const c10::optional& bias, + const at::Tensor& input, const at::Tensor& weight, const 
std::optional& bias, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, int64_t groups) { AT_ERROR("miopen_convolution_relu: ATen not compiled with MIOpen support"); } @@ -795,7 +795,7 @@ Tensor miopen_convolution_forward( } Tensor miopen_convolution( - const Tensor& input_t, const Tensor& weight_t, const c10::optional& bias_t_opt, + const Tensor& input_t, const Tensor& weight_t, const std::optional& bias_t_opt, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic) { @@ -896,7 +896,7 @@ Tensor miopen_depthwise_convolution_forward( } Tensor miopen_depthwise_convolution( - const Tensor& input_t, const Tensor& weight_t, const c10::optional& bias_t_opt, + const Tensor& input_t, const Tensor& weight_t, const std::optional& bias_t_opt, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic) { @@ -1463,7 +1463,7 @@ std::tuple miopen_depthwise_convolution_backwa } Tensor miopen_convolution_transpose( - const Tensor& input_t, const Tensor& weight_t, const c10::optional& bias_t_opt, + const Tensor& input_t, const Tensor& weight_t, const std::optional& bias_t_opt, IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic) { @@ -1552,8 +1552,8 @@ Tensor miopen_convolution_add_relu( const Tensor& input, const Tensor& weight, const Tensor& z, - const c10::optional& alpha, - const c10::optional& bias, + const std::optional& alpha, + const std::optional& bias, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, @@ -1607,7 +1607,7 @@ Tensor miopen_convolution_add_relu( Tensor miopen_convolution_relu( const Tensor& input, const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, diff --git a/aten/src/ATen/native/miopen/RNN_miopen.cpp b/aten/src/ATen/native/miopen/RNN_miopen.cpp index 7b2b2ab80e553..2cba1aa3aef14 100644 --- a/aten/src/ATen/native/miopen/RNN_miopen.cpp +++ b/aten/src/ATen/native/miopen/RNN_miopen.cpp @@ -29,18 +29,18 @@ namespace at { namespace native { std::tuple miopen_rnn( const Tensor& input_r, TensorList weight, int64_t weight_stride0, - const Tensor& hx, const c10::optional& cx_opt, + const Tensor& hx, const std::optional& cx_opt, int64_t fn_mode, int64_t fn_hidden_size, int64_t fn_num_layers, bool batch_first, double fn_dropout, bool fn_train, bool fn_bidirectional, - IntArrayRef fn_batch_sizes, const c10::optional& fn_dropout_state_opt + IntArrayRef fn_batch_sizes, const std::optional& fn_dropout_state_opt ) { AT_ERROR("miopen_rnn : ATen not compiled with MIOpen support."); } std::tuple> miopen_rnn_backward( - const Tensor& input, TensorList weight, int64_t weight_stride0, const Tensor& weight_buf, const Tensor& hx, const c10::optional& cx_opt, - const Tensor& output, const c10::optional& grad_output_r_opt, const c10::optional& grad_hy_r_opt, const c10::optional& grad_cy_r_opt, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, - double dropout, bool train, bool bidirectional, IntArrayRef batch_sizes, const c10::optional& dropout_state_opt, + const Tensor& input, TensorList weight, int64_t weight_stride0, const Tensor& weight_buf, const Tensor& hx, const std::optional& cx_opt, + const Tensor& output, const std::optional& grad_output_r_opt, const c10::optional& grad_hy_r_opt, const c10::optional& grad_cy_r_opt, int64_t mode, int64_t hidden_size, int64_t 
num_layers, bool batch_first, + double dropout, bool train, bool bidirectional, IntArrayRef batch_sizes, const std::optional& dropout_state_opt, const Tensor& reserve, std::array output_mask ) { AT_ERROR("miopen_rnn_backward: ATen not compiled with MIOpen support."); @@ -444,10 +444,10 @@ std::vector _output_size(const RNNDescriptorParams& rnn, const TensorDe std::tuple miopen_rnn( const Tensor& input_r, TensorList weight, int64_t weight_stride0, - const Tensor& hx, const c10::optional& cx_opt, + const Tensor& hx, const std::optional& cx_opt, int64_t fn_mode, int64_t fn_hidden_size, int64_t fn_num_layers, bool batch_first, double fn_dropout, bool fn_train, bool fn_bidirectional, - IntArrayRef fn_batch_sizes, const c10::optional& fn_dropout_state_opt + IntArrayRef fn_batch_sizes, const std::optional& fn_dropout_state_opt ) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned cx_maybe_owned = at::borrow_from_optional_tensor(cx_opt); @@ -758,9 +758,9 @@ std::vector miopen_rnn_backward_weight( } std::tuple> miopen_rnn_backward( - const Tensor& input, TensorList weight, int64_t weight_stride0, const Tensor& weight_buf, const Tensor& hx, const c10::optional& cx_opt, - const Tensor& output, const c10::optional& grad_output_r_opt, const c10::optional& grad_hy_r_opt, const c10::optional& grad_cy_r_opt, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, - double dropout, bool train, bool bidirectional, IntArrayRef batch_sizes, const c10::optional& dropout_state_opt, + const Tensor& input, TensorList weight, int64_t weight_stride0, const Tensor& weight_buf, const Tensor& hx, const std::optional& cx_opt, + const Tensor& output, const std::optional& grad_output_r_opt, const c10::optional& grad_hy_r_opt, const c10::optional& grad_cy_r_opt, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, + double dropout, bool train, bool bidirectional, IntArrayRef batch_sizes, const std::optional& dropout_state_opt, const Tensor& reserve, std::array output_mask ) { // See [Note: hacky wrapper removal for optional tensor] diff --git a/aten/src/ATen/native/mkldnn/Common.h b/aten/src/ATen/native/mkldnn/Common.h index 4e048ebce7597..baf823a9bcec7 100644 --- a/aten/src/ATen/native/mkldnn/Common.h +++ b/aten/src/ATen/native/mkldnn/Common.h @@ -13,7 +13,7 @@ namespace mkldnn { struct ContextConv final { ideep::tensor weight_packed_; - c10::optional at_bias_; + std::optional at_bias_; std::vector padding_; std::vector stride_; std::vector dilation_; @@ -24,7 +24,7 @@ struct ContextConv final { ContextConv( ideep::tensor&& weight_packed, - c10::optional at_bias, + std::optional at_bias, std::vector padding, std::vector stride, std::vector dilation, diff --git a/aten/src/ATen/native/mkldnn/Conv.cpp b/aten/src/ATen/native/mkldnn/Conv.cpp index 3e41e2f1071d0..09dca06e2b5ae 100644 --- a/aten/src/ATen/native/mkldnn/Conv.cpp +++ b/aten/src/ATen/native/mkldnn/Conv.cpp @@ -22,7 +22,7 @@ namespace at { namespace native { Tensor mkldnn_convolution( - const Tensor& input, const Tensor& weight, const c10::optional& bias_opt, + const Tensor& input, const Tensor& weight, const std::optional& bias_opt, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups) { TORCH_CHECK(false, "mkldnn_convolution_forward: ATen not compiled with MKLDNN support"); } @@ -48,7 +48,7 @@ static std::tuple mkldnn_convolution_backward( REGISTER_NO_CPU_DISPATCH(mkldnn_convolution_backward_stub); static Tensor mkldnn_convolution_transpose( - const Tensor& input, const Tensor& 
weight, const c10::optional& bias_opt, + const Tensor& input, const Tensor& weight, const std::optional& bias_opt, IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups) { TORCH_CHECK(false, "mkldnn_convolution_transpose: ATen not compiled with MKLDNN support"); } @@ -259,16 +259,16 @@ static void _mkldnn_convolution_out ( static Tensor _mkldnn_convolution( const Tensor& input_t, const Tensor& weight_t, - const c10::optional& bias_opt, + const std::optional& bias_opt, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool use_channels_last, c10::string_view attr = "none", - torch::List> scalars = - torch::List>(), - c10::optional algorithm = c10::nullopt) { + torch::List> scalars = + torch::List>(), + std::optional algorithm = c10::nullopt) { ideep::attr_t op_attr = ideep::attr_t(); if (attr != "none") { auto it = fusion_unary_attr_map().find(attr); @@ -324,7 +324,7 @@ static Tensor _mkldnn_convolution( Tensor mkldnn_convolution( const Tensor& input_t, const Tensor& weight_t, - const c10::optional& bias_opt, + const std::optional& bias_opt, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, @@ -345,14 +345,14 @@ namespace{ Tensor mkldnn_convolution_pointwise( const Tensor& input_t, const Tensor& weight_t, - const c10::optional& bias_opt, + const std::optional& bias_opt, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, c10::string_view attr, - torch::List> scalars, - c10::optional algorithm) { + torch::List> scalars, + std::optional algorithm) { c10::impl::ExcludeDispatchKeyGuard edkg(c10::autograd_dispatch_keyset); bool use_channels_last = weight_t.is_mkldnn() || mkldnn_conv_use_channels_last(input_t, weight_t); @@ -382,16 +382,16 @@ Tensor mkldnn_convolution_pointwise_binary( const Tensor& input_t, const Tensor& other_t, const Tensor& weight_t, - const c10::optional& bias_opt, + const std::optional& bias_opt, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, c10::string_view binary_attr, - c10::optional alpha, - c10::optional unary_attr, - torch::List> unary_scalars, - c10::optional unary_algorithm) { + std::optional alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm) { TORCH_CHECK( input_t.ndimension() == 4 || input_t.ndimension() == 5, "mkldnn_convolution_pointwise_binary: currently only support 2d and 3d") @@ -546,16 +546,16 @@ Tensor& mkldnn_convolution_pointwise_binary_( Tensor& other_t, const Tensor& input_t, const Tensor& weight_t, - const c10::optional& bias_opt, + const std::optional& bias_opt, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, c10::string_view binary_attr, - c10::optional alpha, - c10::optional unary_attr, - torch::List> unary_scalars, - c10::optional unary_algorithm) { + std::optional alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm) { // other_t += convolution(...), other_t = unary(other_t) TORCH_CHECK( input_t.ndimension() == 4 || input_t.ndimension() == 5, @@ -664,7 +664,7 @@ std::vector _original_deconv_weight_size( Tensor _mkldnn_convolution_transpose( const Tensor& input_t, const Tensor& weight_t, - const c10::optional& bias_opt, + const std::optional& bias_opt, IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, @@ -672,9 +672,9 @@ Tensor _mkldnn_convolution_transpose( int64_t groups, bool use_channels_last, c10::string_view attr = "none", - torch::List> scalars = - 
torch::List>(), - c10::optional algorithm = c10::nullopt) { + torch::List> scalars = + torch::List>(), + std::optional algorithm = c10::nullopt) { ideep::attr_t op_attr = ideep::attr_t(); if (attr != "none") { auto it = fusion_unary_attr_map().find(attr); @@ -760,15 +760,15 @@ Tensor _mkldnn_convolution_transpose( Tensor mkldnn_convolution_transpose_pointwise( const Tensor& input_t, const Tensor& weight_t, - const c10::optional& bias_opt, + const std::optional& bias_opt, IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, c10::string_view attr, - torch::List> scalars, - c10::optional algorithm) { + torch::List> scalars, + std::optional algorithm) { c10::impl::ExcludeDispatchKeyGuard edkg(c10::autograd_dispatch_keyset); bool use_channels_last = weight_t.is_mkldnn() || mkldnn_conv_use_channels_last(input_t, weight_t); @@ -791,15 +791,15 @@ Tensor mkldnn_convolution_transpose_pointwise( Tensor mkldnn_convolution_transpose_pointwise_meta( const Tensor& input_t, const Tensor& weight_t, - const c10::optional& bias_opt, + const std::optional& bias_opt, IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, c10::string_view attr, - torch::List> scalars, - c10::optional algorithm) { + torch::List> scalars, + std::optional algorithm) { std::vector weight_IOHW_sizes = _original_deconv_weight_size(weight_t, groups); int64_t dim = input_t.ndimension() - 2; @@ -941,7 +941,7 @@ namespace{ Tensor mkldnn_convolution_transpose( const Tensor& input, const Tensor& weight, - const c10::optional& bias_opt, + const std::optional& bias_opt, IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, diff --git a/aten/src/ATen/native/mkldnn/ConvPrepack.cpp b/aten/src/ATen/native/mkldnn/ConvPrepack.cpp index 4fb126f25cf09..cab4f1efa55eb 100644 --- a/aten/src/ATen/native/mkldnn/ConvPrepack.cpp +++ b/aten/src/ATen/native/mkldnn/ConvPrepack.cpp @@ -19,7 +19,7 @@ namespace convolution { c10::intrusive_ptr createConvPrePackOpContext( Tensor weight, - c10::optional bias, + std::optional bias, std::vector stride, std::vector padding, std::vector dilation, @@ -43,7 +43,7 @@ c10::intrusive_ptr createConvPrePackOpContext( ContextConv create( const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, @@ -98,7 +98,7 @@ static void _mkldnn_convolution_out( const ideep::tensor& x, ideep::tensor& y, const ideep::tensor& w, - const c10::optional& b, + const std::optional& b, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, @@ -147,7 +147,7 @@ static void mkldnn_convolution_out( const Tensor& input, ideep::tensor& mkldnn_output, const ideep::tensor& mkldnn_weight, - const c10::optional& bias_opt, + const std::optional& bias_opt, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, @@ -160,7 +160,7 @@ static void mkldnn_convolution_out( c10::impl::ExcludeDispatchKeyGuard edkg(c10::autograd_dispatch_keyset); const ideep::tensor mkldnn_input = itensor_from_tensor(input); - c10::optional mkldnn_bias{c10::nullopt}; + std::optional mkldnn_bias{c10::nullopt}; if (bias.defined()) { mkldnn_bias = itensor_from_tensor(bias); } diff --git a/aten/src/ATen/native/mkldnn/ConvPrepack.h b/aten/src/ATen/native/mkldnn/ConvPrepack.h index 03189c5f5e706..db858b9bb46d9 100644 --- a/aten/src/ATen/native/mkldnn/ConvPrepack.h +++ b/aten/src/ATen/native/mkldnn/ConvPrepack.h @@ -14,7 +14,7 @@ namespace convolution { 
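The ConvPrepack hunk above also shows the common shape of the optional-bias plumbing in these MKLDNN paths: the `ideep` bias slot starts out as an empty optional and is only filled when `bias.defined()` is true, so the downstream call branches on presence rather than on an undefined tensor. Below is a standalone sketch of that flow; the container type and helper are invented stand-ins, not ideep's API.

```cpp
// Standalone sketch of the conditional optional-bias pattern above. FakeItensor
// and run_conv are invented stand-ins for ideep::tensor and the real primitive.
#include <iostream>
#include <optional>
#include <vector>

using FakeItensor = std::vector<float>;

void run_conv(const FakeItensor& weight, const std::optional<FakeItensor>& bias) {
  std::cout << "conv: weight[" << weight.size() << "], "
            << (bias.has_value() ? "with bias" : "no bias") << "\n";
}

int main() {
  const bool bias_defined = true;  // stands in for bias.defined()

  std::optional<FakeItensor> mkldnn_bias{std::nullopt};
  if (bias_defined) {
    mkldnn_bias = FakeItensor{0.1f, 0.2f, 0.3f};
  }
  run_conv(FakeItensor(8, 1.0f), mkldnn_bias);
}
```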
c10::intrusive_ptr createConvPrePackOpContext( Tensor weight, - c10::optional bias, + std::optional bias, std::vector stride, std::vector padding, std::vector dilation, @@ -28,7 +28,7 @@ Tensor conv_run( ContextConv create( const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, diff --git a/aten/src/ATen/native/mkldnn/Linear.cpp b/aten/src/ATen/native/mkldnn/Linear.cpp index 71d033fca3b86..70434fde7e479 100644 --- a/aten/src/ATen/native/mkldnn/Linear.cpp +++ b/aten/src/ATen/native/mkldnn/Linear.cpp @@ -26,7 +26,7 @@ namespace native { Tensor mkldnn_linear( const Tensor& self, - const Tensor& weight, const c10::optional& bias_opt) { + const Tensor& weight, const std::optional& bias_opt) { TORCH_CHECK(false, "mkldnn_linear: ATen not compiled with MKLDNN support"); } Tensor mkldnn_linear_backward_input( @@ -58,7 +58,7 @@ namespace native { Tensor mkldnn_linear( const Tensor& self, - const Tensor& weight_t, const c10::optional& bias_opt) { + const Tensor& weight_t, const std::optional& bias_opt) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); const Tensor& bias = *bias_maybe_owned; @@ -183,10 +183,10 @@ std::tuple mkldnn_linear_backward( static Tensor mkldnn_linear_pointwise( const Tensor& input_t, const Tensor& weight_t, - const c10::optional& bias_opt, + const std::optional& bias_opt, c10::string_view attr, - torch::List> scalars, - c10::optional algorithm) { + torch::List> scalars, + std::optional algorithm) { auto input = input_t.contiguous(); auto input_size = input.sizes(); @@ -218,7 +218,7 @@ static Tensor mkldnn_linear_pointwise( const ideep::tensor mkldnn_input = itensor_view_from_dense(input_reshaped); - c10::optional mkldnn_bias{c10::nullopt}; + std::optional mkldnn_bias{c10::nullopt}; if (bias.defined()) { mkldnn_bias = itensor_from_tensor(bias); } @@ -258,7 +258,7 @@ static Tensor mkldnn_linear_pointwise_binary( const Tensor& input_t, const Tensor& other_t, const Tensor& weight_t, - const c10::optional& bias_opt, + const std::optional& bias_opt, c10::string_view attr) { c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); @@ -303,7 +303,7 @@ static Tensor mkldnn_linear_pointwise_binary( const ideep::tensor mkldnn_other = itensor_from_tensor(other_reshaped); const ideep::tensor mkldnn_input = itensor_view_from_dense(input_reshaped); - c10::optional mkldnn_bias{c10::nullopt}; + std::optional mkldnn_bias{c10::nullopt}; if (bias.defined()) { mkldnn_bias = itensor_from_tensor(bias); } @@ -339,7 +339,7 @@ static Tensor mkl_linear( const Tensor& self, const Tensor& mkl_weight_t, const Tensor& origin_weight_t, - const c10::optional& bias_opt, + const std::optional& bias_opt, const int64_t prepack_batch_size) { c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); @@ -427,7 +427,7 @@ static Tensor mkl_linear( const Tensor& self, const Tensor& mkl_weight_t, const Tensor& origin_weight_t, - const c10::optional& bias_opt, + const std::optional& bias_opt, const int64_t prepack_batch_size) { TORCH_CHECK(false, "mkl_linear: ATen not compiled with MKL support"); } diff --git a/aten/src/ATen/native/mkldnn/MKLDNNCommon.cpp b/aten/src/ATen/native/mkldnn/MKLDNNCommon.cpp index 061d154f3b40f..e6fdbb0656c07 100644 --- a/aten/src/ATen/native/mkldnn/MKLDNNCommon.cpp +++ b/aten/src/ATen/native/mkldnn/MKLDNNCommon.cpp @@ -61,7 +61,7 @@ ideep::tensor::data_type 
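// --- Editor's illustrative sketch (not part of the patch) ---------------------
// The hunks above and below follow one mechanical pattern: optional-tensor
// parameters change from c10::optional<Tensor> to std::optional<Tensor>, while
// the bodies keep unwrapping the optional bias through
// at::borrow_from_optional_tensor, as mkldnn_linear does above. Roughly (the
// function name linear_like is hypothetical, used only to show the shape of
// the pattern):

#include <ATen/ATen.h>
#include <optional>

at::Tensor linear_like(
    const at::Tensor& input,
    const at::Tensor& weight,
    const std::optional<at::Tensor>& bias_opt) {
  // See [Note: hacky wrapper removal for optional tensor]: borrow the bias if
  // present, otherwise obtain an undefined Tensor without copying anything.
  c10::MaybeOwned<at::Tensor> bias_maybe_owned =
      at::borrow_from_optional_tensor(bias_opt);
  const at::Tensor& bias = *bias_maybe_owned;
  return bias.defined() ? at::addmm(bias, input, weight.t())
                        : at::mm(input, weight.t());
}
// ------------------------------------------------------------------------------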
get_mkldnn_dtype(ScalarType type) { } } -Tensor new_with_itensor_mkldnn(ideep::tensor&& it, c10::optional dtype, c10::optional device) { +Tensor new_with_itensor_mkldnn(ideep::tensor&& it, std::optional dtype, c10::optional device) { // NOTE: int32_t dims from ideep::tensor but sizes needs int64_t // TODO: support int64_t dims in ideep::tensor to avoid extra conversion auto dims = it.get_dims(); diff --git a/aten/src/ATen/native/mkldnn/MKLDNNCommon.h b/aten/src/ATen/native/mkldnn/MKLDNNCommon.h index 5e9044ce908aa..f41c4ae075be5 100644 --- a/aten/src/ATen/native/mkldnn/MKLDNNCommon.h +++ b/aten/src/ATen/native/mkldnn/MKLDNNCommon.h @@ -29,7 +29,7 @@ static inline ideep::tensor::data_type get_mkldnn_dtype(const Tensor& t) { } // Construct aten MKL-DNN tensor given an ideep tensor -TORCH_API Tensor new_with_itensor_mkldnn(ideep::tensor&& it, c10::optional dtype, c10::optional device); +TORCH_API Tensor new_with_itensor_mkldnn(ideep::tensor&& it, std::optional dtype, c10::optional device); // Retrieve `ideep::tensor` from MKL-DNN tensor TORCH_API ideep::tensor& itensor_from_mkldnn(const Tensor& mkldnn_tensor); diff --git a/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp b/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp index b2901bc522be2..f01cb8da1241f 100644 --- a/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp +++ b/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp @@ -24,7 +24,7 @@ namespace at { namespace native { #if AT_MKLDNN_ENABLED() -Tensor mkldnn_to_dense(const Tensor& mkldnn_tensor, c10::optional dtype, c10::optional masked_grad) { +Tensor mkldnn_to_dense(const Tensor& mkldnn_tensor, std::optional dtype, c10::optional masked_grad) { TORCH_CHECK(mkldnn_tensor.scalar_type() == ScalarType::Float || mkldnn_tensor.scalar_type() == ScalarType::BFloat16 || mkldnn_tensor.scalar_type() == ScalarType::Half || @@ -73,7 +73,7 @@ Tensor mkldnn_to_dense(const Tensor& mkldnn_tensor, c10::optional dt return cpu_tensor.contiguous().resize_(dims, c10::MemoryFormat::Contiguous); } -Tensor dense_to_mkldnn(const Tensor& cpu_tensor, c10::optional dtype) { +Tensor dense_to_mkldnn(const Tensor& cpu_tensor, std::optional dtype) { TORCH_CHECK(cpu_tensor.device().is_cpu(), "dense_to_mkldnn expects CPU tensor input"); TORCH_CHECK(cpu_tensor.layout() == Layout::Strided, @@ -256,7 +256,7 @@ static Tensor mkldnn_reorder_conv_weight( static Tensor mkldnn_reorder_linear_weight( const Tensor& self, - c10::optional batch_size_opt) { + std::optional batch_size_opt) { mkldnn_check_low_precision(self.scalar_type(), "mkldnn_reorder_linear_weight"); auto out_features = self.size(0); auto in_features = self.size(1); @@ -525,11 +525,11 @@ TORCH_LIBRARY_IMPL(mkldnn, CPU, m) { #else -Tensor mkldnn_to_dense(const Tensor& mkldnn_tensor, c10::optional dtype, c10::optional masked_grad) { +Tensor mkldnn_to_dense(const Tensor& mkldnn_tensor, std::optional dtype, c10::optional masked_grad) { TORCH_CHECK(false, "MKL-DNN build is disabled"); } -Tensor dense_to_mkldnn(const Tensor& cpu_tensor, c10::optional dtype) { +Tensor dense_to_mkldnn(const Tensor& cpu_tensor, std::optional dtype) { TORCH_CHECK(false, "MKL-DNN build is disabled"); } diff --git a/aten/src/ATen/native/mkldnn/Normalization.cpp b/aten/src/ATen/native/mkldnn/Normalization.cpp index 0aced614a0ea3..e684a931f7752 100644 --- a/aten/src/ATen/native/mkldnn/Normalization.cpp +++ b/aten/src/ATen/native/mkldnn/Normalization.cpp @@ -21,7 +21,7 @@ namespace at { namespace native { std::tuple mkldnn_batch_norm( - const Tensor& self, const c10::optional& weight_opt, const 
c10::optional& bias_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, + const Tensor& self, const std::optional& weight_opt, const c10::optional& bias_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, bool train, double momentum, double eps) { @@ -30,7 +30,7 @@ std::tuple mkldnn_batch_norm( std::tuple mkldnn_batch_norm_backward( const Tensor& grad_output, - const Tensor& input, const c10::optional& weight_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, const c10::optional& save_mean_opt, const c10::optional& save_invstd_opt, + const Tensor& input, const std::optional& weight_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, const c10::optional& save_mean_opt, const c10::optional& save_invstd_opt, bool train, double eps, std::array grad_input_mask) { @@ -45,7 +45,7 @@ static std::tuple mkldnn_layer_norm_last_index_weight_bi } std::tuple _mkldnn_batch_norm_legit( - const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, + const Tensor& input, const std::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, bool train, double momentum, double eps) { @@ -54,7 +54,7 @@ std::tuple _mkldnn_batch_norm_legit( std::tuple _mkldnn_batch_norm_legit_no_stats( - const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, + const Tensor& input, const std::optional& weight_opt, const c10::optional& bias_opt, bool train, double momentum, double eps) { @@ -62,15 +62,15 @@ std::tuple _mkldnn_batch_norm_legit_no_stats( } std::tuple _batch_norm_with_update_mkldnn( - const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, + const Tensor& input, const std::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, double momentum, double eps) { TORCH_CHECK(false, "_batch_norm_with_update_mkldnn: ATen not compiled with MKLDNN support"); } std::tuple _new_batch_norm_backward_mkldnn( const Tensor& grad_output, const Tensor& input, const Tensor& weight, - const c10::optional& running_mean_opt, const c10::optional& running_var_opt, - const c10::optional& save_mean_opt, const c10::optional& save_var_opt, + const std::optional& running_mean_opt, const c10::optional& running_var_opt, + const std::optional& save_mean_opt, const c10::optional& save_var_opt, bool update, double eps, std::array grad_input_mask, const Tensor& reserve) { TORCH_CHECK(false, "_new_batch_norm_backward_mkldnn: ATen not compiled with MKLDNN support"); } @@ -131,7 +131,7 @@ std::tuple mkldnn_layer_norm_last_index_weight_bias_f32( std::tuple mkldnn_batch_norm( - const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, + const Tensor& input, const std::optional& weight_opt, const c10::optional& bias_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, bool train, double momentum, double eps) { @@ -209,7 +209,7 @@ std::tuple mkldnn_batch_norm( std::tuple _batch_norm_with_update_mkldnn( - const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, + const Tensor& input, const std::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, double momentum, double eps) { Tensor output, save_mean, save_var; std::tie(output, save_mean, 
save_var) = @@ -220,7 +220,7 @@ std::tuple _batch_norm_with_update_mkldnn( std::tuple _mkldnn_batch_norm_legit( - const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, + const Tensor& input, const std::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, bool train, double momentum, double eps) { @@ -229,7 +229,7 @@ std::tuple _mkldnn_batch_norm_legit( std::tuple _mkldnn_batch_norm_legit_no_stats( - const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, + const Tensor& input, const std::optional& weight_opt, const c10::optional& bias_opt, bool train, double momentum, double eps) { @@ -239,15 +239,15 @@ std::tuple _mkldnn_batch_norm_legit_no_stats( std::tuple _new_batch_norm_backward_mkldnn( const Tensor& grad_output, const Tensor& input, const Tensor& weight, - const c10::optional& running_mean_opt, const c10::optional& running_var_opt, - const c10::optional& save_mean_opt, const c10::optional& save_var_opt, + const std::optional& running_mean_opt, const c10::optional& running_var_opt, + const std::optional& save_mean_opt, const c10::optional& save_var_opt, bool update, double eps, std::array grad_input_mask, const Tensor& reserve) { return mkldnn_batch_norm_backward(grad_output, input, weight, running_mean_opt, running_var_opt, save_mean_opt, save_var_opt, update, eps, grad_input_mask); } std::tuple mkldnn_batch_norm_backward(const Tensor& grad_output, - const Tensor& input, const c10::optional& weight_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, const c10::optional& save_mean_opt, const c10::optional& save_invstd_opt, + const Tensor& input, const std::optional& weight_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, const c10::optional& save_mean_opt, const c10::optional& save_invstd_opt, bool train, double eps, std::array grad_input_mask) { diff --git a/aten/src/ATen/native/mkldnn/OpContext.cpp b/aten/src/ATen/native/mkldnn/OpContext.cpp index 3de67ceacf002..820f1273b0cb5 100644 --- a/aten/src/ATen/native/mkldnn/OpContext.cpp +++ b/aten/src/ATen/native/mkldnn/OpContext.cpp @@ -9,7 +9,7 @@ namespace mkldnn { c10::intrusive_ptr MkldnnConvOpContext::create_context( at::Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& padding, std::vector&& stride, std::vector&& dilation, diff --git a/aten/src/ATen/native/mkldnn/OpContext.h b/aten/src/ATen/native/mkldnn/OpContext.h index 21e8cc78a5134..5ae5344ccf509 100644 --- a/aten/src/ATen/native/mkldnn/OpContext.h +++ b/aten/src/ATen/native/mkldnn/OpContext.h @@ -17,7 +17,7 @@ const static std::map fusion_attr_map = { using SerializationTypeConvPrePack = std::tuple< Tensor, - c10::optional, + std::optional, std::vector, std::vector, std::vector, @@ -28,7 +28,7 @@ using SerializationTypeConvPrePack = std::tuple< class ConvOpContext : public torch::jit::CustomClassHolder { protected: Tensor orig_weight_; - c10::optional orig_bias_; + std::optional orig_bias_; std::vector stride_; std::vector padding_; std::vector dilation_; @@ -60,7 +60,7 @@ class MkldnnConvOpContext final : public ConvOpContext { public: MkldnnConvOpContext( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& padding, std::vector&& stride, std::vector&& dilation, @@ -83,7 +83,7 @@ class MkldnnConvOpContext final : public ConvOpContext { static c10::intrusive_ptr create_context( Tensor&& weight, - c10::optional&& bias, 
+ std::optional&& bias, std::vector&& padding, std::vector&& stride, std::vector&& dilation, diff --git a/aten/src/ATen/native/mkldnn/Pooling.cpp b/aten/src/ATen/native/mkldnn/Pooling.cpp index 7b59d7b85fe93..e1a5cfe5dff32 100644 --- a/aten/src/ATen/native/mkldnn/Pooling.cpp +++ b/aten/src/ATen/native/mkldnn/Pooling.cpp @@ -56,7 +56,7 @@ Tensor mkldnn_avg_pool2d( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { TORCH_CHECK(false, "mkldnn_avg_pool2d: ATen not compiled with MKLDNN support"); } @@ -66,7 +66,7 @@ Tensor& mkldnn_avg_pool2d_out(const Tensor& self, IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override, + std::optional divisor_override, Tensor& output) { TORCH_CHECK(false, "mkldnn_avg_pool2d_out: ATen not compiled with MKLDNN support"); } @@ -78,7 +78,7 @@ Tensor mkldnn_avg_pool3d( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { TORCH_CHECK(false, "mkldnn_avg_pool3d: ATen not compiled with MKLDNN support"); } @@ -88,7 +88,7 @@ Tensor& mkldnn_avg_pool3d_out(const Tensor& self, IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override, + std::optional divisor_override, Tensor& output) { TORCH_CHECK(false, "mkldnn_avg_pool3d_out: ATen not compiled with MKLDNN support"); } @@ -140,7 +140,7 @@ Tensor& mkldnn_avg_pool2d_backward_out(const Tensor & grad_output, IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override, + std::optional divisor_override, Tensor & grad_input) { TORCH_CHECK(false, "mkldnn_avg_pool2d_backward_out: ATen not compiled with MKLDNN support"); } @@ -153,7 +153,7 @@ Tensor mkldnn_avg_pool2d_backward( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { TORCH_CHECK(false, "mkldnn_avg_pool2d_backward: ATen not compiled with MKLDNN support"); } @@ -164,7 +164,7 @@ Tensor& mkldnn_avg_pool3d_backward_out(const Tensor & grad_output, IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override, + std::optional divisor_override, Tensor & grad_input) { TORCH_CHECK(false, "mkldnn_avg_pool3d_backward_out: ATen not compiled with MKLDNN support"); } @@ -177,7 +177,7 @@ Tensor mkldnn_avg_pool3d_backward( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { TORCH_CHECK(false, "mkldnn_avg_pool3d_backward: ATen not compiled with MKLDNN support"); } @@ -418,7 +418,7 @@ Tensor mkldnn_avg_pool2d( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { TORCH_CHECK(!divisor_override.has_value(), "mkldnn_avg_pool2d operator does not support divisor"); if (input.scalar_type() == ScalarType::BFloat16) { @@ -443,7 +443,7 @@ Tensor& mkldnn_avg_pool2d_out(const Tensor& input, IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override, + std::optional divisor_override, Tensor& output) { TORCH_CHECK(false, "mkldnn_avg_pool2d_out: in-place mkldnn operations are not supported yet"); } @@ -455,7 +455,7 @@ Tensor mkldnn_avg_pool3d( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { TORCH_CHECK(!divisor_override.has_value(), "mkldnn_avg_pool3d 
operator does not support divisor"); if (input.scalar_type() == ScalarType::BFloat16) { TORCH_CHECK(mkldnn_bf16_device_check(), @@ -479,7 +479,7 @@ Tensor& mkldnn_avg_pool3d_out(const Tensor& input, IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override, + std::optional divisor_override, Tensor& output) { TORCH_CHECK(false, "mkldnn_avg_pool3d_out: in-place mkldnn operations are not supported yet"); } @@ -579,7 +579,7 @@ Tensor mkldnn_avg_pool2d_backward( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { return _mkldnn_pooling_backward( grad_output, grad_output, @@ -600,7 +600,7 @@ Tensor& mkldnn_avg_pool2d_backward_out(const Tensor & grad_output, IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override, + std::optional divisor_override, Tensor & grad_input) { TORCH_CHECK(false, "mkldnn_avg_pool2d_backward_out: in-place mkldnn operations are not supported yet"); } @@ -613,7 +613,7 @@ Tensor mkldnn_avg_pool3d_backward( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { return _mkldnn_pooling_backward( grad_output, grad_output, @@ -634,7 +634,7 @@ Tensor& mkldnn_avg_pool3d_backward_out(const Tensor & grad_output, IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override, + std::optional divisor_override, Tensor & grad_input) { TORCH_CHECK(false, "mkldnn_avg_pool3d_backward_out: in-place mkldnn operations are not supported yet"); } diff --git a/aten/src/ATen/native/mkldnn/RNN.cpp b/aten/src/ATen/native/mkldnn/RNN.cpp index afea7f91e79ea..b35504bc19cce 100644 --- a/aten/src/ATen/native/mkldnn/RNN.cpp +++ b/aten/src/ATen/native/mkldnn/RNN.cpp @@ -55,9 +55,9 @@ std::tuple mkldnn_rnn_la const Tensor& output, const Tensor& hy_, const Tensor& cy_, - const c10::optional& grad_output_r_opt, - const c10::optional& grad_hy_r_opt, - const c10::optional& grad_cy_r_opt, + const std::optional& grad_output_r_opt, + const std::optional& grad_hy_r_opt, + const std::optional& grad_cy_r_opt, bool reverse, int64_t mode, int64_t hidden_size, @@ -306,9 +306,9 @@ std::tuple mkldnn_rnn_la const Tensor& output, const Tensor& hy_, const Tensor& cy_, - const c10::optional& grad_output_r_opt, - const c10::optional& grad_hy_r_opt, - const c10::optional& grad_cy_r_opt, + const std::optional& grad_output_r_opt, + const std::optional& grad_hy_r_opt, + const std::optional& grad_cy_r_opt, bool reverse, int64_t mode, int64_t hidden_size, diff --git a/aten/src/ATen/native/mkldnn/TensorFactories.cpp b/aten/src/ATen/native/mkldnn/TensorFactories.cpp index 65a22aa74ed53..81dc5d8880cfa 100644 --- a/aten/src/ATen/native/mkldnn/TensorFactories.cpp +++ b/aten/src/ATen/native/mkldnn/TensorFactories.cpp @@ -12,7 +12,7 @@ namespace at { namespace native { #if AT_MKLDNN_ENABLED() -Tensor empty_mkldnn(IntArrayRef sizes, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory, c10::optional optional_memory_format) { +Tensor empty_mkldnn(IntArrayRef sizes, std::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory, c10::optional optional_memory_format) { TORCH_CHECK( !optional_memory_format.has_value(), "'memory_format' argument is incompatible with mkldnn tensor"); @@ -26,7 +26,7 @@ Tensor empty_mkldnn(IntArrayRef sizes, c10::optional dtype, c10::opt #else -Tensor empty_mkldnn(IntArrayRef sizes, 
c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory, c10::optional optional_memory_format) { +Tensor empty_mkldnn(IntArrayRef sizes, std::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory, c10::optional optional_memory_format) { TORCH_CHECK(false, "empty_mkldnn: MKL-DNN build is disabled"); } diff --git a/aten/src/ATen/native/mkldnn/TensorShape.cpp b/aten/src/ATen/native/mkldnn/TensorShape.cpp index ac47648294242..d653d2588ba22 100644 --- a/aten/src/ATen/native/mkldnn/TensorShape.cpp +++ b/aten/src/ATen/native/mkldnn/TensorShape.cpp @@ -26,7 +26,7 @@ Tensor mkldnn_reshape(const Tensor& self, IntArrayRef size) { TORCH_CHECK(false, "mkldnn_reshape: ATen not compiled with MKLDNN support"); } -Tensor mkldnn_clone(const Tensor& self, c10::optional optional_memory_format) { +Tensor mkldnn_clone(const Tensor& self, std::optional optional_memory_format) { TORCH_CHECK(false, "mkldnn_clone: ATen not compiled with MKLDNN support"); } @@ -65,7 +65,7 @@ Tensor mkldnn_reshape(const Tensor& self, IntArrayRef size) { self.options().device_opt()); } -Tensor mkldnn_clone(const Tensor& self, c10::optional optional_memory_format) { +Tensor mkldnn_clone(const Tensor& self, std::optional optional_memory_format) { TORCH_CHECK( !optional_memory_format.has_value(), "unsupported memory format option ", diff --git a/aten/src/ATen/native/mkldnn/Utils.cpp b/aten/src/ATen/native/mkldnn/Utils.cpp index 400eb9165f347..6578b23ff9c92 100644 --- a/aten/src/ATen/native/mkldnn/Utils.cpp +++ b/aten/src/ATen/native/mkldnn/Utils.cpp @@ -79,14 +79,14 @@ void check_mkldnn_binary_fusion_inputs( #if AT_MKLDNN_ENABLED() #define ATTR_FUNC(NAME) \ - [](torch::List> scalars, \ - c10::optional algorithm) { \ + [](torch::List> scalars, \ + std::optional algorithm) { \ return ideep::attr_t::fuse_##NAME(); \ } AttrFunction attr_func_leaky_relu = - [](torch::List> scalars, - c10::optional algorithm) { + [](torch::List> scalars, + std::optional algorithm) { TORCH_CHECK( scalars.size() == 1 && scalars[0].get().toOptional().has_value(), @@ -97,8 +97,8 @@ AttrFunction attr_func_leaky_relu = }; AttrFunction attr_func_hardtanh = - [](torch::List> scalars, - c10::optional algorithm) { + [](torch::List> scalars, + std::optional algorithm) { TORCH_CHECK( scalars.size() == 2 && scalars[0].get().toOptional().has_value() && @@ -112,8 +112,8 @@ AttrFunction attr_func_hardtanh = return ideep::attr_t::fuse_clamp(lower_bound_value, upper_bound_value); }; -AttrFunction attr_func_gelu = [](torch::List> scalars, - c10::optional algorithm) { +AttrFunction attr_func_gelu = [](torch::List> scalars, + std::optional algorithm) { TORCH_CHECK( algorithm.has_value(), "gelu is expected to have one str input: algorithm"); @@ -131,8 +131,8 @@ AttrFunction attr_func_gelu = [](torch::List> scalars, }; AttrFunction attr_func_hardsigmoid = - [](torch::List> scalars, - c10::optional algorithm) { + [](torch::List> scalars, + std::optional algorithm) { ideep::attr_t attr; ideep::post_ops po; po.append_eltwise( diff --git a/aten/src/ATen/native/mkldnn/Utils.h b/aten/src/ATen/native/mkldnn/Utils.h index aa804d6bc1877..75f1b2c1b709a 100644 --- a/aten/src/ATen/native/mkldnn/Utils.h +++ b/aten/src/ATen/native/mkldnn/Utils.h @@ -73,8 +73,8 @@ static inline Tensor may_convert_to_default_contiguous_strides(const Tensor& inp #if AT_MKLDNN_ENABLED() using AttrFunction = std::function>, - c10::optional)>; + torch::List>, + std::optional)>; const std::map& fusion_unary_attr_map(); diff --git 
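// --- Editor's illustrative sketch (not part of the patch) ---------------------
// After this change the fusion-attribute helpers in mkldnn/Utils.h and
// Utils.cpp have roughly the following shape. This is a reduced sketch; the
// exact includes are an assumption, and the "relu" lambda simply mirrors what
// the ATTR_FUNC(NAME) macro above expands to:

#include <functional>
#include <optional>
#include <ATen/ATen.h>
#include <ATen/core/List.h>
#include <ideep.hpp>

using AttrFunction = std::function<ideep::attr_t(
    torch::List<std::optional<at::Scalar>>,
    std::optional<c10::string_view>)>;

// A unary post-op that needs neither scalars nor an algorithm string.
AttrFunction attr_func_relu_sketch =
    [](torch::List<std::optional<at::Scalar>> /*scalars*/,
       std::optional<c10::string_view> /*algorithm*/) {
      return ideep::attr_t::fuse_relu();
    };
// ------------------------------------------------------------------------------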
a/aten/src/ATen/native/mkldnn/xpu/Conv.cpp b/aten/src/ATen/native/mkldnn/xpu/Conv.cpp index 8ac19605b1c79..7f84704d30907 100644 --- a/aten/src/ATen/native/mkldnn/xpu/Conv.cpp +++ b/aten/src/ATen/native/mkldnn/xpu/Conv.cpp @@ -563,7 +563,7 @@ Tensor _convolution( Tensor convolution_overrideable( const Tensor& input_r, const Tensor& weight_r, - const c10::optional& bias_r_opt, + const std::optional& bias_r_opt, IntArrayRef stride_, IntArrayRef padding_, IntArrayRef dilation_, diff --git a/aten/src/ATen/native/mps/TensorFactory.cpp b/aten/src/ATen/native/mps/TensorFactory.cpp index 6fe145a6cc556..03ff521db1046 100644 --- a/aten/src/ATen/native/mps/TensorFactory.cpp +++ b/aten/src/ATen/native/mps/TensorFactory.cpp @@ -50,7 +50,7 @@ static inline void maybe_resize_storage_mps(TensorImpl* self, uint64_t new_size) inline TensorImpl* resize_impl_mps_( TensorImpl* self, IntArrayRef size, - c10::optional stride, + std::optional stride, bool device_guard = true) { if (self->sizes() == size && (!stride || self->strides() == stride)) { return self; @@ -72,11 +72,11 @@ inline TensorImpl* resize_impl_mps_( Tensor empty_mps( IntArrayRef size, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt, - c10::optional memory_format_opt) { + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt, + std::optional memory_format_opt) { return at::detail::empty_mps(size, dtype_opt, layout_opt, device_opt, pin_memory_opt, memory_format_opt); } @@ -84,10 +84,10 @@ Tensor empty_mps( Tensor empty_strided_mps( IntArrayRef size, IntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt) { + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt) { check_size_nonnegative(size); // empty memory formatempty auto t = at::native::empty_mps( @@ -103,7 +103,7 @@ Tensor empty_strided_mps( const Tensor& resize_mps_( const Tensor& self, IntArrayRef size, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { if (self.has_names()) { return resize_named_tensor_(self, size, optional_memory_format); } @@ -142,17 +142,17 @@ Tensor& set_storage_mps_(Tensor& result, Storage storage, int64_t storage_offset checkSetStorage(result, storage, storage_offset, size, stride); //std::cout << "set storage_mps " << storage_offset << " stride " << stride << std::endl; result.unsafeGetTensorImpl()->set_storage_offset(storage_offset); - c10::optional stride_opt = stride.data() != nullptr ? - c10::optional(stride) : c10::nullopt; + std::optional stride_opt = stride.data() != nullptr ? 
+ std::optional(stride) : c10::nullopt; at::native::resize_impl_mps_(result.unsafeGetTensorImpl(), size, stride_opt); return result; } Tensor _efficientzerotensor_mps(IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { auto device_ = device_or_default(device); auto allocator = at::native::ZeroTensorAllocator(device_); auto dtype_ = dtype_or_default(dtype); diff --git a/aten/src/ATen/native/mps/operations/Quantized.mm b/aten/src/ATen/native/mps/operations/Quantized.mm index 3c77ec67b42df..4d0f569ea062b 100644 --- a/aten/src/ATen/native/mps/operations/Quantized.mm +++ b/aten/src/ATen/native/mps/operations/Quantized.mm @@ -12,6 +12,8 @@ #include #include +// #define _CAPTURE_KERNEL 1 + namespace at::native { using namespace mps; @@ -82,6 +84,85 @@ kernel void int4pack_mm( INSTANTIATE_INT4MM(bfloat, 128); INSTANTIATE_INT4MM(bfloat, 256); #endif + +template +struct Vec4Type {}; + +template<> +struct Vec4Type { + using type = float4; +}; + +template<> +struct Vec4Type { + using type = half4; +}; + +#if __METAL_VERSION__ >= 310 +template<> +struct Vec4Type { + using type = bfloat4; +}; +#endif + +template +kernel void +int8pack_mm(constant T *A [[buffer(0)]], constant char *B [[buffer(1)]], + constant T *scales [[buffer(2)]], + device T *outputData [[buffer(3)]], + constant int3 &sizes [[buffer(4)]], + uint2 group_index [[threadgroup_position_in_grid]], + uint2 threadgroup_index [[thread_position_in_threadgroup]]) { + using vecT = typename Vec4Type::type; + const uint lda = sizes.y; + const uint ldc = sizes.z; + int out_idx = (group_index.x * blockSize + threadgroup_index.x) * 4; + int n = out_idx % sizes.z; + int m = out_idx / sizes.z; + // Offset pointers + A += m * lda; + B += n * lda; + outputData += m *ldc; + + float4 rc = 0; + for (unsigned k = threadgroup_index.y * 4; k < sizes.y; k += 4 * blockSize) { + threadgroup_barrier(mem_flags::mem_none); + auto a_val = float4(*reinterpret_cast(A + k)); + float4x4 b_val; + for (int i = 0; i < 4; ++i) { + b_val[i] = float4(*reinterpret_cast(B + i * lda + k)); + } + rc += transpose(b_val) * a_val; + } + + // Accumulate results acorss SIMD group? 
(8 threads using vec4) + threadgroup float4 tgp_memory[blockSize][blockSize]; + tgp_memory[threadgroup_index.x][threadgroup_index.y] = rc; + threadgroup_barrier(mem_flags::mem_threadgroup); + if (threadgroup_index.y == 0) { + for (int i = 1; i < blockSize; i++) { + rc += tgp_memory[threadgroup_index.x][i]; + } + *reinterpret_cast(outputData + n) = + vecT(rc * float4(*reinterpret_cast(scales + n))); + } +} + +#define INSTANTIATE_INT8MM(DTYPE) \ + template [[host_name("int8pack_mm_" #DTYPE)]] kernel void \ + int8pack_mm( \ + constant DTYPE * A [[buffer(0)]], constant char *B [[buffer(1)]], \ + constant DTYPE *scales [[buffer(2)]], \ + device DTYPE *outputData [[buffer(3)]], \ + constant int3 &sizes [[buffer(4)]], \ + uint2 group_index [[threadgroup_position_in_grid]], \ + uint2 threadgroup_index [[thread_position_in_threadgroup]]); + +INSTANTIATE_INT8MM(half); +INSTANTIATE_INT8MM(float); +#if __METAL_VERSION__ >= 310 +INSTANTIATE_INT8MM(bfloat); +#endif )METAL_QUANTIZED"); Tensor _weight_int4pack_mm_mps(const Tensor& A, const Tensor& B, int64_t qGroupSize, const Tensor& qScaleAndZeros) { @@ -114,8 +195,7 @@ Tensor _weight_int4pack_mm_mps(const Tensor& A, const Tensor& B, int64_t qGroupS auto C = at::empty({M, N}, A.options()); MPSStream* mpsStream = getCurrentMPSStream(); - std::array sizes = {static_cast(M), static_cast(K), static_cast(N)}; - static bool firstCapture = false; + std::array sizes = {static_cast(M), static_cast(K), static_cast(N), 0}; dispatch_sync_with_rethrow(mpsStream->queue(), ^() { @autoreleasepool { #if _CAPTURE_KERNEL @@ -163,7 +243,35 @@ Tensor _weight_int8pack_mm_mps(const Tensor& A, const Tensor& B, const Tensor& s TORCH_CHECK(scales.dim() == 1 && scales.size(0) == N, __func__, " : expect scales to be 1d tensor with size ", N); auto C = at::empty({M, N}, A.options()); - + TORCH_CHECK(N % 32 == 0 && K % 32 == 0); +#if 1 + MPSStream* mpsStream = getCurrentMPSStream(); + std::array sizes = {static_cast(M), static_cast(K), static_cast(N), 0}; + dispatch_sync_with_rethrow(mpsStream->queue(), ^() { + @autoreleasepool { +#if _CAPTURE_KERNEL + if (getMPSProfiler().isCaptureEnabled()) { + getMPSProfiler().startCapture(fmt::format("int8pack_mm_{}x{}x{}", M, N, K), mpsStream); + } +#endif + id computeEncoder = mpsStream->commandEncoder(); + const std::string kernel = fmt::format("int8pack_mm_{}", scalarToMetalTypeString(A)); + id quantizedPSO = lib.getPipelineStateForFunc(kernel); + [computeEncoder setComputePipelineState:quantizedPSO]; + mtl_setBuffer(computeEncoder, A, 0); + mtl_setBuffer(computeEncoder, B, 1); + mtl_setBuffer(computeEncoder, scales, 2); + mtl_setBuffer(computeEncoder, C, 3); + [computeEncoder setBytes:sizes.data() length:sizeof(uint32_t) * sizes.size() atIndex:4]; + [computeEncoder dispatchThreads:MTLSizeMake(M * N / 4, 8, 1) threadsPerThreadgroup:MTLSizeMake(8, 8, 1)]; +#if _CAPTURE_KERNEL + if (getMPSProfiler().isCapturing()) { + getMPSProfiler().stopCapture(mpsStream); + } +#endif + } + }); +#else struct CachedGraph : public MPSCachedGraph { CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {} MPSGraphTensor *ATensor = nil, *BTensor = nil, *scalesTensor = nil; @@ -193,6 +301,7 @@ Tensor _weight_int8pack_mm_mps(const Tensor& A, const Tensor& B, const Tensor& s dictionaryFromPlaceholders(APlaceholder, BPlaceholder, scalesPlaceholder), outputPlaceholder); } +#endif return C; } diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 119c0b8572301..1ea973f93261b 100644 --- 
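// --- Editor's illustrative sketch (not part of the patch) ---------------------
// A plain-C++ reference for what the new Metal int8pack_mm kernel and the
// _weight_int8pack_mm_mps host path above compute: a weight-only-quantized
// matmul C[m][n] = (sum_k A[m][k] * B[n][k]) * scales[n], with A in
// float/half/bfloat, B an int8 weight matrix stored row-major as [N, K], and
// one scale per output channel. The Metal kernel additionally vectorizes over
// four output columns and reduces partial sums across an 8x8 threadgroup.

#include <cstdint>

void int8pack_mm_reference(
    const float* A,        // [M, K] activations
    const int8_t* B,       // [N, K] quantized weights
    const float* scales,   // [N] per-output-channel scales
    float* C,              // [M, N] output
    int64_t M, int64_t K, int64_t N) {
  for (int64_t m = 0; m < M; ++m) {
    for (int64_t n = 0; n < N; ++n) {
      float acc = 0.0f;
      for (int64_t k = 0; k < K; ++k) {
        acc += A[m * K + k] * static_cast<float>(B[n * K + k]);
      }
      C[m * N + n] = acc * scales[n];
    }
  }
}
// ------------------------------------------------------------------------------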
a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -15527,6 +15527,7 @@ CPU: foobar autogen: _foobar.out +# Fused Optimizer CUDA kernels. - func: _fused_adam_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> () # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now). variants: function @@ -15581,12 +15582,6 @@ CUDA: _fused_sgd_kernel_cuda_ autogen: _fused_sgd.tensor_lr, _fused_sgd.tensor_lr_out -- func: _fused_adagrad_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] state_sums, Tensor(d!)[] state_steps, *, float lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> () - variants: function - dispatch: - CPU: _fused_adagrad_kernel_cpu_ - autogen: _fused_adagrad, _fused_adagrad.out - # This op is ONLY used by pytorch/XLA in functionalization, and should never show up in vanilla eager mode or in any pytorch tracing contexts. - func: _propagate_xla_data(Tensor input, Tensor output) -> () variants: function diff --git a/aten/src/ATen/native/nested/NestedTensorBackward.cpp b/aten/src/ATen/native/nested/NestedTensorBackward.cpp index e4465b792c21e..488dab9e37cb2 100644 --- a/aten/src/ATen/native/nested/NestedTensorBackward.cpp +++ b/aten/src/ATen/native/nested/NestedTensorBackward.cpp @@ -197,8 +197,8 @@ std::tuple layer_norm_backward_nested( IntArrayRef normalized_shape, const Tensor& mean, const Tensor& rstd, - const c10::optional& weight_opt /* optional */, - const c10::optional& bias_opt /*{ optional */, + const std::optional& weight_opt /* optional */, + const std::optional& bias_opt /*{ optional */, std::array grad_input_mask) { // For NestedTensors weight and bias are non nested. 
auto* nt_impl_grad = get_nested_tensor_impl(grad); diff --git a/aten/src/ATen/native/nested/NestedTensorFactories.cpp b/aten/src/ATen/native/nested/NestedTensorFactories.cpp index 45425ed63315c..40e5082832021 100644 --- a/aten/src/ATen/native/nested/NestedTensorFactories.cpp +++ b/aten/src/ATen/native/nested/NestedTensorFactories.cpp @@ -8,11 +8,11 @@ namespace native { static TensorOptions verify_empty_parameters( const at::Tensor& self, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional optional_memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional optional_memory_format) { TensorOptions options_ = TensorOptions() .dtype(dtype) .layout(layout) @@ -37,11 +37,11 @@ static TensorOptions verify_empty_parameters( Tensor empty_like_nested( const Tensor& self, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional optional_memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional optional_memory_format) { auto options = verify_empty_parameters( self, dtype, layout, device, pin_memory, optional_memory_format); auto self_nt = get_nested_tensor_impl(self); @@ -83,12 +83,12 @@ static inline Device ensure_has_index(Device device) { Tensor _to_copy_nested( const Tensor& self, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, bool non_blocking, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { TORCH_CHECK( !layout.has_value() || self.layout() == layout.value(), "to(options) doesn't support converting to a different layout, " @@ -132,7 +132,7 @@ Tensor& copy_nested_(Tensor& self, const Tensor& src, bool non_blocking) { Tensor clone_nested( const Tensor& self, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { auto memory_format = optional_memory_format.value_or(c10::MemoryFormat::Preserve); auto self_ptr = get_nested_tensor_impl(self); if (memory_format == c10::MemoryFormat::Preserve || diff --git a/aten/src/ATen/native/nested/NestedTensorMath.cpp b/aten/src/ATen/native/nested/NestedTensorMath.cpp index 7d3e826ef53e9..1974b4fe2cea0 100644 --- a/aten/src/ATen/native/nested/NestedTensorMath.cpp +++ b/aten/src/ATen/native/nested/NestedTensorMath.cpp @@ -113,10 +113,10 @@ bool NestedTensor_nested_tensor_from_mask_left_aligned(const Tensor& t, const Te Tensor _nested_tensor_from_tensor_list( TensorList list, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { for (const auto i : c10::irange(list.size())) { if (i > 0) { int64_t dim_i = list[i].dim(); @@ -146,8 +146,8 @@ Tensor _nested_tensor_from_tensor_list( std::tuple nested_layer_norm( const Tensor& input, IntArrayRef normalized_shape, - const c10::optional& weight_opt, - const c10::optional& bias_opt, + const std::optional& weight_opt, + const std::optional& bias_opt, double eps) { TORCH_CHECK(weight_opt && bias_opt, "NestedTensor layer_norm requires weight and bias"); const auto& weight = *weight_opt; @@ -356,7 +356,7 @@ Tensor NestedTensor_sum_dim_CPU( const Tensor& self, OptionalIntArrayRef opt_dims, bool keepdim, - 
c10::optional dtype) { + std::optional dtype) { // Only allow reductions across the last dim auto dims = opt_dims.value_or(IntArrayRef{}); TORCH_CHECK( @@ -479,7 +479,7 @@ Tensor select_nested(const Tensor& self, int64_t dim, int64_t index) { } -std::tuple native_dropout_nested(const Tensor& input, double p, c10::optional train) { +std::tuple native_dropout_nested(const Tensor& input, double p, std::optional train) { auto input_ptr = get_nested_tensor_impl(input); const Tensor& input_buffer = input_ptr-> get_unsafe_storage_as_tensor(), & sizemat = input_ptr->get_nested_sizes(), @@ -587,7 +587,7 @@ Tensor squeeze_dim_nested(const Tensor& self, IntArrayRef dims) { // if tensor.size(dim) != 1 torch.squeeze will return the result, we do the same here for (const auto d : c10::irange(ndim)) { if (mask.test(d)) { - c10::optional size_dim = self_ptr->opt_size(d); + std::optional size_dim = self_ptr->opt_size(d); if (!(size_dim.has_value() && *size_dim == 1)) { mask.reset(d); } @@ -925,7 +925,7 @@ Tensor reshape_as_nested(const Tensor& self, const Tensor& other) { // if an accessor is provided in the future, can replace this std::vector sizes; for (int64_t i = 0; i < other_ptr->dim(); i++) { - c10::optional opt_size = other_ptr->opt_size(i); + std::optional opt_size = other_ptr->opt_size(i); if (opt_size.has_value()) { sizes.push_back(*opt_size); } @@ -937,7 +937,7 @@ Tensor reshape_as_nested(const Tensor& self, const Tensor& other) { return self.reshape(sizes); } -Tensor& normal_nested_(Tensor& self, double mean, double std, c10::optional gen) { +Tensor& normal_nested_(Tensor& self, double mean, double std, std::optional gen) { const auto& self_buf = get_nested_tensor_impl(self)->get_buffer(); self_buf.normal_(mean, std, gen); return self; diff --git a/aten/src/ATen/native/nested/NestedTensorMatmul.cpp b/aten/src/ATen/native/nested/NestedTensorMatmul.cpp index 88e2a94570185..aa683ff854ef6 100644 --- a/aten/src/ATen/native/nested/NestedTensorMatmul.cpp +++ b/aten/src/ATen/native/nested/NestedTensorMatmul.cpp @@ -320,7 +320,7 @@ Tensor& matmul_out_nested( // if an accessor is provided in the future, can replace this std::vector sizes; for (int64_t i = 0; i < function_result_ptr->dim(); i++) { - c10::optional opt_size = function_result_ptr->opt_size(i); + std::optional opt_size = function_result_ptr->opt_size(i); if (opt_size.has_value()) { sizes.push_back(*opt_size); } else { diff --git a/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.cpp b/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.cpp index 96d13c366f7ac..6285f2ca1223e 100644 --- a/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.cpp +++ b/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.cpp @@ -59,7 +59,7 @@ inline void check_nested_tensor_matrix_constraints( Tensor nested_linear( const Tensor& input, const Tensor& weight, - const c10::optional& bias_opt) { + const std::optional& bias_opt) { check_nested_tensor_matrix_constraints(input, weight, c10::string_view{"Linear"}); auto* nt_input = get_nested_tensor_impl(input); const Tensor& input_buffer = nt_input->get_buffer(); @@ -93,7 +93,7 @@ Tensor NestedTensor_times_Tensor_plus_Tensor_addmm( const Tensor& mat2, const c10::Scalar& beta, const c10::Scalar& alpha, - c10::optional use_gelu) { + std::optional use_gelu) { // Interesting case: alpha * NT * T + beta * T const auto* nt_mat1 = get_nested_tensor_impl_or_null(mat1); TORCH_INTERNAL_ASSERT(nt_mat1 != nullptr); @@ -184,7 +184,7 @@ Tensor NestedTensor_softmax_dropout(const Tensor& self, 
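// --- Editor's illustrative sketch (not part of the patch) ---------------------
// Several of the NestedTensor hunks above rely on NestedTensorImpl::opt_size(d),
// which after this change returns std::optional<int64_t>: a value when
// dimension d is regular across all components, std::nullopt when it is
// ragged. Shape-reconstruction code such as reshape_as_nested therefore maps
// nullopt to -1 (the wildcard accepted by reshape). A reduced sketch of that
// loop, under those assumptions:

#include <cstdint>
#include <optional>
#include <vector>

std::vector<int64_t> sizes_with_wildcards(
    const std::vector<std::optional<int64_t>>& opt_sizes) {
  std::vector<int64_t> sizes;
  sizes.reserve(opt_sizes.size());
  for (const std::optional<int64_t>& s : opt_sizes) {
    sizes.push_back(s.has_value() ? *s : -1);  // -1 marks a ragged dimension
  }
  return sizes;
}
// ------------------------------------------------------------------------------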
const Tensor& query) { } Tensor NestedTensor_softmax_dropout_cuda(const Tensor& self, const Tensor& query) { - c10::optional attn_mask; + std::optional attn_mask; attn_mask = NestedTensor_to_mask(query, 2, self.size(2)); attn_mask = attn_mask->to(query.device(), /*non-blocking=*/true); @@ -211,7 +211,7 @@ Tensor NestedTensor_batch_offsets_from_size_tensor( } -Tensor NestedTensor_to_mask(const Tensor& nt, c10::optional mask_dim, c10::optional mask_dim_length) { +Tensor NestedTensor_to_mask(const Tensor& nt, std::optional mask_dim, c10::optional mask_dim_length) { auto* nt_impl = get_nested_tensor_impl(nt); TORCH_CHECK(nested_tensor_impl_is_contiguous(nt_impl), "to_mask only works on contiguous NestedTensors."); TORCH_CHECK( diff --git a/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.h b/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.h index cee721d7bc8f6..b0df6975304d2 100644 --- a/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.h +++ b/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.h @@ -36,7 +36,7 @@ Tensor NestedTensor_times_Tensor_plus_Tensor_addmm( const Tensor& mat2, const c10::Scalar& beta, const c10::Scalar& alpha, - c10::optional use_gelu = c10::nullopt); + std::optional use_gelu = c10::nullopt); Tensor NestedTensor_add_NestedTensor_in_place( const Tensor& self, @@ -50,7 +50,7 @@ Tensor NestedTensor_from_padded_tensor_cpu( const Tensor& padded, const NestedTensorImpl& nt); -Tensor NestedTensor_to_mask(const Tensor& nt, c10::optional mask_dim, c10::optional mask_dim_length); +Tensor NestedTensor_to_mask(const Tensor& nt, std::optional mask_dim, c10::optional mask_dim_length); template void remove_padding_kernelLauncher( diff --git a/aten/src/ATen/native/nested/NestedTensorUnaryOps.cpp b/aten/src/ATen/native/nested/NestedTensorUnaryOps.cpp index c41b6f15214aa..dc31b2c0de240 100644 --- a/aten/src/ATen/native/nested/NestedTensorUnaryOps.cpp +++ b/aten/src/ATen/native/nested/NestedTensorUnaryOps.cpp @@ -132,7 +132,7 @@ Tensor cos_nested(const Tensor& self) { return map_nt(self, at::cos); } -Tensor _pin_memory_nested(const Tensor& self, c10::optional device) { +Tensor _pin_memory_nested(const Tensor& self, std::optional device) { auto* nt_input = get_nested_tensor_impl(self); const auto& input_buffer = nt_input->get_unsafe_storage_as_tensor(); return wrap_buffer( diff --git a/aten/src/ATen/native/nested/NestedTensorUtils.cpp b/aten/src/ATen/native/nested/NestedTensorUtils.cpp index a5394404543f8..6539475cd1fdd 100644 --- a/aten/src/ATen/native/nested/NestedTensorUtils.cpp +++ b/aten/src/ATen/native/nested/NestedTensorUtils.cpp @@ -59,7 +59,7 @@ std::vector NestedTensor_get_max_size(const NestedTensorImpl& nt) { } int64_t get_consistent_last_dim_of_nested_tensor(const NestedTensorImpl& nt) { - c10::optional last_dim = nt.opt_size(-1); + std::optional last_dim = nt.opt_size(-1); TORCH_CHECK( last_dim != c10::nullopt, "Expected all tensors in nested tensor to have the same trailing dimension, instead last dimension equals: ", diff --git a/aten/src/ATen/native/nested/NestedTensorUtils.h b/aten/src/ATen/native/nested/NestedTensorUtils.h index 3b4f18f11b64b..572b0a827dd06 100644 --- a/aten/src/ATen/native/nested/NestedTensorUtils.h +++ b/aten/src/ATen/native/nested/NestedTensorUtils.h @@ -340,10 +340,10 @@ inline TensorNode get_nested_tensor_structure(at::Tensor tensor) { inline Tensor wrap_tensor_node( TensorNode tensor_node, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + 
std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { TORCH_CHECK( !tensor_node.is_leaf(), "Expected TensorNode to wrap a list of Tensors."); TensorOptions options_ = diff --git a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp index 0da0c3e361d1f..977ace14fb34d 100644 --- a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp +++ b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp @@ -234,7 +234,7 @@ _scaled_dot_product_flash_attention_nestedtensor_cuda( double dropout_p, bool is_causal, bool return_debug_mask, - c10::optional scale) { + std::optional scale) { Tensor query_buffer_reshaped, key_buffer_reshaped, value_buffer_reshaped, cumulative_sequence_length_q, cumulative_sequence_length_kv, output_shape; int64_t max_seqlen_batch_q{0}, max_seqlen_batch_kv{0}; @@ -285,11 +285,11 @@ _scaled_dot_product_efficient_attention_nestedtensor_cuda( const Tensor& query, const Tensor& key, const Tensor& value, - const c10::optional& attn_bias, + const std::optional& attn_bias, bool compute_log_sumexp, double dropout_p, bool is_causal, - c10::optional scale) { + std::optional scale) { Tensor query_buffer_reshaped, key_buffer_reshaped, value_buffer_reshaped, cumulative_sequence_length_q, cumulative_sequence_length_kv, output_shape; int64_t max_seqlen_batch_q{0}; @@ -344,7 +344,7 @@ std::tuple _scaled_dot_product_flash_attenti bool is_causal, const at::Tensor& philox_seed, const at::Tensor& philox_offset, - c10::optional scale){ + std::optional scale){ if (!grad_out_.defined()) { return std::make_tuple(Tensor{}, Tensor{}, Tensor{}); } diff --git a/aten/src/ATen/native/quantized/PackedParams.h b/aten/src/ATen/native/quantized/PackedParams.h index a442628573fec..d73bc0adbc4ef 100644 --- a/aten/src/ATen/native/quantized/PackedParams.h +++ b/aten/src/ATen/native/quantized/PackedParams.h @@ -111,11 +111,11 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder { return output; } - virtual std::tuple> unpack() = 0; + virtual std::tuple> unpack() = 0; - virtual c10::optional bias() = 0; + virtual std::optional bias() = 0; - virtual void set_bias(c10::optional /*bias*/) { + virtual void set_bias(std::optional /*bias*/) { throw std::runtime_error( "set_bias is not implemented for this packed " "parameter type"); @@ -136,7 +136,7 @@ struct ConvPackedParamsBase : public torch::jit::CustomClassHolder { const at::Tensor& input, bool reduce_range) = 0; - virtual std::tuple> unpack() = 0; + virtual std::tuple> unpack() = 0; virtual torch::List stride() const = 0; virtual torch::List padding() const = 0; diff --git a/aten/src/ATen/native/quantized/QTensor.cpp b/aten/src/ATen/native/quantized/QTensor.cpp index 9705de0a4a54d..a6817984c12d2 100644 --- a/aten/src/ATen/native/quantized/QTensor.cpp +++ b/aten/src/ATen/native/quantized/QTensor.cpp @@ -188,13 +188,13 @@ QScheme qscheme_quant(const Tensor& self) { Tensor quantized_clone( const Tensor& self, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { auto memory_format = optional_memory_format.value_or(MemoryFormat::Contiguous); // TODO: To support all features of MemoryFormat::Preserve we need to add // _empty_affine_quantized_strided function and use it similarly to - // Tensor clone(const Tensor& src, c10::optional + // Tensor clone(const Tensor& src, std::optional // optional_memory_format) if (self.is_non_overlapping_and_dense()) -> // 
_empty_affine_quantized_strided if (memory_format == MemoryFormat::Preserve) { diff --git a/aten/src/ATen/native/quantized/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/quantized/TensorAdvancedIndexing.cpp index 4f06b133771d9..11b005dc924c9 100644 --- a/aten/src/ATen/native/quantized/TensorAdvancedIndexing.cpp +++ b/aten/src/ATen/native/quantized/TensorAdvancedIndexing.cpp @@ -121,7 +121,7 @@ Tensor & masked_fill__quantized_cuda(Tensor& self, const Tensor & mask, const Te return masked_fill_impl_quantized_cuda(self, mask, value.item()); } -Tensor& _index_put_impl_quantized_cpu_(Tensor & self, const torch::List>& indices, const Tensor & value, const bool accumulate, const bool unsafe) { +Tensor& _index_put_impl_quantized_cpu_(Tensor & self, const torch::List>& indices, const Tensor & value, const bool accumulate, const bool unsafe) { TORCH_CHECK_INDEX(indices.size() <= (size_t)self.dim(), "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); TORCH_CHECK(!value.is_quantized(), "Value argument for quantized input_put should not be quantized"); TORCH_CHECK(self.qscheme() == c10::kPerTensorAffine, "index_put for quantized tensors is currently only supported for per tensor quantized tensors"); @@ -145,7 +145,7 @@ Tensor& _index_put_impl_quantized_cpu_(Tensor & self, const torch::List& index: indices) { + for (const std::optional& index: indices) { if (index.has_value()) { at::assert_no_overlap(self, *index); } @@ -157,7 +157,7 @@ Tensor& _index_put_impl_quantized_cpu_(Tensor & self, const torch::List>& indices, const Tensor & value, const bool accumulate, const bool unsafe) { +Tensor& _index_put_impl_quantized_cuda_(Tensor & self, const torch::List>& indices, const Tensor & value, const bool accumulate, const bool unsafe) { TORCH_CHECK_INDEX(indices.size() <= (size_t)self.dim(), "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); TORCH_CHECK(!value.is_quantized(), "Value argument for quantized input_put should not be quantized"); TORCH_CHECK(self.qscheme() == c10::kPerTensorAffine, "index_put for quantized tensors is currently only supported for per tensor quantized tensors"); @@ -183,7 +183,7 @@ Tensor& _index_put_impl_quantized_cuda_(Tensor & self, const torch::List& index: indices) { + for (const std::optional& index: indices) { if (index.has_value()) { at::assert_no_overlap(self, *index); } diff --git a/aten/src/ATen/native/quantized/TensorCompare.cpp b/aten/src/ATen/native/quantized/TensorCompare.cpp index def1622863e1d..2cc6ebcda603f 100644 --- a/aten/src/ATen/native/quantized/TensorCompare.cpp +++ b/aten/src/ATen/native/quantized/TensorCompare.cpp @@ -47,7 +47,7 @@ Tensor& min_quantized_unary_out(const Tensor& self, Tensor& out) { std::tuple sort_quantized_cpu_stable( const Tensor& self, - c10::optional stable, + std::optional stable, int64_t dim, bool descending) { auto [sort_int, sort_indicies] = diff --git a/aten/src/ATen/native/quantized/TensorFactories.cpp b/aten/src/ATen/native/quantized/TensorFactories.cpp index e79f657e0de95..54dcdc37c5b23 100644 --- a/aten/src/ATen/native/quantized/TensorFactories.cpp +++ b/aten/src/ATen/native/quantized/TensorFactories.cpp @@ -14,13 +14,13 @@ namespace native { // change to use quantizer Tensor empty_affine_quantized( IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, double scale, int64_t zero_point, - 
c10::optional optional_memory_format) { + std::optional optional_memory_format) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options_ = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -44,11 +44,11 @@ Tensor empty_per_channel_affine_quantized( const Tensor& scales, const Tensor& zero_points, int64_t axis, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional optional_memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional optional_memory_format) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options_ = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -70,11 +70,11 @@ Tensor empty_per_channel_affine_quantized( Tensor empty_unknown_quantized( IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional optional_memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional optional_memory_format) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options_ = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -93,10 +93,10 @@ Tensor empty_unknown_quantized( Tensor empty_strided_unknown_quantized( IntArrayRef size, IntArrayRef strided, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { TORCH_CHECK(false, "empty_strided not supported on quantized tensors yet see https://github.com/pytorch/pytorch/issues/74540") @@ -105,13 +105,13 @@ Tensor empty_strided_unknown_quantized( // Provide better error message if dtype is wrong Tensor empty_affine_quantized_other_backends_stub( IntArrayRef, - c10::optional, - c10::optional, - c10::optional, - c10::optional, + std::optional, + std::optional, + std::optional, + std::optional, double, int64_t, - c10::optional) { + std::optional) { TORCH_CHECK(false, "Creation of quantized tensor requires quantized dtype like torch.quint8"); } @@ -120,11 +120,11 @@ Tensor empty_per_channel_affine_quantized_other_backends_stub( const Tensor&, const Tensor&, int64_t, - c10::optional, - c10::optional, - c10::optional, - c10::optional, - c10::optional) { + std::optional, + std::optional, + std::optional, + std::optional, + std::optional) { TORCH_CHECK(false, "Creation of quantized tensor requires quantized dtype like torch.quint8"); } @@ -133,11 +133,11 @@ Tensor empty_per_channel_affine_quantized_other_backends_stub( Tensor empty_quantized( IntArrayRef size, const Tensor& qtensor, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional memory_format) { TensorOptions specified_options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); diff --git a/aten/src/ATen/native/quantized/cpu/AveragePool2d.cpp b/aten/src/ATen/native/quantized/cpu/AveragePool2d.cpp index 754c7d6bd529b..d7b53f8457868 100644 --- a/aten/src/ATen/native/quantized/cpu/AveragePool2d.cpp +++ b/aten/src/ATen/native/quantized/cpu/AveragePool2d.cpp @@ -47,7 +47,7 @@ static void 
avg_pool2d_out_frame( int padW, int padH, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { Tensor input_contig = input.contiguous(); auto input_data = input_contig.data_ptr(); auto output_data = output.data_ptr(); @@ -185,7 +185,7 @@ Tensor q_avg_pool2d( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) auto [kW, kH] = get_kernel(kernel_size); auto [dW, dH] = get_stride(stride, kW, kH); @@ -265,7 +265,7 @@ Tensor qnnpack_avg_pool2d( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { auto [kW, kH] = get_kernel(kernel_size); auto [dW, dH] = get_stride(stride, kW, kH); auto [padW, padH] = get_padding(padding); @@ -362,7 +362,7 @@ Tensor avg_pool2d_quantized_cpu( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { Tensor output; #ifdef USE_PYTORCH_QNNPACK if (at::globalContext().qEngine() == at::QEngine::QNNPACK && diff --git a/aten/src/ATen/native/quantized/cpu/AveragePool3d.cpp b/aten/src/ATen/native/quantized/cpu/AveragePool3d.cpp index 875ae28e46a96..b83e3e313cd08 100644 --- a/aten/src/ATen/native/quantized/cpu/AveragePool3d.cpp +++ b/aten/src/ATen/native/quantized/cpu/AveragePool3d.cpp @@ -100,7 +100,7 @@ Tensor q_avg_pool3d( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { auto [kW, kH, kD] = get_kernel(kernel_size); auto [dW, dH, dD] = get_stride(stride, kW, kH, kD); auto [padW, padH, padD] = get_padding(padding); @@ -165,7 +165,7 @@ Tensor avg_pool3d_quantized_cpu( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { Tensor output; AT_DISPATCH_QINT_TYPES(input.scalar_type(), "avg_pool3d_quantized_cpu", [&]() { output = q_avg_pool3d( diff --git a/aten/src/ATen/native/quantized/cpu/EmbeddingPackedParams.h b/aten/src/ATen/native/quantized/cpu/EmbeddingPackedParams.h index 140b716df2691..e6f47d611a19f 100644 --- a/aten/src/ATen/native/quantized/cpu/EmbeddingPackedParams.h +++ b/aten/src/ATen/native/quantized/cpu/EmbeddingPackedParams.h @@ -6,19 +6,19 @@ struct EmbeddingPackedParamsBase : public torch::jit::CustomClassHolder { virtual at::Tensor embeddingbag_byte( const at::Tensor& indices, - const c10::optional& offsets, + const std::optional& offsets, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset, bool is_embedding_op) = 0; virtual at::Tensor embeddingbag_4bit( const at::Tensor& indices, - const c10::optional& offsets, + const std::optional& offsets, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset, bool is_embedding_op) = 0; diff --git a/aten/src/ATen/native/quantized/cpu/LinearUnpackImpl.cpp b/aten/src/ATen/native/quantized/cpu/LinearUnpackImpl.cpp index 7bff3e3d4b443..df74b10d70f97 100644 --- a/aten/src/ATen/native/quantized/cpu/LinearUnpackImpl.cpp +++ b/aten/src/ATen/native/quantized/cpu/LinearUnpackImpl.cpp @@ 
-22,7 +22,7 @@ int register_linear_params(); #ifdef USE_FBGEMM -std::tuple> PackedLinearWeight::unpack() { +std::tuple> PackedLinearWeight::unpack() { auto packB = w.get(); int64_t N = static_cast(packB->numCols()); @@ -53,16 +53,16 @@ std::tuple> PackedLinearWeight::unpack() { // (QLinearUnpackWeightInt8): "); packB->unpack(weight_ptr_int8); - return std::tuple>( + return std::tuple>( weight_origin, bias_); } #endif // USE_FBGEMM #ifdef USE_PYTORCH_QNNPACK -std::tuple> PackedLinearWeightsQnnp:: +std::tuple> PackedLinearWeightsQnnp:: unpack() { if (orig_weight.defined()) { - return std::tuple>( + return std::tuple>( orig_weight, bias_); } else { // Unpacking requires reverting *make_zero_points_and_scales_tensor* @@ -110,14 +110,14 @@ std::tuple> PackedLinearWeightsQnnp:: weight_ptr_int8[i] = (int8_t)(weight_ptr_int8[i] - 128); } - return std::tuple>( + return std::tuple>( weight_origin, bias_); } } #endif // USE_PYTORCH_QNNPACK #ifdef USE_FBGEMM -std::tuple> PackedLinearWeightFp16:: +std::tuple> PackedLinearWeightFp16:: unpack() { auto& packed_weight_ptr = w; @@ -135,8 +135,8 @@ std::tuple> PackedLinearWeightFp16:: #endif // USE_FBGEMM #if AT_MKLDNN_ENABLED() -std::tuple> PackedLinearWeightsOnednn::unpack() { - return std::tuple>( +std::tuple> PackedLinearWeightsOnednn::unpack() { + return std::tuple>( orig_weight_, orig_bias_); } #endif // #if AT_MKLDNN_ENABLED() diff --git a/aten/src/ATen/native/quantized/cpu/Normalization.cpp b/aten/src/ATen/native/quantized/cpu/Normalization.cpp index 0f5fb9884a9c5..e92a9669cce04 100644 --- a/aten/src/ATen/native/quantized/cpu/Normalization.cpp +++ b/aten/src/ATen/native/quantized/cpu/Normalization.cpp @@ -54,8 +54,8 @@ void compute_fused_params( template Tensor q_batch_norm1d_impl( Tensor qx, - c10::optional mb_weight, - c10::optional mb_bias, + std::optional mb_weight, + std::optional mb_bias, Tensor mean, Tensor var, double eps, @@ -162,8 +162,8 @@ Tensor q_batch_norm1d_impl( template Tensor q_batch_norm2d_impl( Tensor qx, - c10::optional mb_weight, - c10::optional mb_bias, + std::optional mb_weight, + std::optional mb_bias, Tensor mean, Tensor var, double eps, @@ -256,8 +256,8 @@ Tensor q_batch_norm2d_impl( template Tensor q_batch_norm3d_impl( Tensor qx, - c10::optional mb_weight, - c10::optional mb_bias, + std::optional mb_weight, + std::optional mb_bias, Tensor mean, Tensor var, double eps, @@ -353,8 +353,8 @@ Tensor q_batch_norm3d_impl( template Tensor q_batch_norm_impl( Tensor qx, - c10::optional mb_weight, - c10::optional mb_bias, + std::optional mb_weight, + std::optional mb_bias, Tensor mean, Tensor var, double eps, @@ -380,7 +380,7 @@ Tensor q_batch_norm_impl( } // namespace Tensor quantized_batch_norm( - const Tensor& qx, const c10::optional& weight_opt /* optional */, const c10::optional& bias_opt /* optional */, + const Tensor& qx, const std::optional& weight_opt /* optional */, const c10::optional& bias_opt /* optional */, const Tensor& mean /* optional */, const Tensor& var /* optional */, double eps, diff --git a/aten/src/ATen/native/quantized/cpu/OnednnUtils.h b/aten/src/ATen/native/quantized/cpu/OnednnUtils.h index 8887bb83deb91..535ccaf9acba1 100644 --- a/aten/src/ATen/native/quantized/cpu/OnednnUtils.h +++ b/aten/src/ATen/native/quantized/cpu/OnednnUtils.h @@ -119,9 +119,9 @@ enum PostOps { struct PackedLinearWeightsOnednn : public LinearPackedParamsBase { PackedLinearWeightsOnednn( std::unique_ptr weight, - c10::optional bias, + std::optional bias, at::Tensor orig_weight, - c10::optional orig_bias) + std::optional orig_bias) : 
weight_(std::move(weight)), bias_(std::move(bias)), orig_weight_(std::move(orig_weight)), @@ -129,9 +129,9 @@ struct PackedLinearWeightsOnednn : public LinearPackedParamsBase { cache_initialized_flag = std::make_unique(); } std::unique_ptr weight_; - c10::optional bias_; + std::optional bias_; at::Tensor orig_weight_; - c10::optional orig_bias_; + std::optional orig_bias_; at::Tensor apply( at::Tensor input, @@ -156,15 +156,15 @@ struct PackedLinearWeightsOnednn : public LinearPackedParamsBase { double output_scale, int64_t output_zero_point); - std::tuple> unpack() override; + std::tuple> unpack() override; - c10::optional bias() override { + std::optional bias() override { return orig_bias_; } static c10::intrusive_ptr prepack( at::Tensor weight, - c10::optional bias); + std::optional bias); private: LinearPrimitiveCache prim_cache; @@ -189,9 +189,9 @@ template struct PackedConvWeightsOnednn : public ConvPackedParamsBase { PackedConvWeightsOnednn( std::unique_ptr weight, - c10::optional bias, + std::optional bias, at::Tensor orig_weight, - c10::optional orig_bias, + std::optional orig_bias, torch::List stride, torch::List padding, torch::List output_padding, @@ -212,9 +212,9 @@ struct PackedConvWeightsOnednn : public ConvPackedParamsBase { } std::unique_ptr weight_; - c10::optional bias_; + std::optional bias_; at::Tensor orig_weight_; - c10::optional orig_bias_; + std::optional orig_bias_; torch::List stride_; torch::List padding_; torch::List output_padding_; @@ -248,11 +248,11 @@ struct PackedConvWeightsOnednn : public ConvPackedParamsBase { double output_scale, int64_t output_zero_point); - std::tuple> unpack() override; + std::tuple> unpack() override; static c10::intrusive_ptr> prepack( at::Tensor weight, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List output_padding, @@ -292,7 +292,7 @@ struct PackedConvWeightsOnednn : public ConvPackedParamsBase { template at::Tensor apply_impl( const at::Tensor& input, - const c10::optional& accum, + const std::optional& accum, double output_scale, int64_t output_zero_point); @@ -316,7 +316,7 @@ static ideep::attr_t create_attr_by_post_op( int64_t input1_zero_point, const ideep::tensor::desc& input1_desc, const c10::string_view& unary_post_op, - const torch::List>& unary_post_op_args, + const torch::List>& unary_post_op_args, const c10::string_view& unary_post_op_algorithm) { using ideep::tensor; if (binary_post_op == "none") { @@ -470,7 +470,7 @@ at::Tensor _qconv_prepack_onednn( torch::List padding, torch::List dilation, int64_t groups, - c10::optional> input_shape=c10::nullopt); + std::optional> input_shape=c10::nullopt); static at::Tensor _quantized_convolution_onednn( at::Tensor act, // contains quantized values but not QTensor @@ -479,7 +479,7 @@ static at::Tensor _quantized_convolution_onednn( at::Tensor weight, // MKLDNN tensor with quantized values at::Tensor weight_scales, at::Tensor weight_zero_points, - c10::optional bias, // Bias is packed if not None + std::optional bias, // Bias is packed if not None torch::List stride, torch::List padding, torch::List dilation, @@ -487,14 +487,14 @@ static at::Tensor _quantized_convolution_onednn( int64_t groups, double output_scale, int64_t output_zero_point, - c10::optional accum=c10::nullopt, // accum to fused with conv add + std::optional accum=c10::nullopt, // accum to fused with conv add double accum_scale=1.0, int64_t accum_zero_point=0, bool fp32_output=false, - c10::optional binary_attr=c10::nullopt, - c10::optional 
binary_alpha=c10::nullopt, - c10::optional unary_attr=c10::nullopt, - torch::List> unary_scalars=torch::List>(), - c10::optional unary_algorithm=c10::nullopt); + std::optional binary_attr=c10::nullopt, + std::optional binary_alpha=c10::nullopt, + std::optional unary_attr=c10::nullopt, + torch::List> unary_scalars=torch::List>(), + std::optional unary_algorithm=c10::nullopt); #endif // #if AT_MKLDNN_ENABLED() diff --git a/aten/src/ATen/native/quantized/cpu/QnnpackUtils.h b/aten/src/ATen/native/quantized/cpu/QnnpackUtils.h index 88ff258be891f..b217c757740b3 100644 --- a/aten/src/ATen/native/quantized/cpu/QnnpackUtils.h +++ b/aten/src/ATen/native/quantized/cpu/QnnpackUtils.h @@ -38,7 +38,7 @@ struct PackedLinearWeightsQnnp : public LinearPackedParamsBase { std::unique_ptr w, at::Tensor orig_weight, at::Tensor bias, - c10::optional input_scale, + std::optional input_scale, at::Tensor w_scales, std::vector&& w_zps) : w(std::move(w)), @@ -57,7 +57,7 @@ struct PackedLinearWeightsQnnp : public LinearPackedParamsBase { at::Tensor orig_weight; at::Tensor bias_; bool per_channel_; - c10::optional input_scale; + std::optional input_scale; at::Tensor w_scales; std::vector w_zero_points; std::vector requantization_scales; @@ -76,15 +76,15 @@ struct PackedLinearWeightsQnnp : public LinearPackedParamsBase { at::Tensor apply_dynamic(at::Tensor input, bool reduce_range=false) override; at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range=false) override; - std::tuple> unpack() override; + std::tuple> unpack() override; - c10::optional bias() override { + std::optional bias() override { return bias_; } static c10::intrusive_ptr prepack( at::Tensor weight, - c10::optional bias); + std::optional bias); bool per_channel() const { return per_channel_; @@ -125,7 +125,7 @@ struct PackedConvWeightsQnnp : public ConvPackedParamsBase { torch::List dilation, int64_t groups, bool transpose, - c10::optional input_scale, + std::optional input_scale, std::vector kernel, at::Tensor w_scale, std::vector&& w_zps, @@ -302,7 +302,7 @@ struct PackedConvWeightsQnnp : public ConvPackedParamsBase { int64_t groups_; bool transpose_; bool is_per_channel_; - c10::optional input_scale; + std::optional input_scale; std::vector kernel_; at::Tensor w_scales; std::vector w_zero_points; @@ -323,11 +323,11 @@ struct PackedConvWeightsQnnp : public ConvPackedParamsBase { const at::Tensor& input, bool reduce_range=false) override; - std::tuple> unpack() override; + std::tuple> unpack() override; static c10::intrusive_ptr> prepack( at::Tensor weight, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List output_padding, @@ -438,7 +438,7 @@ Tensor qnnpack_avg_pool2d( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override); + std::optional divisor_override); } // qnnp_avgpool_helper } // namespace native } // namespace at diff --git a/aten/src/ATen/native/quantized/cpu/QuantizedOps.h b/aten/src/ATen/native/quantized/cpu/QuantizedOps.h index 3ef8a3f4f4f42..9257f57b65dcd 100644 --- a/aten/src/ATen/native/quantized/cpu/QuantizedOps.h +++ b/aten/src/ATen/native/quantized/cpu/QuantizedOps.h @@ -129,7 +129,7 @@ using qavg_pool2d_fn = void (*)( int padW, int padH, bool count_include_pad, - c10::optional divisor_override); + std::optional divisor_override); using qavg_pool3d_fn = void (*)( const Tensor& qx, @@ -152,7 +152,7 @@ using qavg_pool3d_fn = void (*)( int padH, int padD, bool count_include_pad, - c10::optional divisor_override); + std::optional 
divisor_override); using qupsample_bilinear2d_fn = void (*)( Tensor& output, @@ -164,8 +164,8 @@ using qupsample_bilinear2d_fn = void (*)( int64_t nbatch, int64_t channels, bool align_corners, - c10::optional scales_h, - c10::optional scales_w); + std::optional scales_h, + std::optional scales_w); using qcat_nhwc_fn = Tensor (*)( const MaterializedITensorListRef& qxs, @@ -192,13 +192,13 @@ using qmean_inner_dim_fn = void (*)( const Tensor& /* X */, OptionalIntArrayRef /* opt_dim */, bool /* keepdim */, - c10::optional /* opt_dtype */, + std::optional /* opt_dtype */, Tensor& /* Y */); using qstd_inner_dim_fn = void (*)( const Tensor& /* X */, OptionalIntArrayRef /* dim */, - const c10::optional& /* correction */, + const std::optional& /* correction */, bool /* keepdim */, Tensor& /* Y */); diff --git a/aten/src/ATen/native/quantized/cpu/ReduceOps.cpp b/aten/src/ATen/native/quantized/cpu/ReduceOps.cpp index 0ad1a5ae013bc..113c57f2cc351 100644 --- a/aten/src/ATen/native/quantized/cpu/ReduceOps.cpp +++ b/aten/src/ATen/native/quantized/cpu/ReduceOps.cpp @@ -47,7 +47,7 @@ inline bool is_innnermost_dim( inline bool is_mean_inner_dim_fast_path( const Tensor& self, OptionalIntArrayRef opt_dim, - c10::optional opt_dtype) { + std::optional opt_dtype) { bool is_fast_path = is_innnermost_dim(self, opt_dim) && (!opt_dtype.has_value() || opt_dtype.value() == self.scalar_type()); @@ -131,7 +131,7 @@ Tensor& mean_out_quantized_cpu( const Tensor& self, OptionalIntArrayRef opt_dim, bool keepdim, - c10::optional opt_dtype, + std::optional opt_dtype, Tensor& result) { #ifdef USE_PYTORCH_QNNPACK if (at::globalContext().qEngine() == at::QEngine::QNNPACK && @@ -177,7 +177,7 @@ static Tensor& mean_out_quantized_cpu( const Tensor& self, DimnameList dim, bool keepdim, - c10::optional opt_dtype) { + std::optional opt_dtype) { return mean_out_quantized_cpu( self, dimnames_to_positions(self, dim), keepdim, opt_dtype, result); } @@ -186,7 +186,7 @@ static Tensor& mean_out_quantized_cpu( inline bool is_std_inner_dim_fast_path( const Tensor& self, OptionalIntArrayRef dim, - const c10::optional& correction) { + const std::optional& correction) { // Do not enter fast path if there are too few elements IntArrayRef dims = dim.has_value() ? 
dim.value() : IntArrayRef(); auto all_dims = std::vector(self.dim()); @@ -206,7 +206,7 @@ inline bool is_std_inner_dim_fast_path( Tensor& std_out_quantized_cpu( const Tensor& self, OptionalIntArrayRef dim, - const c10::optional& correction, + const std::optional& correction, bool keepdim, Tensor& result) { // Fast path @@ -230,7 +230,7 @@ Tensor& std_out_quantized_cpu( Tensor std_quantized_cpu( const Tensor& self, OptionalIntArrayRef dim, - const c10::optional& correction, + const std::optional& correction, bool keepdim) { Tensor result; std_out_quantized_cpu(self, dim, correction, keepdim, result); @@ -240,7 +240,7 @@ Tensor std_quantized_cpu( static Tensor std_quantized_cpu( const Tensor& self, DimnameList dim, - const c10::optional& correction, + const std::optional& correction, bool keepdim) { return std_quantized_cpu( self, dimnames_to_positions(self, dim), correction, keepdim); @@ -250,7 +250,7 @@ static Tensor& std_out_quantized_cpu( Tensor& result, const Tensor& self, DimnameList dim, - const c10::optional& correction, + const std::optional& correction, bool keepdim) { return std_out_quantized_cpu( self, dimnames_to_positions(self, dim), correction, keepdim, result); diff --git a/aten/src/ATen/native/quantized/cpu/TensorOperators.cpp b/aten/src/ATen/native/quantized/cpu/TensorOperators.cpp index 1ee305c64fc5f..388218c01ca02 100644 --- a/aten/src/ATen/native/quantized/cpu/TensorOperators.cpp +++ b/aten/src/ATen/native/quantized/cpu/TensorOperators.cpp @@ -81,7 +81,7 @@ AT_FORALL_OPERATORS(DEFINE_COMPARATOR) const Tensor& quantized_resize_cpu_( const Tensor& self, IntArrayRef size, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { // See Note [Writing Nondeterministic Operations] // Nondeterministic because if storage is resized, new elements are uninitialized globalContext().alertNotDeterministic("quantized_resize_cpu_"); diff --git a/aten/src/ATen/native/quantized/cpu/TensorShape.cpp b/aten/src/ATen/native/quantized/cpu/TensorShape.cpp index 58af539cb142f..4c810ef97b5bc 100644 --- a/aten/src/ATen/native/quantized/cpu/TensorShape.cpp +++ b/aten/src/ATen/native/quantized/cpu/TensorShape.cpp @@ -126,8 +126,8 @@ template Tensor qcat( const c10::List& qxs, int64_t dim, - c10::optional scale, - c10::optional zero_point) { + std::optional scale, + std::optional zero_point) { TORCH_CHECK(is_valid_quantization_scheme(qxs[0]), "Only per-tensor quantization is supported in 'cat'!") double _scale = scale.has_value() ? 
scale.value() : qxs.get(0).q_scale(); diff --git a/aten/src/ATen/native/quantized/cpu/UpSampleBilinear2d.cpp b/aten/src/ATen/native/quantized/cpu/UpSampleBilinear2d.cpp index f428745eaa86f..d4dfa7ff08c91 100644 --- a/aten/src/ATen/native/quantized/cpu/UpSampleBilinear2d.cpp +++ b/aten/src/ATen/native/quantized/cpu/UpSampleBilinear2d.cpp @@ -46,8 +46,8 @@ static void upsample_bilinear2d_out_frame( int64_t nbatch, int64_t channels, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { auto* idata = static_cast(input.const_data_ptr()); auto* odata = static_cast(output.data_ptr()); @@ -146,8 +146,8 @@ Tensor upsample_bilinear2d_quantized_cpu( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { TORCH_CHECK( output_size.size() == 2, "It is expected output_size equals to 2, but got size ", @@ -223,7 +223,7 @@ static Tensor upsample_bilinear2d_quantized_cpu( const Tensor& input, at::OptionalIntArrayRef output_size, bool align_corners, - c10::optional> scale_factors) { + std::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_h = get_scale_value(scale_factors, 0); auto scale_w = get_scale_value(scale_factors, 1); diff --git a/aten/src/ATen/native/quantized/cpu/UpSampleNearest2d.cpp b/aten/src/ATen/native/quantized/cpu/UpSampleNearest2d.cpp index 1020aef797e50..191407bed66a8 100644 --- a/aten/src/ATen/native/quantized/cpu/UpSampleNearest2d.cpp +++ b/aten/src/ATen/native/quantized/cpu/UpSampleNearest2d.cpp @@ -36,8 +36,8 @@ static void upsample_nearest2d_out_frame( int64_t output_width, int64_t nbatch, int64_t channels, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { float height_scale = compute_scales_value(scales_h, input_height, output_height); float width_scale = compute_scales_value(scales_w, input_width, output_width); @@ -92,8 +92,8 @@ static void upsample_nearest2d_out_frame_nhwc( int64_t output_width, int64_t nbatch, int64_t channels, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { float height_scale = compute_scales_value(scales_h, input_height, output_height); float width_scale = compute_scales_value(scales_w, input_width, output_width); @@ -121,8 +121,8 @@ template Tensor _upsample_nearest2d_quantized_cpu( const Tensor& input, IntArrayRef output_size, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { TORCH_CHECK( output_size.size() == 2, "It is expected output_size equals to 2, but got size ", @@ -205,23 +205,23 @@ using at::native::upsample::get_scale_value; Tensor upsample_nearest2d_quantized_cpu( const Tensor& input, IntArrayRef osize, - c10::optional scale_h, - c10::optional scale_w) { + std::optional scale_h, + std::optional scale_w) { return _upsample_nearest2d_quantized_cpu(input, osize, scale_h, scale_w); } Tensor _upsample_nearest_exact2d_quantized_cpu( const Tensor& input, IntArrayRef osize, - c10::optional scale_h, - c10::optional scale_w) { + std::optional scale_h, + std::optional scale_w) { return _upsample_nearest2d_quantized_cpu(input, osize, scale_h, scale_w); } static Tensor upsample_nearest2d_quantized_cpu( const Tensor& input, at::OptionalIntArrayRef output_size, - c10::optional> scale_factors) { + std::optional> scale_factors) { auto osize 
= compute_output_size(input.sizes(), output_size, scale_factors); auto scale_h = get_scale_value(scale_factors, 0); auto scale_w = get_scale_value(scale_factors, 1); @@ -231,7 +231,7 @@ static Tensor upsample_nearest2d_quantized_cpu( static Tensor _upsample_nearest_exact2d_quantized_cpu( const Tensor& input, at::OptionalIntArrayRef output_size, - c10::optional> scale_factors) { + std::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_h = get_scale_value(scale_factors, 0); auto scale_w = get_scale_value(scale_factors, 1); diff --git a/aten/src/ATen/native/quantized/cpu/UpSampleNearest3d.cpp b/aten/src/ATen/native/quantized/cpu/UpSampleNearest3d.cpp index 91ddfefcd4d4e..d98883123f057 100644 --- a/aten/src/ATen/native/quantized/cpu/UpSampleNearest3d.cpp +++ b/aten/src/ATen/native/quantized/cpu/UpSampleNearest3d.cpp @@ -36,9 +36,9 @@ static void upsample_nearest3d_out_frame( int64_t output_width, int64_t nbatch, int64_t channels, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_d, + std::optional scales_h, + std::optional scales_w) { float depth_scale = compute_scales_value(scales_d, input_depth, output_depth); float height_scale = compute_scales_value(scales_h, input_height, output_height); float width_scale = compute_scales_value(scales_w, input_width, output_width); @@ -93,9 +93,9 @@ static void upsample_nearest3d_out_frame_nhwc( int64_t output_width, int64_t nbatch, int64_t channels, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_d, + std::optional scales_h, + std::optional scales_w) { float depth_scale = compute_scales_value(scales_d, input_depth, output_depth); float height_scale = compute_scales_value(scales_h, input_height, output_height); float width_scale = compute_scales_value(scales_w, input_width, output_width); @@ -133,9 +133,9 @@ template Tensor _upsample_nearest3d_quantized_cpu( const Tensor& input, IntArrayRef output_size, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_d, + std::optional scales_h, + std::optional scales_w) { TORCH_CHECK( output_size.size() == 3, "It is expected output_size equals to 3, but got size ", @@ -217,9 +217,9 @@ Tensor _upsample_nearest3d_quantized_cpu( Tensor upsample_nearest3d_quantized_cpu( const Tensor& input, IntArrayRef osize, - c10::optional scale_d, - c10::optional scale_h, - c10::optional scale_w) { + std::optional scale_d, + std::optional scale_h, + std::optional scale_w) { return _upsample_nearest3d_quantized_cpu( input, osize, scale_d, scale_h, scale_w); } @@ -227,9 +227,9 @@ Tensor upsample_nearest3d_quantized_cpu( Tensor _upsample_nearest_exact3d_quantized_cpu( const Tensor& input, IntArrayRef osize, - c10::optional scale_d, - c10::optional scale_h, - c10::optional scale_w) { + std::optional scale_d, + std::optional scale_h, + std::optional scale_w) { return _upsample_nearest3d_quantized_cpu( input, osize, scale_d, scale_h, scale_w); } diff --git a/aten/src/ATen/native/quantized/cpu/conv_serialization.h b/aten/src/ATen/native/quantized/cpu/conv_serialization.h index 9f452a1cc7213..85451fb57482a 100644 --- a/aten/src/ATen/native/quantized/cpu/conv_serialization.h +++ b/aten/src/ATen/native/quantized/cpu/conv_serialization.h @@ -73,7 +73,7 @@ using ConvParamsSerializationTypeV2 = std::tuple< // non-optional tensors std::vector, // optional tensors - std::vector>>; + std::vector>>; using 
ConvParamsSerializationTypeV3 = std::tuple< // version, int for versions 3 and up @@ -81,7 +81,7 @@ using ConvParamsSerializationTypeV3 = std::tuple< // configuration values std::vector, // optional tensors - std::vector>>; + std::vector>>; // Parses any historical conv packed params format into // the current format. @@ -119,7 +119,7 @@ ConvParamsSerializationTypeV3 parse_conv_serialized_state(c10::IValue v) { const auto& elements = v.toTupleRef().elements(); at::Tensor weight = elements[0].toTensor(); - c10::optional bias = elements[1].toOptional(); + std::optional bias = elements[1].toOptional(); torch::List stride_x_kSpatialDim = elements[2].toTensorList(); torch::List padding_x_kSpatialDim = elements[3].toTensorList(); torch::List dilation_x_kSpatialDim = elements[4].toTensorList(); @@ -150,7 +150,7 @@ ConvParamsSerializationTypeV3 parse_conv_serialized_state(c10::IValue v) { // transpose does not exist in v1, so we fill in a default value config_vals.push_back(0); - std::vector> tensors; + std::vector> tensors; tensors.emplace_back(); tensors.emplace_back(weight); tensors.emplace_back(bias); @@ -161,7 +161,7 @@ ConvParamsSerializationTypeV3 parse_conv_serialized_state(c10::IValue v) { // version 2 const auto& elements = v.toTupleRef().elements(); std::vector non_optional = elements[1].toTensorList().vec(); - std::vector> optional; + std::vector> optional; if (elements[2].isTensorList()) { for (const auto& elem : elements[2].toTensorList()) { @@ -187,7 +187,7 @@ ConvParamsSerializationTypeV3 parse_conv_serialized_state(c10::IValue v) { auto weight = non_optional[1]; auto bias = optional[0]; - std::vector> tensors; + std::vector> tensors; tensors.emplace_back(); tensors.emplace_back(weight); tensors.emplace_back(bias); @@ -213,7 +213,7 @@ ConvParamsSerializationTypeV2 serialize_conv( std::string version = "2"; std::vector non_optional; - std::vector> optional; + std::vector> optional; // create a packed int8_t tensor for conv params std::vector params_vec; @@ -267,7 +267,7 @@ ConvParamsSerializationTypeV3 serialize_conv( auto [weight, bias] = params->unpack(); - std::vector> tensors; + std::vector> tensors; tensors.emplace_back(); tensors.emplace_back(weight); tensors.emplace_back(bias); @@ -287,8 +287,8 @@ c10::intrusive_ptr> deserialize_conv( TORCH_INTERNAL_ASSERT(version == 3, "Unexpected serialized qconv version: ", version); TORCH_CHECK(tensors.size() == 3, "Wrong number of tensors", tensors.size()); - c10::optional weight = tensors[1]; - c10::optional bias = tensors[2]; + std::optional weight = tensors[1]; + std::optional bias = tensors[2]; TORCH_INTERNAL_ASSERT(weight, "Weight should always be present in serialized qconv."); torch::List stride, padding, output_padding, dilation; diff --git a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp index d942e2f161a26..d6ac157a116b5 100644 --- a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp +++ b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp @@ -433,7 +433,7 @@ TORCH_API int register_conv_params<3>(); TORCH_API int register_linear_params(); TORCH_API int register_linear_params() { - using SerializationType = std::tuple>; + using SerializationType = std::tuple>; static auto register_linear_params = torch::selective_class_( "quantized", TORCH_SELECTIVE_CLASS("LinearPackedParamsBase")) @@ -446,7 +446,7 @@ TORCH_API int register_linear_params() { -> c10::intrusive_ptr< LinearPackedParamsBase> { // __setstate__ at::Tensor weight; - c10::optional bias; + std::optional bias; 
weight = std::move(std::get<0>(state)); bias = std::move(std::get<1>(state)); diff --git a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h index bfaf5b93d667b..75b5047713bb0 100644 --- a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h +++ b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h @@ -23,7 +23,7 @@ struct TORCH_API PackedLinearWeight : public LinearPackedParamsBase { PackedLinearWeight( std::unique_ptr> w, - c10::optional bias, + std::optional bias, std::vector col_offsets, std::vector w_scale, std::vector w_zp, @@ -35,7 +35,7 @@ struct TORCH_API PackedLinearWeight : public LinearPackedParamsBase { w_zp(std::move(w_zp)), q_scheme(std::move(q_scheme)) {} std::unique_ptr> w; - c10::optional bias_; + std::optional bias_; std::vector col_offsets; std::vector w_scale; std::vector w_zp; @@ -79,15 +79,15 @@ struct TORCH_API PackedLinearWeight : public LinearPackedParamsBase { at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range = false) override; - std::tuple> unpack() override; + std::tuple> unpack() override; - c10::optional bias() override { + std::optional bias() override { return bias_; } static c10::intrusive_ptr prepack( at::Tensor weight, - c10::optional bias); + std::optional bias); private: template @@ -110,11 +110,11 @@ struct TORCH_API PackedLinearWeight : public LinearPackedParamsBase { struct TORCH_API PackedLinearWeightFp16 : public LinearPackedParamsBase { PackedLinearWeightFp16( std::unique_ptr w, - c10::optional bias) + std::optional bias) : w(std::move(w)), bias_(std::move(bias)) {} std::unique_ptr w; - c10::optional bias_; + std::optional bias_; at::Tensor apply( at::Tensor /*input*/, @@ -143,17 +143,17 @@ struct TORCH_API PackedLinearWeightFp16 : public LinearPackedParamsBase { at::Tensor& output, bool reduce_range = false) override; - std::tuple> unpack() override; + std::tuple> unpack() override; - c10::optional bias() override { + std::optional bias() override { return bias_; } static c10::intrusive_ptr prepack( at::Tensor weight, - c10::optional bias); + std::optional bias); - void set_bias(c10::optional bias) override; + void set_bias(std::optional bias) override; private: template @@ -164,7 +164,7 @@ template struct TORCH_API PackedConvWeight : public ConvPackedParamsBase { PackedConvWeight( std::unique_ptr> w, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List output_padding, @@ -191,7 +191,7 @@ struct TORCH_API PackedConvWeight : public ConvPackedParamsBase { q_scheme(q_scheme) {} std::unique_ptr> w; - c10::optional bias; + std::optional bias; torch::List stride_; torch::List padding_; torch::List output_padding_; @@ -218,11 +218,11 @@ struct TORCH_API PackedConvWeight : public ConvPackedParamsBase { const at::Tensor& input, bool reduce_range) override; - std::tuple> unpack() override; + std::tuple> unpack() override; static c10::intrusive_ptr> prepack( at::Tensor weight, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List output_padding, @@ -393,19 +393,19 @@ struct TORCH_API PackedEmbeddingBagWeight : public EmbeddingPackedParamsBase { at::Tensor embeddingbag_byte( const at::Tensor& indices, - const c10::optional& offsets, + const std::optional& offsets, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset, bool 
is_embedding_op) override; at::Tensor embeddingbag_4bit( const at::Tensor& indices, - const c10::optional& offsets, + const std::optional& offsets, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset, bool is_embedding_op) override; }; diff --git a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp index dc9063ecf46f1..11828f273bbc8 100644 --- a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp +++ b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp @@ -2023,7 +2023,7 @@ void _qavg_pool_nhwc_kernel( int padH, int padD, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { T* idata = static_cast(qx.data_ptr()); T* odata = static_cast(qy.data_ptr()); int strideC = 1; @@ -2135,7 +2135,7 @@ void qavg_pool2d_nhwc_kernel( int padW, int padH, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "avg_pool2d_nhwc", [&]() { _qavg_pool_nhwc_kernel( qx, @@ -2183,7 +2183,7 @@ void qavg_pool3d_nhwc_kernel( int padH, int padD, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "avg_pool3d_nhwc", [&]() { _qavg_pool_nhwc_kernel( qx, @@ -2288,8 +2288,8 @@ void qupsample_bilinear2d_nhwc_kernel( int64_t nbatch, int64_t channels, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { AT_DISPATCH_QINT_TYPES(input.scalar_type(), "upsample_bilinear2d_nhwc", [&]() { auto* idata = static_cast(input.data_ptr()); auto* odata = static_cast(output.data_ptr()); @@ -2940,7 +2940,7 @@ void qmean_inner_dim_kernel( const Tensor& self, OptionalIntArrayRef opt_dim, bool keepdim, - c10::optional opt_dtype, + std::optional opt_dtype, Tensor& result) { // 'opt_dtype' should be none or equal to that of input ScalarType dtype = self.scalar_type(); @@ -2989,7 +2989,7 @@ void qmean_inner_dim_kernel( void qstd_inner_dim_kernel( const Tensor& self, OptionalIntArrayRef dim, - const c10::optional& correction_opt, + const std::optional& correction_opt, bool keepdim, Tensor& result) { ScalarType dtype = self.scalar_type(); diff --git a/aten/src/ATen/native/quantized/cpu/qconv.cpp b/aten/src/ATen/native/quantized/cpu/qconv.cpp index f915c014af143..82223d6d3314c 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv.cpp @@ -1152,7 +1152,7 @@ template template at::Tensor PackedConvWeightsOnednn::apply_impl( const at::Tensor& act, - const c10::optional& accum, + const std::optional& accum, double output_scale, int64_t output_zero_point) { std::string func_name = "quantized::conv"; @@ -1391,7 +1391,7 @@ static at::Tensor _quantized_convolution_onednn( at::Tensor weight, // MKLDNN tensor with quantized values at::Tensor weight_scales, at::Tensor weight_zero_points, - c10::optional bias, // Bias is not packed into MKLDNN tensor + std::optional bias, // Bias is not packed into MKLDNN tensor torch::List stride, torch::List padding, torch::List dilation, @@ -1399,15 +1399,15 @@ static at::Tensor _quantized_convolution_onednn( int64_t groups, double output_scale, int64_t output_zero_point, - c10::optional accum, // accum to 
fused with conv add + std::optional accum, // accum to fused with conv add double accum_scale, int64_t accum_zero_point, - c10::optional output_dtype, - c10::optional binary_attr, - c10::optional binary_alpha, - c10::optional unary_attr, - torch::List> unary_scalars, - c10::optional unary_algorithm) { + std::optional output_dtype, + std::optional binary_attr, + std::optional binary_alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm) { /*********************************/ /* Checks */ /*********************************/ @@ -1867,17 +1867,17 @@ class QConvoneDNN final { at::Tensor weight, // contains quantized values but not QTensor at::Tensor weight_scales, at::Tensor weight_zero_points, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List dilation, int64_t groups, double output_scale, int64_t output_zero_point, - c10::optional output_dtype, + std::optional output_dtype, c10::string_view attr, - torch::List> scalars, - c10::optional algorithm) { + torch::List> scalars, + std::optional algorithm) { #if AT_MKLDNN_ENABLED() if (act.dim() == 3 || act.dim() == 5) { // Conv1D/3D post op check @@ -1919,19 +1919,19 @@ class QConvoneDNN final { at::Tensor weight, // contains quantized values but not QTensor at::Tensor weight_scales, at::Tensor weight_zero_points, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List dilation, int64_t groups, double output_scale, int64_t output_zero_point, - c10::optional output_dtype, + std::optional output_dtype, c10::string_view binary_attr, - c10::optional alpha, - c10::optional unary_attr, - torch::List> unary_scalars, - c10::optional unary_algorithm) { + std::optional alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm) { #if AT_MKLDNN_ENABLED() // Conv2D post op check TORCH_CHECK( diff --git a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp index 46172f0c199f4..5f76890da2cae 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp @@ -28,7 +28,7 @@ c10::intrusive_ptr> PackedConvWeight< kSpatialDim>:: prepack( at::Tensor weight, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List output_padding, @@ -155,7 +155,7 @@ c10::intrusive_ptr> PackedConvWeight< } } - c10::optional bias_contig; + std::optional bias_contig; if (bias.has_value()) { at::Tensor bias_vec = bias.value(); TORCH_CHECK(bias_vec.dim() == 1, "bias should be a vector (1D Tensor)"); @@ -196,7 +196,7 @@ c10::intrusive_ptr> PackedConvWeightsQnnp< kSpatialDim>:: prepack( at::Tensor weight, - c10::optional bias_in, + std::optional bias_in, torch::List stride, torch::List padding, torch::List output_padding, @@ -313,7 +313,7 @@ c10::intrusive_ptr> PackedConvWeightsQnnp< 2>:: prepack( at::Tensor weight, - c10::optional bias_in, + std::optional bias_in, torch::List stride, torch::List padding, torch::List output_padding, @@ -328,7 +328,7 @@ c10::intrusive_ptr> PackedConvWeightsOnednn< kSpatialDim>:: prepack( at::Tensor weight, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List output_padding, @@ -458,7 +458,7 @@ c10::intrusive_ptr> PackedConvWeightsOnednn< packed_weight_p->set_zero_point(wgt_zero_points); std::unique_ptr weight_ptr(packed_weight_p); // Bias - c10::optional onednn_bias{c10::nullopt}; + std::optional 
onednn_bias{c10::nullopt}; if (bias.has_value()) { at::Tensor bias_vec = bias.value(); TORCH_CHECK(bias_vec.dim() == 1, "bias should be a vector (1D Tensor)"); @@ -468,7 +468,7 @@ c10::intrusive_ptr> PackedConvWeightsOnednn< auto bias_desc = ideep::tensor::desc(bias.value().sizes().vec(), dnnl::memory::data_type::f32); ideep::tensor packed_bias; packed_bias.init(bias_desc, bias.value().data_ptr()); - onednn_bias = c10::optional(packed_bias); + onednn_bias = std::optional(packed_bias); } auto ret_ptr = c10::make_intrusive>( PackedConvWeightsOnednn{ @@ -499,7 +499,7 @@ at::Tensor _qconv_prepack_onednn( torch::List padding, torch::List dilation, int64_t groups, - c10::optional> input_shape) { + std::optional> input_shape) { int kSpatialDim = weight.ndimension() - 2; TORCH_CHECK( weight.ndimension() == kSpatialDim + 2, @@ -624,7 +624,7 @@ class QConvPackWeightInt8 final { public: static c10::intrusive_ptr> run_conv( Tensor weight, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List dilation, @@ -640,7 +640,7 @@ class QConvPackWeightInt8 final { static c10::intrusive_ptr> run_deconv( Tensor weight, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List output_padding, @@ -653,7 +653,7 @@ class QConvPackWeightInt8 final { private: static c10::intrusive_ptr> _run( Tensor weight, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List output_padding, @@ -713,7 +713,7 @@ class QConv1dPackWeightInt8 final { public: static c10::intrusive_ptr> run_conv( Tensor weight, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List dilation, @@ -725,7 +725,7 @@ class QConv1dPackWeightInt8 final { static c10::intrusive_ptr> run_deconv( Tensor weight, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List output_padding, @@ -738,7 +738,7 @@ class QConv1dPackWeightInt8 final { private: static c10::intrusive_ptr> _run( Tensor weight, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List output_padding, @@ -814,7 +814,7 @@ class QConvPrepackOneDNN final { torch::List padding, torch::List dilation, int64_t groups, - c10::optional> input_shape) { + std::optional> input_shape) { #if AT_MKLDNN_ENABLED() return _qconv_prepack_onednn( weight, weight_scales, input_scale, input_zero_point, diff --git a/aten/src/ATen/native/quantized/cpu/qconv_unpack_impl.cpp b/aten/src/ATen/native/quantized/cpu/qconv_unpack_impl.cpp index 8af8d62f2f8a9..4f11cc2bc9393 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv_unpack_impl.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv_unpack_impl.cpp @@ -11,7 +11,7 @@ #ifdef USE_FBGEMM template -std::tuple> PackedConvWeight< +std::tuple> PackedConvWeight< kSpatialDim>::unpack() { auto* packed_weights_p = w.get(); // output channels @@ -90,19 +90,19 @@ std::tuple> PackedConvWeight< at::native::fbgemm_utils::TransposeConvTensorUnpackConversion< kSpatialDim>(unpacked_weights, groups); } - return std::tuple>( + return std::tuple>( unpacked_weights, bias); } -template std::tuple> PackedConvWeight< +template std::tuple> PackedConvWeight< 2>::unpack(); -template std::tuple> PackedConvWeight< +template std::tuple> PackedConvWeight< 3>::unpack(); #endif // USE_FBGEMM #ifdef USE_PYTORCH_QNNPACK template -std::tuple> PackedConvWeightsQnnp< +std::tuple> PackedConvWeightsQnnp< kSpatialDim>::unpack() { TORCH_CHECK( kSpatialDim == 2, @@ -112,25 +112,25 @@ 
std::tuple> PackedConvWeightsQnnp< orig_weight.defined(), "Cannot unpack weights. " "Call at::globalContext()::setReleaseOriginalWeights(false) before packing or loading to enable unpacking."); - return std::tuple>(orig_weight, bias); + return std::tuple>(orig_weight, bias); } -template std::tuple> PackedConvWeightsQnnp< +template std::tuple> PackedConvWeightsQnnp< 2>::unpack(); -template std::tuple> PackedConvWeightsQnnp< +template std::tuple> PackedConvWeightsQnnp< 3>::unpack(); #endif // USE_PYTORCH_QNNPACK #if AT_MKLDNN_ENABLED() template -std::tuple> PackedConvWeightsOnednn< +std::tuple> PackedConvWeightsOnednn< kSpatialDim>::unpack() { - return std::tuple>( + return std::tuple>( orig_weight_.clone(), orig_bias_); } -template std::tuple> PackedConvWeightsOnednn< +template std::tuple> PackedConvWeightsOnednn< 2>::unpack(); -template std::tuple> PackedConvWeightsOnednn< +template std::tuple> PackedConvWeightsOnednn< 3>::unpack(); #endif // #if AT_MKLDNN_ENABLED() diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp index 7e5083057a0ba..8b3f9b8afc8d2 100644 --- a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp +++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp @@ -38,8 +38,8 @@ at::Tensor& embedding_lookup_fallback_impl( const at::Tensor& weight, const at::Tensor& indices, const at::Tensor& offsets, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, at::Tensor& output, const int64_t block_size, const int64_t output_size, @@ -227,8 +227,8 @@ at::Tensor& embedding_bag_nbit_impl( const at::Tensor& indices, const at::Tensor& offsets, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset, bool is_embedding_op) { TORCH_CHECK(weight.dim() == 2); @@ -399,8 +399,8 @@ at::Tensor& embedding_bag_byte_impl( const at::Tensor& indices, const at::Tensor& offsets, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset, bool is_embedding_op) { TORCH_CHECK(weight.scalar_type() == at::kByte); @@ -558,10 +558,10 @@ at::Tensor& embedding_bag_byte_helper( at::Tensor& output, const at::Tensor& weight, const at::Tensor& indices, - const c10::optional& offsets_in, + const std::optional& offsets_in, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset, bool is_embedding_op) { c10::MaybeOwned offsets; @@ -656,10 +656,10 @@ at::Tensor& _embedding_bag_nbit_helper( const at::Tensor& weight, const int bit_width, const at::Tensor& indices, - const c10::optional& offsets_in, + const std::optional& offsets_in, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset, bool is_embedding_op) { c10::MaybeOwned offsets; @@ -760,10 +760,10 @@ at::Tensor& _embedding_bag_nbit_helper( at::Tensor 
PackedEmbeddingBagWeight::embeddingbag_byte( const at::Tensor& indices, - const c10::optional& offsets_in, + const std::optional& offsets_in, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset, bool is_embedding_op) { auto output = at::empty({0}, packed_w.options().dtype(at::kFloat)); @@ -781,10 +781,10 @@ at::Tensor PackedEmbeddingBagWeight::embeddingbag_byte( at::Tensor PackedEmbeddingBagWeight::embeddingbag_4bit( const at::Tensor& indices, - const c10::optional& offsets_in, + const std::optional& offsets_in, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset, bool is_embedding_op) { if (per_sample_weights_.has_value()) { @@ -819,12 +819,12 @@ Tensor& embedding_bag_byte_rowwise_offsets_out( Tensor& output, const Tensor& weight, const Tensor& indices, - const c10::optional& offsets_in, + const std::optional& offsets_in, const bool /* scale_grad_by_freq */, const int64_t /* mode */, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset) { return embedding_bag_byte_helper( output, @@ -842,12 +842,12 @@ Tensor& embedding_bag_4bit_rowwise_offsets_out( Tensor& output, const Tensor& weight, const Tensor& indices, - const c10::optional& offsets_in, + const std::optional& offsets_in, const bool /* scale_grad_by_freq */, const int64_t /* mode */, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset) { if (per_sample_weights_.has_value()) { @@ -877,12 +877,12 @@ static Tensor& embedding_bag_2bit_rowwise_offsets_out( Tensor& output, const Tensor& weight, const Tensor& indices, - const c10::optional& offsets_in, + const std::optional& offsets_in, const bool /* scale_grad_by_freq */, const int64_t /* mode */, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset) { if (per_sample_weights_.has_value()) { @@ -921,12 +921,12 @@ inline at::Tensor create_empty_from( Tensor embedding_bag_byte_rowwise_offsets( const Tensor& weight, const Tensor& indices, - const c10::optional& offsets_in, + const std::optional& offsets_in, const bool /* scale_grad_by_freq */, const int64_t /* mode */, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset) { auto output = create_empty_from(weight, at::kFloat); embedding_bag_byte_rowwise_offsets_out( @@ -946,12 +946,12 @@ Tensor embedding_bag_byte_rowwise_offsets( Tensor embedding_bag_4bit_rowwise_offsets( const Tensor& weight, const Tensor& indices, - const c10::optional& offsets_in, + const std::optional& offsets_in, const bool /* scale_grad_by_freq */, const int64_t /* mode */, bool 
pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset) { auto output = create_empty_from(weight, at::kFloat); embedding_bag_4bit_rowwise_offsets_out( @@ -971,12 +971,12 @@ Tensor embedding_bag_4bit_rowwise_offsets( Tensor embedding_bag_2bit_rowwise_offsets( const Tensor& weight, const Tensor& indices, - const c10::optional& offsets_in, + const std::optional& offsets_in, const bool /* scale_grad_by_freq */, const int64_t /* mode */, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset) { auto output = create_empty_from(weight, at::kFloat); embedding_bag_2bit_rowwise_offsets_out( @@ -996,12 +996,12 @@ Tensor embedding_bag_2bit_rowwise_offsets( Tensor embedding_bag_byte_rowwise_offsets_meta( const Tensor& weight, const Tensor& indices, - const c10::optional& offsets_in, + const std::optional& offsets_in, const bool /* scale_grad_by_freq */, const int64_t /* mode */, bool /* pruned_weights */, - const c10::optional& /* per_sample_weights_ */, - const c10::optional& /* compressed_indices_mapping */, + const std::optional& /* per_sample_weights_ */, + const std::optional& /* compressed_indices_mapping */, bool include_last_offset) { TORCH_CHECK( indices.dim() == 1 || indices.dim() == 2, @@ -1038,12 +1038,12 @@ class QEmbeddingBag final { static at::Tensor run( const c10::intrusive_ptr& packed_weight, const Tensor& indices, - const c10::optional& offsets, + const std::optional& offsets, const bool /* scale_grad_by_freq */, const int64_t /* mode */, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset) { if (bit_rate == 8) { return packed_weight->embeddingbag_byte( diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag.h b/aten/src/ATen/native/quantized/cpu/qembeddingbag.h index 86ed0f530f9c3..644d85fa357ee 100644 --- a/aten/src/ATen/native/quantized/cpu/qembeddingbag.h +++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag.h @@ -8,24 +8,24 @@ Tensor& embedding_bag_byte_rowwise_offsets_out( Tensor& output, const Tensor& weight, const Tensor& indices, - const c10::optional& offsets_in, + const std::optional& offsets_in, const bool /* scale_grad_by_freq */, const int64_t /* mode */, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset); Tensor& embedding_bag_4bit_rowwise_offsets_out( Tensor& output, const Tensor& weight, const Tensor& indices, - const c10::optional& offsets_in, + const std::optional& offsets_in, const bool /* scale_grad_by_freq */, const int64_t /* mode */, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset); Tensor& qembeddingbag_byte_unpack_out(Tensor& output, const Tensor& packed_weight); diff --git a/aten/src/ATen/native/quantized/cpu/qlinear.cpp 
b/aten/src/ATen/native/quantized/cpu/qlinear.cpp index df6df3c35201d..1c180173aab53 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear.cpp @@ -917,17 +917,17 @@ static at::Tensor linear_int8_with_onednn_weight( at::Tensor onednn_weight, // int8 tensor from MkldnnCPU at::Tensor weight_scales, at::Tensor weight_zero_points, - c10::optional bias, // plain tensor + std::optional bias, // plain tensor double output_scale, int64_t output_zero_point, - c10::optional output_dtype, - c10::optional other, // extra input for binary post-op + std::optional output_dtype, + std::optional other, // extra input for binary post-op double other_scale, int64_t other_zero_point, const c10::string_view& binary_post_op, // e.g. "none", "sum", "add" double binary_alpha, const c10::string_view& unary_post_op, // e.g. "none", "relu" - torch::List>& unary_post_op_args, + torch::List>& unary_post_op_args, c10::string_view& unary_post_op_algorithm) { using ideep::tensor; const int64_t dim = input.dim(); @@ -989,7 +989,7 @@ static at::Tensor linear_int8_with_onednn_weight( auto output_size = input.sizes().vec(); output_size[dim - 1] = N; - c10::optional onednn_bias{c10::nullopt}; + std::optional onednn_bias{c10::nullopt}; bool with_bias = bias.has_value(); at::Tensor bias_val_float; if (with_bias) { @@ -1194,15 +1194,15 @@ class QLinearOnednn final { Tensor onednn_weight, // int8 tensor from MkldnnCPU Tensor weight_scales, Tensor weight_zero_points, - c10::optional bias, + std::optional bias, double output_scale, int64_t output_zero_point, - c10::optional output_dtype, + std::optional output_dtype, c10::string_view post_op_name, - torch::List> post_op_args, + torch::List> post_op_args, c10::string_view post_op_algorithm) { #if AT_MKLDNN_ENABLED() - static c10::optional other = c10::nullopt; + static std::optional other = c10::nullopt; static const c10::string_view binary_post_op = "none"; return linear_int8_with_onednn_weight( act, act_scale, act_zero_point, @@ -1223,17 +1223,17 @@ class QLinearOnednn final { Tensor onednn_weight, // int8 tensor from MkldnnCPU Tensor weight_scales, Tensor weight_zero_points, - c10::optional bias, + std::optional bias, double output_scale, int64_t output_zero_point, - c10::optional output_dtype, + std::optional output_dtype, c10::string_view post_op_name, - torch::List> post_op_args, + torch::List> post_op_args, c10::string_view post_op_algorithm) { #if AT_MKLDNN_ENABLED() TORCH_CHECK(act_scale.numel() == 1 && act_zero_point.numel() == 1, "onednn int8 linear: act scale/zp size should be 1"); - static c10::optional other = c10::nullopt; + static std::optional other = c10::nullopt; static const c10::string_view binary_post_op = "none"; return linear_int8_with_onednn_weight( act, act_scale.item().toDouble(), act_zero_point.item().toLong(), @@ -1254,17 +1254,17 @@ class QLinearOnednn final { Tensor onednn_weight, // int8 tensor from MkldnnCPU Tensor weight_scales, Tensor weight_zero_points, - c10::optional bias, + std::optional bias, double output_scale, int64_t output_zero_point, - c10::optional output_dtype, - c10::optional other, // extra input for binary post-op + std::optional output_dtype, + std::optional other, // extra input for binary post-op double other_scale, int64_t other_zero_point, c10::string_view binary_post_op, // e.g. "none", "sum", "add" double binary_alpha, c10::string_view unary_post_op, // e.g. 
"none", "relu" - torch::List> unary_post_op_args, + torch::List> unary_post_op_args, c10::string_view unary_post_op_algorithm) { #if AT_MKLDNN_ENABLED() return linear_int8_with_onednn_weight( @@ -1286,17 +1286,17 @@ class QLinearOnednn final { Tensor onednn_weight, // int8 tensor from MkldnnCPU Tensor weight_scales, Tensor weight_zero_points, - c10::optional bias, + std::optional bias, double output_scale, int64_t output_zero_point, - c10::optional output_dtype, - c10::optional other, // extra input for binary post-op + std::optional output_dtype, + std::optional other, // extra input for binary post-op double other_scale, int64_t other_zero_point, c10::string_view binary_post_op, // e.g. "none", "sum", "add" double binary_alpha, c10::string_view unary_post_op, // e.g. "none", "relu" - torch::List> unary_post_op_args, + torch::List> unary_post_op_args, c10::string_view unary_post_op_algorithm) { #if AT_MKLDNN_ENABLED() TORCH_CHECK(act_scale.numel() == 1 && act_zero_point.numel() == 1, diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp index 935ad081bd908..111990ad4e277 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp @@ -483,7 +483,7 @@ at::Tensor& PackedLinearWeightFp16::apply_dynamic_relu_out( return apply_dynamic_impl(input, output); } -void PackedLinearWeightFp16::set_bias(c10::optional bias) { +void PackedLinearWeightFp16::set_bias(std::optional bias) { bias_ = std::move(bias); } diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp index a2fb34f90b289..d8427076b5afd 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp @@ -58,7 +58,7 @@ void calc_col_offsets_transpose( c10::intrusive_ptr PackedLinearWeight::prepack( at::Tensor weight, - c10::optional bias) { + std::optional bias) { TORCH_CHECK( weight.dim() == 2, "The weight tensor for quantized::linear_prepack (fbgemm) should" @@ -102,7 +102,7 @@ c10::intrusive_ptr PackedLinearWeight::prepack( /*col_offsets=*/col_offsets.data(), /*qtype=*/qtype); - c10::optional bias_contig; + std::optional bias_contig; if (bias.has_value()) { at::Tensor bias_vec = bias.value(); TORCH_CHECK(bias_vec.dim() == 1, "bias should be a vector (1D Tensor)"); @@ -132,7 +132,7 @@ c10::intrusive_ptr PackedLinearWeight::prepack( #ifdef USE_PYTORCH_QNNPACK c10::intrusive_ptr PackedLinearWeightsQnnp::prepack( at::Tensor weight, - c10::optional bias_in) { + std::optional bias_in) { TORCH_CHECK( weight.dim() == 2, "quantized::linear_prepack (qnnpack): Weight tensor rank should be == 2"); @@ -181,7 +181,7 @@ c10::intrusive_ptr PackedLinearWeightsQnnp::prepack( c10::intrusive_ptr PackedLinearWeightFp16::prepack( at::Tensor weight, - c10::optional bias) { + std::optional bias) { weight = at::_saturate_weight_to_fp16(weight); @@ -208,7 +208,7 @@ c10::intrusive_ptr PackedLinearWeightFp16::prepack( #if AT_MKLDNN_ENABLED() c10::intrusive_ptr PackedLinearWeightsOnednn::prepack( at::Tensor weight, - c10::optional bias) { + std::optional bias) { TORCH_CHECK( weight.dim() == 2, "The weight tensor for quantized::linear_prepack (onednn) should" @@ -257,7 +257,7 @@ c10::intrusive_ptr PackedLinearWeightsOnednn::prepack( packed_weight_p->set_zero_point(wgt_zero_points); std::unique_ptr weight_ptr(packed_weight_p); // Bias - c10::optional onednn_bias{c10::nullopt}; + std::optional 
onednn_bias{c10::nullopt}; if (bias.has_value()) { auto& b = bias.value(); auto bias_size = b.sizes().vec(); @@ -270,7 +270,7 @@ c10::intrusive_ptr PackedLinearWeightsOnednn::prepack( auto bias_desc = ideep::tensor::desc(bias_size, dnnl::memory::data_type::f32); ideep::tensor packed_bias; packed_bias.init(bias_desc, b.data_ptr()); - onednn_bias = c10::optional(packed_bias); + onednn_bias = std::optional(packed_bias); } auto ret_ptr = c10::make_intrusive( PackedLinearWeightsOnednn{ @@ -283,7 +283,7 @@ c10::intrusive_ptr PackedLinearWeightsOnednn::prepack( inline at::Tensor pack_weight_to_onednn_tensor( const at::Tensor& weight, - c10::optional>& input_shape) { + std::optional>& input_shape) { std::vector w_dims = weight.sizes().vec(); ideep::tensor wei = ideep::tensor({w_dims, dnnl::memory::data_type::s8}, weight.data_ptr()); wei.transpose_(0, 1); // oneDNN requires transposed weight @@ -319,7 +319,7 @@ class QLinearPackWeightInt8 final { public: static c10::intrusive_ptr run( at::Tensor weight, - c10::optional bias) { + std::optional bias) { auto& ctx = at::globalContext(); #ifdef USE_FBGEMM @@ -350,7 +350,7 @@ class QLinearPackWeightFp16 final { public: static c10::intrusive_ptr run( at::Tensor weight, - c10::optional bias) { + std::optional bias) { auto& ctx = at::globalContext(); #ifdef USE_FBGEMM // temporarily convert weight back to fp32, needs to be fixed @@ -387,7 +387,7 @@ class QLinearPackWeightFp16 final { class QLinearPackWeightInt8Legacy final { public: - static Tensor run(at::Tensor weight, c10::optional bias) { + static Tensor run(at::Tensor weight, std::optional bias) { TORCH_CHECK(false, "This model uses an outdated version of quantized.linear_prepack. " "Please re-export your model using the newer definitions in torch.jit.quantized"); @@ -396,7 +396,7 @@ class QLinearPackWeightInt8Legacy final { class QLinearPackWeightFp16Legacy final { public: - static Tensor run(at::Tensor weight, c10::optional bias) { + static Tensor run(at::Tensor weight, std::optional bias) { TORCH_CHECK(false, "This model uses an outdated version of quantized.linear_prepack_fp16. 
" "Please re-export your model using the newer definitions in torch.jit.quantized"); @@ -407,7 +407,7 @@ class QLinearPackWeightInt8Onednn final { public: static at::Tensor run( at::Tensor weight, // Not QTensor - c10::optional> input_shape) { + std::optional> input_shape) { #if AT_MKLDNN_ENABLED() return pack_weight_to_onednn_tensor(weight, input_shape); #else diff --git a/aten/src/ATen/native/quantized/cpu/qnormalization.cpp b/aten/src/ATen/native/quantized/cpu/qnormalization.cpp index b803bdd8aff7a..9de75e80bc4df 100644 --- a/aten/src/ATen/native/quantized/cpu/qnormalization.cpp +++ b/aten/src/ATen/native/quantized/cpu/qnormalization.cpp @@ -135,8 +135,8 @@ TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { m.impl(TORCH_SELECTIVE_NAME("quantized::layer_norm"), []( Tensor input, std::vector normalized_shape, // because IntArrayRef doesn't work - c10::optional weight, - c10::optional bias, + std::optional weight, + std::optional bias, double eps, double output_scale, int64_t output_zero_point) { @@ -149,8 +149,8 @@ TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { m.impl(TORCH_SELECTIVE_NAME("quantized::group_norm"), []( Tensor qx, int64_t num_groups, - c10::optional weight, - c10::optional bias, + std::optional weight, + std::optional bias, double eps, double output_scale, int64_t output_zero_point) { @@ -162,8 +162,8 @@ TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { }); m.impl(TORCH_SELECTIVE_NAME("quantized::instance_norm"), []( Tensor qx, - c10::optional weight, - c10::optional bias, + std::optional weight, + std::optional bias, double eps, double output_scale, int64_t output_zero_point) { diff --git a/aten/src/ATen/native/quantized/cpu/qsoftmax.cpp b/aten/src/ATen/native/quantized/cpu/qsoftmax.cpp index a2d3ed6305fc3..0d764aee90d09 100644 --- a/aten/src/ATen/native/quantized/cpu/qsoftmax.cpp +++ b/aten/src/ATen/native/quantized/cpu/qsoftmax.cpp @@ -44,8 +44,8 @@ Tensor qsoftmax_qnnpack(const Tensor& qx, const int64_t dim) { */ const int64_t last_dim = qx.dim() - 1; - c10::optional> permuted_dims = c10::nullopt; - c10::optional qx_contig = c10::nullopt; + std::optional> permuted_dims = c10::nullopt; + std::optional qx_contig = c10::nullopt; const at::Tensor* qx_contig_ptr = nullptr; if (qx.stride(dim) == 1) { diff --git a/aten/src/ATen/native/quantized/cuda/EmbeddingBag.cu b/aten/src/ATen/native/quantized/cuda/EmbeddingBag.cu index 3574bfe28f505..de3f1032dbcae 100644 --- a/aten/src/ATen/native/quantized/cuda/EmbeddingBag.cu +++ b/aten/src/ATen/native/quantized/cuda/EmbeddingBag.cu @@ -90,7 +90,7 @@ __global__ void embedding_bag_nbits_rowwise_offsets_kernel( const PackedTensorAccessor32 offsets, const bool /* pruned_weights */, const PackedTensorAccessor32 per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& compressed_indices_mapping, const bool include_last_offset, PackedTensorAccessor32 output) { static_assert(bits_per_dim == 4 || bits_per_dim == 8, "the current embedding_bag_nbits_rowwise_offsets_kernel only has been tested for 4 and 8 bits per dim"); @@ -192,8 +192,8 @@ at::Tensor& embedding_bag_byte_impl( const at::Tensor& indices, const at::Tensor& offsets, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset, bool is_embedding_op) { TORCH_CHECK(weight.is_cuda()); @@ -267,12 +267,12 @@ at::Tensor& embedding_bag_byte_impl( Tensor 
embedding_bag_byte_rowwise_offsets( const Tensor& weight, const Tensor& indices, - const c10::optional& offsets_in, + const std::optional& offsets_in, const bool /* scale_grad_by_freq */, const int64_t /* mode */, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset) { bool is_embedding_op = false; auto output = create_empty_from(weight, at::kFloat); @@ -375,8 +375,8 @@ at::Tensor& embedding_bag_4bit_impl( const at::Tensor& indices, const at::Tensor& offsets, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset) { TORCH_CHECK(weight.is_cuda()); TORCH_CHECK(indices.is_cuda()); @@ -449,12 +449,12 @@ at::Tensor& embedding_bag_4bit_impl( Tensor embedding_bag_4bit_rowwise_offsets( const Tensor& weight, const Tensor& indices, - const c10::optional& offsets_in, + const std::optional& offsets_in, const bool /* scale_grad_by_freq */, const int64_t /* mode */, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset) { auto output = create_empty_from(weight, at::kFloat); diff --git a/aten/src/ATen/native/quantized/cudnn/BinaryOps.cpp b/aten/src/ATen/native/quantized/cudnn/BinaryOps.cpp index a225a86eeb903..07ccc19c48282 100644 --- a/aten/src/ATen/native/quantized/cudnn/BinaryOps.cpp +++ b/aten/src/ATen/native/quantized/cudnn/BinaryOps.cpp @@ -186,7 +186,7 @@ Tensor add(Tensor qa, Tensor qb, double output_scale, int64_t output_zero_point) // relu_op computes // relu( (qa_int8 + qb_int8 * ( qb_scale/qa_scale ) ) ) // output is a fp32 tensor - c10::optional relu_op; + std::optional relu_op; if (kReluFused) { // we use inplace operation here where the output is assigned to the input relu_op.emplace(cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) diff --git a/aten/src/ATen/native/quantized/cudnn/Conv.cpp b/aten/src/ATen/native/quantized/cudnn/Conv.cpp index bb97a69859cb4..606d769fe6eb4 100644 --- a/aten/src/ATen/native/quantized/cudnn/Conv.cpp +++ b/aten/src/ATen/native/quantized/cudnn/Conv.cpp @@ -70,8 +70,8 @@ void PackedConvWeightCudnn::apply_impl_helper(const at::Tensor& qua auto requantize_multiplier = act_scale * weight_scale / output_scale; at::Tensor requantize_multiplier_tensor = cudnn_utils::getRequantMultiplierTensor(requantize_multiplier, kSpatialDim + 2); - c10::optional bias_multiplier_tensor; - c10::optional broadcasted_bias; + std::optional bias_multiplier_tensor; + std::optional broadcasted_bias; if (bias_.has_value()) { // the input bias is a 1-D tensor whose size is the same as the size of the second dimension of quantized_output. // we need to add trailing dimensions in order to properly broadcast bias, otherwise broadcast_to will fail. 
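The hunks above are a mechanical rename of c10::optional/c10::nullopt to their standard-library spellings in signatures such as the optional bias, per_sample_weights_, and compressed_indices_mapping arguments. A minimal, self-contained sketch of that optional-argument calling pattern follows; FakeTensor here is a stand-in struct for illustration only, not ATen's at::Tensor.

// Sketch of the optional-argument pattern used throughout these hunks.
// FakeTensor is a placeholder type so the example compiles on its own.
#include <iostream>
#include <optional>

struct FakeTensor {
  double value = 0.0;
};

// Previously spelled c10::optional<FakeTensor>; std::optional is a drop-in
// replacement with the same has_value()/value()/operator-> API.
FakeTensor linear_like(const FakeTensor& input,
                       const FakeTensor& weight,
                       std::optional<FakeTensor> bias = std::nullopt) {
  FakeTensor out{input.value * weight.value};
  if (bias.has_value()) {  // same check the quantized linear kernels perform
    out.value += bias->value;
  }
  return out;
}

int main() {
  FakeTensor x{2.0}, w{3.0};
  std::cout << linear_like(x, w).value << "\n";                   // 6, no bias
  std::cout << linear_like(x, w, FakeTensor{1.0}).value << "\n";  // 7, with bias
  return 0;
}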
@@ -154,12 +154,12 @@ void PackedConvWeightCudnn::apply_impl_helper(const at::Tensor& qua .build(); // std::cout << "operator:" << conv_op.describe() << std::endl; - c10::optional bias_mult_op; - c10::optional sum_conv_bias_op; + std::optional bias_mult_op; + std::optional sum_conv_bias_op; if (bias_.has_value()) { // we can't directly assign bias_mult_op because operator= is deleted for cudnn_frontend::Operation; // alternatively, I think we can use std::unique_ptr and dynamically allocate these builder ops - // but here, we chose to do it statically. c10::optional::emplace() enables this approach + // but here, we chose to do it statically. std::optional::emplace() enables this approach // bias_mult_op computes bias_fp32 / (act_scale * w_scale) or bias_fp32 * (1 / (act_scale * w_scale)) // where bias_multiplier = (1 / (act_scale * w_scale)) @@ -188,7 +188,7 @@ void PackedConvWeightCudnn::apply_impl_helper(const at::Tensor& qua // relu_op computes relu(act_int8 * w_int8 + [bias_fp32/(act_scale * w_scale)] // or relu(act_int8 * w_int8) if bias is not present. // output is a fp32 tensor - c10::optional relu_op; + std::optional relu_op; std::shared_ptr tensor2requant_ptr = bias_.has_value() ? sum_conv_bias_op.value().getOutputTensor() : conv_op.getOutputTensor(); if (kReluFused) { // we use inplace operation here where the output is assigned to the input diff --git a/aten/src/ATen/native/quantized/cudnn/ConvPrepack.cpp b/aten/src/ATen/native/quantized/cudnn/ConvPrepack.cpp index 44d37f27bf6f6..b1bd94ee7a55c 100644 --- a/aten/src/ATen/native/quantized/cudnn/ConvPrepack.cpp +++ b/aten/src/ATen/native/quantized/cudnn/ConvPrepack.cpp @@ -27,7 +27,7 @@ c10::intrusive_ptr> PackedConvWeightCudnn< kSpatialDim>:: prepack( at::Tensor weight, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List output_padding, @@ -116,7 +116,7 @@ c10::intrusive_ptr> PackedConvWeightCudnn< 2>:: prepack( at::Tensor weight, - c10::optional bias_in, + std::optional bias_in, torch::List stride, torch::List padding, torch::List output_padding, @@ -133,7 +133,7 @@ class QConvPackWeightInt8Cudnn final { public: static c10::intrusive_ptr> run_conv( Tensor weight, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List dilation, @@ -150,7 +150,7 @@ class QConvPackWeightInt8Cudnn final { private: static c10::intrusive_ptr> _run( Tensor weight, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List output_padding, @@ -167,7 +167,7 @@ class QConv1dPackWeightInt8Cudnn final { public: static c10::intrusive_ptr> run_conv( Tensor weight, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List dilation, @@ -180,7 +180,7 @@ class QConv1dPackWeightInt8Cudnn final { private: static c10::intrusive_ptr> _run( Tensor weight, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List output_padding, diff --git a/aten/src/ATen/native/quantized/cudnn/ConvUnpackImpl.cpp b/aten/src/ATen/native/quantized/cudnn/ConvUnpackImpl.cpp index ce5ee36cad4f0..fbb4a1fe94111 100644 --- a/aten/src/ATen/native/quantized/cudnn/ConvUnpackImpl.cpp +++ b/aten/src/ATen/native/quantized/cudnn/ConvUnpackImpl.cpp @@ -11,12 +11,12 @@ #include template -std::tuple> PackedConvWeightCudnn< +std::tuple> PackedConvWeightCudnn< kSpatialDim>::unpack() { - return std::tuple>{maybe_padded_weight_, bias_}; + return std::tuple>{maybe_padded_weight_, bias_}; } -template 
std::tuple> PackedConvWeightCudnn< +template std::tuple> PackedConvWeightCudnn< 2>::unpack(); #endif // AT_CUDNN_ENABLED diff --git a/aten/src/ATen/native/quantized/cudnn/Linear.cpp b/aten/src/ATen/native/quantized/cudnn/Linear.cpp index f9333d6fbed7a..d3219592e25bb 100644 --- a/aten/src/ATen/native/quantized/cudnn/Linear.cpp +++ b/aten/src/ATen/native/quantized/cudnn/Linear.cpp @@ -98,8 +98,8 @@ void PackedLinearWeightCudnn::apply_impl_helper(const at::Tensor& quantized_outp auto weight_scale = orig_weight.q_scale(); auto requantize_multiplier = act_scale * weight_scale / output_scale; at::Tensor requantize_multiplier_tensor = cudnn_utils::getRequantMultiplierTensor(requantize_multiplier, quantized_output.dim()); - c10::optional bias_multiplier_tensor; - c10::optional broadcasted_bias; + std::optional bias_multiplier_tensor; + std::optional broadcasted_bias; if (bias_.has_value()) { // the input bias is a 1-D tensor whose size is the same as the size of the last dimension of quantized_output // we need to add trailing dimensions in order to properly broadcast bias, otherwise broadcast_to will fail. @@ -183,12 +183,12 @@ void PackedLinearWeightCudnn::apply_impl_helper(const at::Tensor& quantized_outp .build(); // std::cout << "operator:" << linear_op.describe() << std::endl; - c10::optional bias_mult_op; - c10::optional sum_linear_bias_op; + std::optional bias_mult_op; + std::optional sum_linear_bias_op; if (bias_.has_value()) { // we can't directly assign bias_mult_op because operator= is deleted for cudnn_frontend::Operation; // alternatively, I think we can use std::unique_ptr and dynamically allocate these builder ops - // but here, we chose to do it statically. c10::optional::emplace() enables this approach + // but here, we chose to do it statically. std::optional::emplace() enables this approach // bias_mult_op computes bias_fp32 / (act_scale * w_scale) or bias_fp32 * (1 / (act_scale * w_scale)) // where bias_multiplier = (1 / (act_scale * w_scale)) @@ -222,7 +222,7 @@ void PackedLinearWeightCudnn::apply_impl_helper(const at::Tensor& quantized_outp // relu_op computes relu(act_int8 * w_int8 + [bias_fp32/(act_scale * w_scale)] // or relu(act_int8 * w_int8) if bias is not present. // output is a fp32 tensor - c10::optional relu_op; + std::optional relu_op; std::shared_ptr tensor2requant_ptr = bias_.has_value() ? 
sum_linear_bias_op.value().getOutputTensor() : linear_op.getOutputTensor(); if (kReluFused) { // we use inplace operation here where the output is assigned to the input diff --git a/aten/src/ATen/native/quantized/cudnn/LinearPrepack.cpp b/aten/src/ATen/native/quantized/cudnn/LinearPrepack.cpp index abbb5922f3933..fd7c870e006d1 100644 --- a/aten/src/ATen/native/quantized/cudnn/LinearPrepack.cpp +++ b/aten/src/ATen/native/quantized/cudnn/LinearPrepack.cpp @@ -16,7 +16,7 @@ int register_linear_params(); c10::intrusive_ptr PackedLinearWeightCudnn::prepack( at::Tensor weight, - c10::optional bias) { + std::optional bias) { TORCH_CHECK(weight.qscheme() == c10::kPerTensorAffine, "Unsupported qscheme: ", toString(weight.qscheme())); const int output_channels = weight.size(0); const auto qtype = weight.qscheme(); @@ -42,7 +42,7 @@ class QLinearPackWeightInt8Cudnn final { public: static c10::intrusive_ptr run( at::Tensor weight, - c10::optional bias) { + std::optional bias) { return PackedLinearWeightCudnn::prepack(std::move(weight), std::move(bias)); } }; diff --git a/aten/src/ATen/native/quantized/cudnn/LinearUnpackImpl.cpp b/aten/src/ATen/native/quantized/cudnn/LinearUnpackImpl.cpp index 7200872480efd..40088052cd151 100644 --- a/aten/src/ATen/native/quantized/cudnn/LinearUnpackImpl.cpp +++ b/aten/src/ATen/native/quantized/cudnn/LinearUnpackImpl.cpp @@ -10,8 +10,8 @@ #include -std::tuple> PackedLinearWeightCudnn::unpack() { - return std::tuple>{orig_weight, bias_}; +std::tuple> PackedLinearWeightCudnn::unpack() { + return std::tuple>{orig_weight, bias_}; } #endif // AT_CUDNN_ENABLED diff --git a/aten/src/ATen/native/quantized/cudnn/utils.h b/aten/src/ATen/native/quantized/cudnn/utils.h index 18c891fcaa1c0..fbd10e2ec95e7 100644 --- a/aten/src/ATen/native/quantized/cudnn/utils.h +++ b/aten/src/ATen/native/quantized/cudnn/utils.h @@ -27,7 +27,7 @@ C10_DIAGNOSTIC_POP() struct PackedLinearWeightCudnn : public LinearPackedParamsBase { PackedLinearWeightCudnn( at::Tensor orig_weight, - c10::optional bias, + std::optional bias, c10::QScheme q_scheme) : orig_weight(std::move(orig_weight)), bias_(std::move(bias)), @@ -53,19 +53,19 @@ struct PackedLinearWeightCudnn : public LinearPackedParamsBase { "parameter type"); } - std::tuple> unpack() override; + std::tuple> unpack() override; - c10::optional bias() override { + std::optional bias() override { return bias_; } static c10::intrusive_ptr prepack( at::Tensor weight, - c10::optional bias); + std::optional bias); private: at::Tensor orig_weight; - c10::optional bias_; + std::optional bias_; c10::QScheme q_scheme; template @@ -85,7 +85,7 @@ template struct PackedConvWeightCudnn : public ConvPackedParamsBase { PackedConvWeightCudnn( at::Tensor orig_weight, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List output_padding, @@ -127,11 +127,11 @@ struct PackedConvWeightCudnn : public ConvPackedParamsBase { TORCH_CHECK(false, "apply_dynamic_relu is currently not reported"); } - std::tuple> unpack() override; + std::tuple> unpack() override; static c10::intrusive_ptr> prepack( at::Tensor weight, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List output_padding, @@ -171,7 +171,7 @@ struct PackedConvWeightCudnn : public ConvPackedParamsBase { // convention "maybe"_padded_weight. 
// TODO: when and if cudnn enables padding in their operators, we can remove padding on our end and rename this to orig_weight_ at::Tensor maybe_padded_weight_; - c10::optional bias_; + std::optional bias_; torch::List stride_; torch::List padding_; torch::List output_padding_; diff --git a/aten/src/ATen/native/quantized/qconv_unpack.cpp b/aten/src/ATen/native/quantized/qconv_unpack.cpp index fe4007c712ce5..1fdc7745cfa2e 100644 --- a/aten/src/ATen/native/quantized/qconv_unpack.cpp +++ b/aten/src/ATen/native/quantized/qconv_unpack.cpp @@ -49,7 +49,7 @@ namespace { template class QConvUnpackWeightsInt8 final { public: - static std::tuple> run( + static std::tuple> run( const c10::intrusive_ptr>& packed_weight) { auto& ctx = at::globalContext(); @@ -85,17 +85,17 @@ class QConvUnpackWeightsInt8 final { class QConv1dUnpackWeightsInt8 final { public: - static std::tuple> run( + static std::tuple> run( const c10::intrusive_ptr>& packed_weight) { auto& ctx = at::globalContext(); at::Tensor weight; - c10::optional bias; + std::optional bias; #ifdef USE_FBGEMM if (ctx.qEngine() == at::QEngine::FBGEMM || ctx.qEngine() == at::QEngine::X86) { std::tie(weight, bias) = packed_weight->unpack(); weight = weight.squeeze_(quant_utils::kConv1dSqueezeDim + 2); - return std::tuple>(weight, bias); + return std::tuple>(weight, bias); } #endif @@ -104,7 +104,7 @@ class QConv1dUnpackWeightsInt8 final { std::tie(weight, bias) = packed_weight->unpack(); at::Tensor new_weight = weight.clone(); new_weight = new_weight.squeeze_(quant_utils::kConv1dSqueezeDim + 2); - return std::tuple>(new_weight, bias); + return std::tuple>(new_weight, bias); } #endif @@ -113,7 +113,7 @@ class QConv1dUnpackWeightsInt8 final { std::tie(weight, bias) = packed_weight->unpack(); at::Tensor new_weight = weight.clone(); new_weight.squeeze_(quant_utils::kConv1dSqueezeDim + 2); - return std::tuple>(new_weight, bias); + return std::tuple>(new_weight, bias); } #endif diff --git a/aten/src/ATen/native/quantized/qlinear_unpack.cpp b/aten/src/ATen/native/quantized/qlinear_unpack.cpp index 19c9890c82e38..85eab571df9e0 100644 --- a/aten/src/ATen/native/quantized/qlinear_unpack.cpp +++ b/aten/src/ATen/native/quantized/qlinear_unpack.cpp @@ -21,7 +21,7 @@ namespace { class QLinearUnpackWeightInt8 final { public: - static std::tuple> run( + static std::tuple> run( const c10::intrusive_ptr& packed_weight) { return packed_weight->unpack(); } @@ -29,7 +29,7 @@ class QLinearUnpackWeightInt8 final { class QLinearUnpackWeightFp16 final { public: - static std::tuple> run( + static std::tuple> run( const c10::intrusive_ptr& packed_weight) { auto& ctx = at::globalContext(); @@ -44,7 +44,7 @@ class QLinearUnpackWeightFp16 final { class QLinearUnpackWeightInt8Legacy final { public: - static std::tuple> run( + static std::tuple> run( const at::Tensor& packed_weight) { TORCH_CHECK(false, "quantized.linear_unpack(Tensor) is unsupported! Please " @@ -55,7 +55,7 @@ class QLinearUnpackWeightInt8Legacy final { class QLinearUnpackWeightFp16Legacy final { public: - static std::tuple> run( + static std::tuple> run( const at::Tensor& packed_weight) { TORCH_CHECK(false, "quantized.linear_unpack(Tensor) is unsupported! 
Please " diff --git a/aten/src/ATen/native/sparse/SoftMax.cpp b/aten/src/ATen/native/sparse/SoftMax.cpp index 883c2b9c4ea95..179db48beacca 100644 --- a/aten/src/ATen/native/sparse/SoftMax.cpp +++ b/aten/src/ATen/native/sparse/SoftMax.cpp @@ -615,7 +615,7 @@ static Tensor _sparse_softmax(const Tensor& input_, const int64_t dim_) { return result; } -Tensor _sparse_softmax(const Tensor& input_, const int64_t dim_, c10::optional dtype) { +Tensor _sparse_softmax(const Tensor& input_, const int64_t dim_, std::optional dtype) { auto result = [&]() { NoNamesGuard guard; if (input_.is_cuda() && input_.scalar_type() == ScalarType::Half && dtype == ScalarType::Float){ @@ -642,7 +642,7 @@ static Tensor _sparse_log_softmax(const Tensor& input_, const int64_t dim_) { return result; } -Tensor _sparse_log_softmax(const Tensor& input_, const int64_t dim_, c10::optional dtype) { +Tensor _sparse_log_softmax(const Tensor& input_, const int64_t dim_, std::optional dtype) { auto result = [&]() { NoNamesGuard guard; if (input_.is_cuda() && input_.scalar_type() == ScalarType::Half && dtype == ScalarType::Float){ diff --git a/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h b/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h index 8782031c49aa1..608f5291e607e 100644 --- a/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h +++ b/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h @@ -133,8 +133,8 @@ void _sparse_binary_op_intersection_kernel_impl( const Tensor& x_, const Tensor& y_, const std::vector& broadcasted_shape, - const c10::optional& x_hash_opt_ = c10::nullopt, - const c10::optional& y_hash_opt_ = c10::nullopt, + const std::optional& x_hash_opt_ = c10::nullopt, + const std::optional& y_hash_opt_ = c10::nullopt, const bool accumulate_matches = true, const bool distributive_with_sum = true ) { @@ -148,7 +148,7 @@ void _sparse_binary_op_intersection_kernel_impl( " to output ", res.scalar_type()); using KernelLauncher = KernelLauncher; - using OptTensor = c10::optional; + using OptTensor = std::optional; // If the op and sum are not distributive, coalesce is required. const auto coalesce_if_not_distributive = [distributive_with_sum](const Tensor& t, const OptTensor& t_hash_opt) -> auto { @@ -275,8 +275,11 @@ void _sparse_binary_op_intersection_kernel_impl( KernelLauncher::launch(iter, // NOTE: capture by value required by CUDA [=] FUNCAPI (index_t nnz_idx) -> int64_t { - const auto* RESTRICT ptr_indices_dim = ptr_indices ? ptr_indices + nnz_idx * indices_nnz_stride : nullptr; int64_t hash = 0; + if (!ptr_indices) { + return hash; + } + const auto* RESTRICT ptr_indices_dim = ptr_indices + nnz_idx * indices_nnz_stride; for (int64_t dim = 0; dim < sparse_dim; ++dim) { const auto dim_hash_coeff = hash_coeffs[dim]; const auto dim_index = ptr_indices_dim[dim * indices_dim_stride]; @@ -423,8 +426,8 @@ void _sparse_binary_op_intersection_kernel_out( Tensor& res, const Tensor& x, const Tensor& y, - const c10::optional& x_hash_opt = c10::nullopt, - const c10::optional& y_hash_opt = c10::nullopt, + const std::optional& x_hash_opt = c10::nullopt, + const std::optional& y_hash_opt = c10::nullopt, // If op distributes with the sum, the arguments are processed as is, // without the calls to coalesce(). const bool distributive_with_sum = true @@ -439,7 +442,7 @@ void _sparse_binary_op_intersection_kernel_out( x._indices().scalar_type() == y._indices().scalar_type(), NAME, "(): expects inputs' indices to be of the same dtype (i.e. 
long or int)"); - const auto check_hash_validity = [](const Tensor& t, const c10::optional& t_hash_opt) { + const auto check_hash_validity = [](const Tensor& t, const std::optional& t_hash_opt) { if (!t_hash_opt.has_value()) { return; } diff --git a/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionKernel.cpp b/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionKernel.cpp index 2db8c9e9404cc..94a1e3d622355 100644 --- a/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionKernel.cpp +++ b/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionKernel.cpp @@ -119,7 +119,7 @@ struct CPUValueSelectionIntersectionKernel { } }; -using OptTensor = c10::optional; +using OptTensor = std::optional; void mul_sparse_sparse_out_cpu_kernel( Tensor& result, diff --git a/aten/src/ATen/native/sparse/SparseCsrTensor.cpp b/aten/src/ATen/native/sparse/SparseCsrTensor.cpp index d1973c43e9ad7..59b048f5d147c 100644 --- a/aten/src/ATen/native/sparse/SparseCsrTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseCsrTensor.cpp @@ -363,10 +363,10 @@ Tensor sparse_compressed_tensor_with_dims( c10::IntArrayRef size, c10::IntArrayRef blocksize, ScalarType index_dtype, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // sparse_compressed_tensor_with_dims is a generalization of empty // that enables the specification of nnz, dense_dim, blocksize, and // index_dtype for sparse compressed tensors. @@ -435,10 +435,10 @@ Tensor _sparse_compressed_tensor_unsafe_symint( const Tensor& plain_indices, const Tensor& values, c10::SymIntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { if (!layout) { AT_ERROR("sparse_compressed_tensor_unsafe expected sparse compressed tensor layout but got none"); } @@ -458,10 +458,10 @@ Tensor _sparse_compressed_tensor_unsafe_template(const Tensor& compressed_indice const Tensor& plain_indices, const Tensor& values, IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { Layout layout_ = layout.value_or(required_layout); TORCH_CHECK(layout_ == required_layout, "sparse compressed layout must be ",required_layout, " but got ", layout_); if (at::globalContext().checkSparseTensorInvariants()) { @@ -478,10 +478,10 @@ Tensor _sparse_compressed_tensor_unsafe_template(const Tensor& compressed_indice const Tensor& plain_indices, \ const Tensor& values, \ IntArrayRef size, \ - c10::optional dtype, \ - c10::optional layout, \ - c10::optional device, \ - c10::optional pin_memory) { \ + std::optional dtype, \ + std::optional layout, \ + std::optional device, \ + std::optional pin_memory) { \ return _sparse_compressed_tensor_unsafe_template(compressed_indices, plain_indices, values, size, dtype, layout, device, pin_memory); \ } @@ -554,10 +554,10 @@ Tensor sparse_compressed_tensor( const Tensor& plain_indices, const Tensor& values, IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { if (!layout) { AT_ERROR("sparse_compressed_tensor expected sparse compressed tensor 
layout but got none"); @@ -583,10 +583,10 @@ Tensor sparse_compressed_tensor( const Tensor& compressed_indices, const Tensor& plain_indices, const Tensor& values, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { if (!layout) { AT_ERROR("sparse_compressed_tensor expected sparse compressed tensor layout but got none"); @@ -614,28 +614,28 @@ Tensor sparse_compressed_tensor( Tensor sparse_##KIND##_tensor(const Tensor& compressed_indices, \ const Tensor& plain_indices, \ const Tensor& values, \ - c10::optional dtype, \ - c10::optional layout, \ - c10::optional device, \ - c10::optional pin_memory) { \ + std::optional dtype, \ + std::optional layout, \ + std::optional device, \ + std::optional pin_memory) { \ if (layout) { \ TORCH_CHECK(layout.value() == REQUIRED_LAYOUT, "sparse " # KIND " layout must be ", REQUIRED_LAYOUT, " but got ", layout.value()); \ } \ - c10::optional layout_(REQUIRED_LAYOUT); \ + std::optional layout_(REQUIRED_LAYOUT); \ return at::native::sparse_compressed_tensor(compressed_indices, plain_indices, values, dtype, layout_, device, pin_memory); \ } \ Tensor sparse_##KIND##_tensor(const Tensor& compressed_indices, \ const Tensor& plain_indices, \ const Tensor& values, \ IntArrayRef size, \ - c10::optional dtype, \ - c10::optional layout, \ - c10::optional device, \ - c10::optional pin_memory) { \ + std::optional dtype, \ + std::optional layout, \ + std::optional device, \ + std::optional pin_memory) { \ if (layout) { \ TORCH_CHECK(layout.value() == REQUIRED_LAYOUT, "sparse " # KIND " layout must be ", REQUIRED_LAYOUT, " but got ", layout.value()); \ } \ - c10::optional layout_(REQUIRED_LAYOUT); \ + std::optional layout_(REQUIRED_LAYOUT); \ return at::native::sparse_compressed_tensor(compressed_indices, plain_indices, values, size, dtype, layout_, device, pin_memory); \ } @@ -650,11 +650,11 @@ SPARSE_COMPRESSED_TENSOR(bsc, kSparseBsc) // indices. The implementation below is kept for BC. 
Tensor empty_sparse_compressed( IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional optional_memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional optional_memory_format) { check_size_nonnegative(size); TORCH_CHECK(size.size() >= 2, "torch.empty: Only batched sparse compressed (non-block) tensors are supported, but got size ", size); @@ -699,7 +699,7 @@ Tensor empty_sparse_compressed( const Tensor& resize_sparse_csr_( const Tensor& self, IntArrayRef size, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { check_size_nonnegative(size); TORCH_CHECK(size.size() >= 2, "torch.resize_: Only batched sparse CSR matrices are supported, but got size ", size); TORCH_CHECK( @@ -836,7 +836,7 @@ const SparseCsrTensor& resize_as_sparse_compressed_( SparseCsrTensor clone_sparse_compressed( const SparseCsrTensor& self, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { TORCH_CHECK( !optional_memory_format.has_value(), "unsupported memory format option ", @@ -863,11 +863,11 @@ SparseCsrTensor clone_sparse_compressed( Tensor empty_like_sparse_csr( const Tensor& self, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional optional_memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional optional_memory_format) { TensorOptions options_ = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); TensorOptions options = self.options() diff --git a/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp b/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp index bff9842a2a3ab..ccac30d65a1a7 100644 --- a/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp @@ -326,7 +326,7 @@ Tensor& normal_sparse_csr_( Tensor& self, double mean, double std, - c10::optional gen) { + std::optional gen) { return unary_op_inplace(self, &Tensor::normal_, mean, std, gen); } @@ -1000,7 +1000,7 @@ struct Reduction...Op { inline scalar_t identity() const { return ...; } }; -Tensor _sparse_csr_..._cpu(const Tensor& input, IntArrayRef dims_to_sum, bool keepdim, c10::optional dtype) { +Tensor _sparse_csr_..._cpu(const Tensor& input, IntArrayRef dims_to_sum, bool keepdim, std::optional dtype) { ... result = reduce_sparse_csr_cpu_template(input_, dims_to_sum, keepdim, Reduction...Op()); ... 
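The sparse factory overloads in this file all accept the same group of optional arguments (dtype, layout, device, pin_memory) and resolve them against defaults before building TensorOptions. A hedged, self-contained sketch of that resolution pattern is below; the enums are stand-ins for ATen's ScalarType/Layout types, chosen only to keep the example compilable on its own.

// Sketch of resolving optional factory arguments with value_or(), as the
// sparse_*_tensor overloads do via TensorOptions. Not ATen code.
#include <iostream>
#include <optional>

enum class ScalarType { Float, Double };
enum class Layout { Strided, Sparse };

struct Options {
  ScalarType dtype;
  Layout layout;
  bool pin_memory;
};

Options resolve_options(std::optional<ScalarType> dtype,
                        std::optional<Layout> layout,
                        std::optional<bool> pin_memory) {
  return Options{
      dtype.value_or(ScalarType::Float),  // fall back to a default dtype
      layout.value_or(Layout::Sparse),    // sparse factories default to a sparse layout
      pin_memory.value_or(false)};        // pinning is opt-in
}

int main() {
  // Caller only requests pinned memory; dtype/layout fall back to defaults.
  Options o = resolve_options(std::nullopt, std::nullopt, true);
  std::cout << (o.dtype == ScalarType::Float) << " "
            << (o.layout == Layout::Sparse) << " "
            << o.pin_memory << "\n";  // prints: 1 1 1
  return 0;
}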
@@ -1336,7 +1336,7 @@ struct ReductionMulOp { } // namespace -Tensor _sparse_csr_sum_cpu(const Tensor& input, IntArrayRef dims_to_sum, bool keepdim, c10::optional dtype) { +Tensor _sparse_csr_sum_cpu(const Tensor& input, IntArrayRef dims_to_sum, bool keepdim, std::optional dtype) { ScalarType dtype_ = dtype.value_or(input.scalar_type()); Tensor input_ = at::sparse_csr::to_type(input, dtype_); Tensor result; @@ -1352,7 +1352,7 @@ Tensor _sparse_csr_sum_cpu(const Tensor& input, IntArrayRef dims_to_sum, bool ke return result; } -Tensor _sparse_csr_prod_cpu(const Tensor& input, IntArrayRef dims_to_reduce, bool keepdim, c10::optional dtype) { +Tensor _sparse_csr_prod_cpu(const Tensor& input, IntArrayRef dims_to_reduce, bool keepdim, std::optional dtype) { ScalarType dtype_ = dtype.value_or(input.scalar_type()); Tensor input_ = input.to(dtype_); Tensor result; diff --git a/aten/src/ATen/native/sparse/SparseFactories.cpp b/aten/src/ATen/native/sparse/SparseFactories.cpp index 6ee92320e12d1..38a59b40c808a 100644 --- a/aten/src/ATen/native/sparse/SparseFactories.cpp +++ b/aten/src/ATen/native/sparse/SparseFactories.cpp @@ -22,7 +22,7 @@ Tensor spdiags( const Tensor& diagonals, const Tensor& offsets, IntArrayRef shape, - c10::optional layout) { + std::optional layout) { auto diagonals_2d = diagonals.dim() == 1 ? diagonals.unsqueeze(0) : diagonals; TORCH_CHECK(diagonals_2d.dim() == 2, "Diagonals must be vector or matrix"); TORCH_CHECK(shape.size() == 2, "Output shape must be 2d"); diff --git a/aten/src/ATen/native/sparse/SparseStubs.h b/aten/src/ATen/native/sparse/SparseStubs.h index 2a3aef5c8bd92..af6df0785fe92 100644 --- a/aten/src/ATen/native/sparse/SparseStubs.h +++ b/aten/src/ATen/native/sparse/SparseStubs.h @@ -13,10 +13,10 @@ namespace native { using mul_sparse_sparse_out_fn = void (*)(Tensor& res, const Tensor& x, const Tensor& y); DECLARE_DISPATCH(mul_sparse_sparse_out_fn, mul_sparse_sparse_out_stub); -using sparse_mask_intersection_out_fn = void (*)(Tensor& res, const Tensor& x, const Tensor& y, const c10::optional& x_hash_opt); +using sparse_mask_intersection_out_fn = void (*)(Tensor& res, const Tensor& x, const Tensor& y, const std::optional& x_hash_opt); DECLARE_DISPATCH(sparse_mask_intersection_out_fn, sparse_mask_intersection_out_stub); -using sparse_mask_projection_out_fn = void (*)(Tensor& res, const Tensor& x, const Tensor& y, const c10::optional& x_hash_opt, bool accumulate_matches); +using sparse_mask_projection_out_fn = void (*)(Tensor& res, const Tensor& x, const Tensor& y, const std::optional& x_hash_opt, bool accumulate_matches); DECLARE_DISPATCH(sparse_mask_projection_out_fn, sparse_mask_projection_out_stub); using flatten_indices_fn = Tensor (*)(const Tensor& indices, IntArrayRef size); diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index add7f433731a2..e9f10d964b320 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -143,10 +143,10 @@ Tensor values_default(const Tensor& self) { /*** Helper methods ***/ static SparseTensor new_sparse( - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { AT_ASSERT(layout.has_value() && *layout == kSparse); DispatchKey dispatch_key; switch (device_or_default(device).type()) { @@ -170,10 +170,10 @@ SparseTensor new_with_dims_sparse( int64_t sparse_dim, int64_t dense_dim, ArrayRef 
size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { SparseTensor self = new_sparse(dtype, layout, device, pin_memory); get_sparse_impl(self)->resize_and_clear_(sparse_dim, dense_dim, size); return self; @@ -185,11 +185,11 @@ SparseTensor new_with_dims_and_tensor_sparse_symint( c10::SymIntArrayRef size, const Tensor& indices, const Tensor& values, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional is_coalesced) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional is_coalesced) { SparseTensor self = new_sparse(dtype, layout, device, pin_memory); auto impl = get_sparse_impl(self); impl->resize_(sparse_dim, dense_dim, size); @@ -228,11 +228,11 @@ SparseTensor new_with_dims_and_tensor_sparse_symint( /** Empty init **/ Tensor empty_sparse( IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional optional_memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional optional_memory_format) { TORCH_CHECK( !pin_memory.has_value() || !*pin_memory, "Only dense CPU tensors can be pinned"); @@ -242,10 +242,10 @@ Tensor empty_sparse( /* Shape init */ Tensor sparse_coo_tensor(IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -268,11 +268,11 @@ static inline Tensor expand_values_if_needed(const Tensor& values) { } // namespace Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values_, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional is_coalesced) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional is_coalesced) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -352,7 +352,7 @@ void _validate_sparse_coo_tensor_args( const Tensor& indices, const Tensor& values_, ArrayRef size, - c10::optional is_coalesced_) { + std::optional is_coalesced_) { Tensor values = expand_values_if_needed(values_); bool is_coalesced = is_coalesced_.value_or(false); @@ -425,11 +425,11 @@ void _validate_sparse_coo_tensor_args( // NB: Got rid of the sizes == NULL case Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values, IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional is_coalesced) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional is_coalesced) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); // arg checking @@ -449,11 +449,11 @@ Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values, IntArrayRe } Tensor 
_sparse_coo_tensor_unsafe(const Tensor& indices, const Tensor& values_, at::IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional is_coalesced) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional is_coalesced) { if (at::globalContext().checkSparseTensorInvariants()) { at::native::_validate_sparse_coo_tensor_args(indices, values_, size, is_coalesced); } @@ -467,11 +467,11 @@ Tensor _sparse_coo_tensor_unsafe(const Tensor& indices, const Tensor& values_, a // _validate_sparse_coo_tensor_args before using the tensor. // NB: Got rid of the size == NULL case Tensor _sparse_coo_tensor_unsafe_symint(const Tensor& indices, const Tensor& values_, c10::SymIntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional is_coalesced) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional is_coalesced) { // See [Note: hacky wrapper removal for TensorOptions] Tensor values = expand_values_if_needed(values_); @@ -495,7 +495,7 @@ Tensor _sparse_coo_tensor_unsafe_symint(const Tensor& indices, const Tensor& val SparseTensor clone_sparse( const SparseTensor& self, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { TORCH_CHECK( !optional_memory_format.has_value(), "unsupported memory format option ", @@ -687,7 +687,7 @@ SparseTensor _coalesce_sparse_cpu(const SparseTensor& self) { DEFINE_DISPATCH(sparse_mask_intersection_out_stub); DEFINE_DISPATCH(sparse_mask_projection_out_stub); -using OptTensor = c10::optional; +using OptTensor = std::optional; static std::tuple sparse_mask_like_prepare_sparse_inputs( const std::string& method_name, @@ -814,11 +814,11 @@ Tensor sparse_mask_projection(const Tensor& t, const Tensor& mask, bool accumula Tensor empty_like_sparse_coo( const Tensor& self, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional optional_memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional optional_memory_format) { TensorOptions options_ = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); TORCH_CHECK( diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index a3227df942c45..f058c68579f86 100644 --- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -220,7 +220,7 @@ static SparseTensor& coalesce_(SparseTensor& tensor) { // div(SparseTensor, Scalar) // -------------------------------------------------------------------- -SparseTensor& div_out_sparse_zerodim(const SparseTensor& t, const Tensor& value, c10::optional rounding_mode, SparseTensor& r) { +SparseTensor& div_out_sparse_zerodim(const SparseTensor& t, const Tensor& value, std::optional rounding_mode, SparseTensor& r) { TORCH_CHECK(value.dim() == 0, "Sparse division requires a scalar or ", "zero-dim dense tensor divisor (got shape ", value.sizes(), " for divisor)"); TORCH_CHECK(!value.is_sparse(), "Sparse division requires a scalar or ", @@ -274,7 +274,7 @@ static SparseTensor& div_out_sparse_scalar(const SparseTensor& t, Scalar value, return div_out_sparse_zerodim(t, wrapped_scalar_tensor(value), r); } -Tensor div_sparse(const Tensor& self, const Tensor& 
value, c10::optional rounding_mode) { +Tensor div_sparse(const Tensor& self, const Tensor& value, std::optional rounding_mode) { auto commonDtype = at::result_type(self, value); if (c10::isIntegralType(commonDtype, /*includeBool=*/true) && !rounding_mode.has_value()) { commonDtype = typeMetaToScalarType(at::get_default_dtype()); @@ -283,11 +283,11 @@ Tensor div_sparse(const Tensor& self, const Tensor& value, c10::optional rounding_mode) { +Tensor& div_sparse_(Tensor& self, const Tensor& value, std::optional rounding_mode) { return div_out_sparse_zerodim(self, value, std::move(rounding_mode), self); } -static SparseTensor& div_out_sparse_scalar(const SparseTensor& t, Scalar value, c10::optional rounding_mode, SparseTensor& r) { +static SparseTensor& div_out_sparse_scalar(const SparseTensor& t, Scalar value, std::optional rounding_mode, SparseTensor& r) { return div_out_sparse_zerodim(t, wrapped_scalar_tensor(value), std::move(rounding_mode), r); } diff --git a/aten/src/ATen/native/sparse/SparseUnaryOps.cpp b/aten/src/ATen/native/sparse/SparseUnaryOps.cpp index ce6e3d4eac11b..f5445ba4bd48d 100644 --- a/aten/src/ATen/native/sparse/SparseUnaryOps.cpp +++ b/aten/src/ATen/native/sparse/SparseUnaryOps.cpp @@ -257,16 +257,16 @@ Tensor& threshold_backward_sparse_out( } Tensor nan_to_num_sparse( - const Tensor &self, c10::optional nan, - c10::optional posinf, c10::optional neginf) { + const Tensor &self, std::optional nan, + std::optional posinf, c10::optional neginf) { return coalesced_unary_ufunc( self, [&](const Tensor &t) { return at::nan_to_num(t, nan, posinf, neginf); }); } Tensor& nan_to_num_sparse_out( - const Tensor &self, c10::optional nan, - c10::optional posinf, c10::optional neginf, + const Tensor &self, std::optional nan, + std::optional posinf, c10::optional neginf, Tensor &out) { return coalesced_unary_ufunc_out( self, out, [&](const Tensor &t, Tensor &out) { @@ -274,8 +274,8 @@ Tensor& nan_to_num_sparse_out( }); } Tensor& nan_to_num_sparse_( - Tensor &self, c10::optional nan, - c10::optional posinf, c10::optional neginf) { + Tensor &self, std::optional nan, + std::optional posinf, c10::optional neginf) { TORCH_CHECK(self.is_coalesced(), "nan_to_num_ requires coalesced input"); return nan_to_num_sparse_out(self, nan, posinf, neginf, self); } diff --git a/aten/src/ATen/native/sparse/cuda/SparseCsrTensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCsrTensorMath.cu index 75474e77ea848..1ee5a8b9d2c01 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCsrTensorMath.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCsrTensorMath.cu @@ -389,7 +389,7 @@ struct Reduction...Op { }; -Tensor _sparse_csr_..._cuda(const Tensor& input, IntArrayRef dims_to_sum, bool keepdim, c10::optional dtype) { +Tensor _sparse_csr_..._cuda(const Tensor& input, IntArrayRef dims_to_sum, bool keepdim, std::optional dtype) { ... result = reduce_sparse_csr_cuda_template(input_, dims_to_sum, keepdim, Reduction...Op()); ... 
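The nan_to_num_sparse* signatures above carry three optional replacement values (nan, posinf, neginf, each an optional double in the real code). A small self-contained sketch of how such optional replacements resolve per element is shown below; nan_to_num_scalar is an illustrative helper, not the ATen implementation, which applies the same logic elementwise over the coalesced values tensor.

// Sketch of NaN/Inf replacement driven by optional arguments, mirroring the
// defaults of torch.nan_to_num: NaN -> 0, +Inf -> max finite, -Inf -> lowest.
#include <cmath>
#include <iostream>
#include <limits>
#include <optional>

double nan_to_num_scalar(double x,
                         std::optional<double> nan = std::nullopt,
                         std::optional<double> posinf = std::nullopt,
                         std::optional<double> neginf = std::nullopt) {
  if (std::isnan(x)) {
    return nan.value_or(0.0);  // NaN defaults to 0
  }
  if (std::isinf(x)) {
    if (x > 0) {
      return posinf.value_or(std::numeric_limits<double>::max());
    }
    return neginf.value_or(std::numeric_limits<double>::lowest());
  }
  return x;  // finite values pass through unchanged
}

int main() {
  const double nan = std::numeric_limits<double>::quiet_NaN();
  const double inf = std::numeric_limits<double>::infinity();
  std::cout << nan_to_num_scalar(nan) << "\n";           // 0
  std::cout << nan_to_num_scalar(inf, {}, 1e6) << "\n";  // 1e6 (caller override)
  std::cout << nan_to_num_scalar(-inf) << "\n";          // lowest finite double
  std::cout << nan_to_num_scalar(3.5) << "\n";           // 3.5
  return 0;
}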
@@ -708,7 +708,7 @@ struct ReductionMulOp { } // namespace -Tensor _sparse_csr_sum_cuda(const Tensor& input, IntArrayRef dims_to_sum, bool keepdim, c10::optional dtype) { +Tensor _sparse_csr_sum_cuda(const Tensor& input, IntArrayRef dims_to_sum, bool keepdim, std::optional dtype) { ScalarType dtype_ = dtype.value_or(input.scalar_type()); Tensor input_ = at::sparse_csr::to_type(input, dtype_); Tensor result; @@ -724,7 +724,7 @@ Tensor _sparse_csr_sum_cuda(const Tensor& input, IntArrayRef dims_to_sum, bool k return result; } -Tensor _sparse_csr_prod_cuda(const Tensor& input, IntArrayRef dims_to_reduce, bool keepdim, c10::optional dtype) { +Tensor _sparse_csr_prod_cuda(const Tensor& input, IntArrayRef dims_to_reduce, bool keepdim, std::optional dtype) { ScalarType dtype_ = dtype.value_or(input.scalar_type()); Tensor input_ = input.to(dtype_); Tensor result; diff --git a/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredLinear.cu b/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredLinear.cu index 47ee1568beb1e..01aa11dbdecb5 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredLinear.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredLinear.cu @@ -600,9 +600,9 @@ Tensor two_four_sgemm_dispatch_layouts_bias_activation( // number of checks throughout the code. Tensor _sparse_semi_structured_linear( const Tensor& input, const Tensor& weight, - const Tensor& meta, const c10::optional& bias_opt, - const c10::optional activation_opt, - const c10::optional out_dtype_opt) { + const Tensor& meta, const std::optional& bias_opt, + const std::optional activation_opt, + const std::optional out_dtype_opt) { TORCH_WARN_ONCE("_sparse_semi_structured_linear is deprecated and will be " "removed in a future PyTorch release. Please use " "_sparse_semi_structured_mm/_sparse_semi_structured_addmm " diff --git a/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredOps.cu b/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredOps.cu index 8c05acc66bc92..abd6cf9739c63 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredOps.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredOps.cu @@ -522,8 +522,8 @@ void spgemm_cutlass_dispatch_layouts_tensor_c( // aten._sparse_semi_structured_addmm operators. Tensor sparse_semi_structured_mad_op( const Tensor& mat1, const Tensor& mat1_meta, const Tensor& mat2, - const c10::optional& input_opt, const Scalar& alpha, - const Scalar& beta, const c10::optional out_dtype_opt) { + const std::optional& input_opt, const Scalar& alpha, + const Scalar& beta, const std::optional out_dtype_opt) { #if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080) AT_ERROR(__func__, " : CUTLASS not supported"); return Tensor{}; @@ -787,9 +787,9 @@ Tensor sparse_semi_structured_mad_op( // Implementation of aten._sparse_semi_structured_mm operator. 
Tensor _sparse_semi_structured_mm( const Tensor& mat1, const Tensor& mat1_meta, const Tensor& mat2, - const c10::optional out_dtype_opt) { + const std::optional out_dtype_opt) { return sparse_semi_structured_mad_op(mat1, mat1_meta, mat2, - c10::optional(), 1, 0, + std::optional(), 1, 0, out_dtype_opt); } @@ -797,7 +797,7 @@ Tensor _sparse_semi_structured_mm( Tensor _sparse_semi_structured_addmm( const Tensor& input, const Tensor& mat1, const Tensor& mat1_meta, const Tensor& mat2, const Scalar& alpha, const Scalar& beta, - const c10::optional out_dtype_opt) { + const std::optional out_dtype_opt) { return sparse_semi_structured_mad_op(mat1, mat1_meta, mat2, input, alpha, beta, out_dtype_opt); } diff --git a/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredTile.cu b/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredTile.cu index fd5a04fa61039..b5382b5b08486 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredTile.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredTile.cu @@ -207,7 +207,7 @@ std::tuple sparse_semi_structured_tile_t std::string algorithm) { using KT = KernelTypes; - c10::optional device_guard; + std::optional device_guard; if (!input.is_meta()) { device_guard.emplace(input.device()); } diff --git a/aten/src/ATen/native/sparse/cuda/SparseSemiSturcturedApply.cu b/aten/src/ATen/native/sparse/cuda/SparseSemiSturcturedApply.cu index 023e8f73930fd..2fbbaa0290703 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseSemiSturcturedApply.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseSemiSturcturedApply.cu @@ -34,7 +34,7 @@ std::tuple _sparse_semi_structured_apply_typed(Tensor input, Ten if (input.stride(1) != 1) { input = input.contiguous(); } - c10::optional device_guard; + std::optional device_guard; if (!kIsMeta) { device_guard.emplace(input.device()); } diff --git a/aten/src/ATen/native/sparse/cuda/cuSPARSELtOps.cpp b/aten/src/ATen/native/sparse/cuda/cuSPARSELtOps.cpp index c66fbf8f2a93d..384fa2422b247 100644 --- a/aten/src/ATen/native/sparse/cuda/cuSPARSELtOps.cpp +++ b/aten/src/ATen/native/sparse/cuda/cuSPARSELtOps.cpp @@ -101,9 +101,9 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) std::tuple _cslt_sparse_mm_impl( const Tensor& compressed_A, const Tensor& dense_B, - const c10::optional& bias_opt, - const c10::optional& alpha_opt, - const c10::optional out_dtype_opt, + const std::optional& bias_opt, + const std::optional& alpha_opt, + const std::optional out_dtype_opt, bool transpose_result, int alg_id, bool search_alg_id @@ -343,9 +343,9 @@ std::tuple _cslt_sparse_mm_impl( at::Tensor _cslt_sparse_mm( const Tensor& compressed_A, const Tensor& dense_B, - const c10::optional& bias_opt, - const c10::optional& alpha_opt, - const c10::optional out_dtype_opt, + const std::optional& bias_opt, + const std::optional& alpha_opt, + const std::optional out_dtype_opt, bool transpose_result, int64_t alg_id ) @@ -365,9 +365,9 @@ at::Tensor _cslt_sparse_mm( int64_t _cslt_sparse_mm_search( const Tensor& compressed_A, const Tensor& dense_B, - const c10::optional& bias_opt, - const c10::optional& alpha_opt, - const c10::optional out_dtype_opt, + const std::optional& bias_opt, + const std::optional& alpha_opt, + const std::optional out_dtype_opt, bool transpose_result ) { @@ -398,9 +398,9 @@ at::Tensor _cslt_compress(const Tensor& sparse_input){ at::Tensor _cslt_sparse_mm( const Tensor& compressed_A, const Tensor& dense_B, - const c10::optional& bias_opt, - const c10::optional& alpha_opt, - const c10::optional out_dtype, + const std::optional& bias_opt, + 
const std::optional& alpha_opt, + const std::optional out_dtype, bool transpose_result, int64_t alg_id) { @@ -410,9 +410,9 @@ at::Tensor _cslt_sparse_mm( int64_t _cslt_sparse_mm_search( const Tensor& compressed_A, const Tensor& dense_B, - const c10::optional& bias_opt, - const c10::optional& alpha_opt, - const c10::optional out_dtype, + const std::optional& bias_opt, + const std::optional& alpha_opt, + const std::optional out_dtype, bool transpose_result ) { diff --git a/aten/src/ATen/native/transformers/attention.cpp b/aten/src/ATen/native/transformers/attention.cpp index e26de29537954..ede02ab1352f0 100644 --- a/aten/src/ATen/native/transformers/attention.cpp +++ b/aten/src/ATen/native/transformers/attention.cpp @@ -106,9 +106,9 @@ Tensor bmm_nt(const Tensor& a, const Tensor& b) { Tensor masked_softmax( Tensor& attn_scores, - c10::optional attn_mask, + std::optional attn_mask, const Tensor& query, - c10::optional mask_type) { + std::optional mask_type) { if (query.is_nested() && !attn_mask) { return at::_nested_tensor_softmax_with_shape(attn_scores, query); } @@ -267,10 +267,10 @@ std::tuple native_multi_head_attention_cpu( const Tensor& qkv_bias, const Tensor& proj_weight, const Tensor& proj_bias, - const c10::optional& mask, + const std::optional& mask, bool need_weights, bool average_attn_weights, - const c10::optional mask_type) { + const std::optional mask_type) { // query shape: [B, T, D] // qkv_weight shape: [3 * D, D] @@ -423,7 +423,7 @@ std::tuple native_multi_head_attention_cpu( } int64_t _fused_sdp_choice_cpp(const Tensor& query_, const Tensor& key, const Tensor& value, - const c10::optional& attn_mask_, double dropout_p, bool is_causal, c10::optional scale){ + const std::optional& attn_mask_, double dropout_p, bool is_causal, c10::optional scale){ sdp::sdp_params kernel_params{query_, key, value, attn_mask_, dropout_p, is_causal}; auto backend = sdp::select_sdp_backend_cpp(kernel_params); if (backend == sdp::SDPBackend::error) { @@ -445,10 +445,10 @@ int64_t _fused_sdp_choice_meta( const Tensor& query_, const Tensor& key, const Tensor& value, - const c10::optional& attn_mask_, + const std::optional& attn_mask_, double dropout_p, bool is_causal, - c10::optional scale) { + std::optional scale) { auto query_key_set = query_.key_set(); #if defined(USE_ROCM) bool has_rocm = query_key_set.has(c10::DispatchKey::HIP); @@ -479,10 +479,10 @@ inline void validate_sdpa_input( const Tensor& query_, const Tensor& key, const Tensor& value, - const c10::optional& attn_mask_, + const std::optional& attn_mask_, double dropout_p, bool is_causal, - c10::optional scale) { + std::optional scale) { TORCH_CHECK( query_.dtype() == key.dtype() && query_.dtype() == value.dtype(), "Expected query, key, and value to have the same dtype, but got query.dtype: ", @@ -512,7 +512,7 @@ inline void validate_sdpa_input( // the math and memory efficient attn_mask implementation // Args: // attn_mask: attn_mask of shape (B, L, S) or (L, S) or (B, N_heads, L, S) -c10::optional convert_boolean_attn_mask(const c10::optional& attn_mask, caffe2::TypeMeta dtype) { +std::optional convert_boolean_attn_mask(const c10::optional& attn_mask, caffe2::TypeMeta dtype) { // Pass through if(!attn_mask.has_value()){ return c10::nullopt; @@ -598,7 +598,7 @@ at::Tensor post_process_flash_output( } int64_t handle_private_use(const Tensor& query_, const Tensor& key, const Tensor& value, - const c10::optional& attn_mask_, double dropout_p, bool is_causal, c10::optional scale){ + const std::optional& attn_mask_, double dropout_p, bool 
is_causal, c10::optional scale){ int64_t choice_int = static_cast(sdp::SDPBackend::math); try { choice_int = _fused_sdp_choice_stub(query_.device().type(), @@ -643,10 +643,10 @@ Tensor scaled_dot_product_attention( const Tensor& query_, const Tensor& key, const Tensor& value, - const c10::optional& attn_mask_, + const std::optional& attn_mask_, double dropout_p, bool is_causal, - c10::optional scale) { + std::optional scale) { validate_sdpa_input(query_, key, value, attn_mask_, dropout_p, is_causal, scale); int64_t choice_int = static_cast(sdp::SDPBackend::math); if (query_.device().type() == DeviceType::CUDA @@ -662,7 +662,7 @@ Tensor scaled_dot_product_attention( } } sdp::SDPBackend backend = static_cast(choice_int); - c10::optional attn_mask = convert_boolean_attn_mask(attn_mask_, query_.dtype()); + std::optional attn_mask = convert_boolean_attn_mask(attn_mask_, query_.dtype()); switch (backend) { case sdp::SDPBackend::cudnn_attention: { bool compute_logsumexp = @@ -719,8 +719,8 @@ Tensor scaled_dot_product_attention( std::tuple _scaled_dot_product_attention_math( const Tensor& query_, const Tensor& key, const Tensor& value, - const c10::optional& attn_mask_, double dropout_p, bool is_causal, - const c10::optional& dropout_mask, c10::optional scale) { + const std::optional& attn_mask_, double dropout_p, bool is_causal, + const std::optional& dropout_mask, c10::optional scale) { C10_LOG_API_USAGE_ONCE("torch.sdpa.math_fallback"); if (query_.is_nested() || key.is_nested() || value.is_nested()) { TORCH_CHECK( @@ -779,8 +779,8 @@ _scaled_dot_product_flash_attention_cpu( const Tensor& value, double dropout_p, bool is_causal, - const c10::optional& attn_mask, - c10::optional scale) { + const std::optional& attn_mask, + std::optional scale) { const auto dtype = query.scalar_type(); int64_t batchSize = query.size(0); int64_t qSize = query.size(2); @@ -827,8 +827,8 @@ _scaled_dot_product_flash_attention_cpu_backward( const Tensor& logsumexp, double dropout_p, bool is_causal, - const c10::optional& attn_mask, - c10::optional scale) { + const std::optional& attn_mask, + std::optional scale) { if (!grad_out.defined()) { return std::make_tuple(Tensor{}, Tensor{}, Tensor{}); } @@ -864,7 +864,7 @@ Tensor triton_multi_head_attention( const Tensor& qkv_bias, const Tensor& proj_weight, const Tensor& proj_bias, - const c10::optional& mask) { + const std::optional& mask) { // query shape: [B, T, D] // qkv_weight shape: [3 * D, D] TORCH_CHECK(!mask, "Only causal mask is supported for Triton."); diff --git a/aten/src/ATen/native/transformers/attention.h b/aten/src/ATen/native/transformers/attention.h index 2d2740a92e7dc..0e4a52f445442 100644 --- a/aten/src/ATen/native/transformers/attention.h +++ b/aten/src/ATen/native/transformers/attention.h @@ -9,16 +9,16 @@ namespace at { namespace native { using fused_sdp_choice_fn = int64_t (*)(const Tensor& query_, const Tensor& key, const Tensor& value, - const c10::optional& attn_mask_, double dropout_p, bool is_causal, c10::optional scale); + const std::optional& attn_mask_, double dropout_p, bool is_causal, c10::optional scale); DECLARE_DISPATCH(fused_sdp_choice_fn, _fused_sdp_choice_stub); TORCH_API Tensor bmm_nt(const Tensor& a, const Tensor& b); TORCH_API Tensor masked_softmax( Tensor& attn_scores, - c10::optional attn_mask, + std::optional attn_mask, const Tensor& query, - c10::optional mask_type = {}); + std::optional mask_type = {}); using transform_bias_rescale_qkv_fn = void(*)( at::ScalarType type, @@ -53,8 +53,8 @@ using flash_attention_fn = void (*)( 
const Tensor& output, const Tensor& logsumexp, const Tensor& query, const Tensor& key, const Tensor& value, double dropout_p, bool is_causal, - c10::optional attn_mask, - c10::optional scale); + std::optional attn_mask, + std::optional scale); using flash_attention_backward_fn = void (*)( const Tensor& grad_q, const Tensor& grad_k, @@ -62,8 +62,8 @@ using flash_attention_backward_fn = void (*)( const Tensor& query, const Tensor& key, const Tensor& value, const Tensor& out, const Tensor& logsumexp, double dropout_p, bool is_causal, - c10::optional attn_mask, - c10::optional scale); + std::optional attn_mask, + std::optional scale); DECLARE_DISPATCH(flash_attention_fn, flash_attention_kernel); DECLARE_DISPATCH(flash_attention_backward_fn, flash_attention_backward_kernel); diff --git a/aten/src/ATen/native/transformers/cuda/attention.cu b/aten/src/ATen/native/transformers/cuda/attention.cu index dcf451feead7b..e55560791a085 100644 --- a/aten/src/ATen/native/transformers/cuda/attention.cu +++ b/aten/src/ATen/native/transformers/cuda/attention.cu @@ -479,10 +479,10 @@ std::tuple native_multi_head_attention_cuda( const Tensor& qkv_bias, const Tensor& proj_weight, const Tensor& proj_bias, - const c10::optional& mask, + const std::optional& mask, bool need_weights, bool average_attn_weights, - const c10::optional mask_type) { + const std::optional mask_type) { // query shape: [B, T, D] // qkv_weight shape: [3 * D, D] @@ -681,7 +681,7 @@ std::tuple scale) { + std::optional scale) { // Used for tracking usage statistics C10_LOG_API_USAGE_ONCE("torch.sdpa.flash_attention"); // Query (Batch x Num_heads x Q_seq_len x Dim_per_head) @@ -733,7 +733,7 @@ std::tuple scale) { + std::optional scale) { // Used for tracking usage statistics C10_LOG_API_USAGE_ONCE("torch.sdpa.flash_attention_cudnn"); // Query (Batch x Num_heads x Q_seq_len x Dim_per_head) @@ -780,11 +780,11 @@ std::tuple _scaled_dot_product_efficient_attenti const Tensor& query, const Tensor& key, const Tensor& value, - const c10::optional& attn_bias, + const std::optional& attn_bias, bool compute_log_sumexp, double dropout_p, bool is_causal, - c10::optional scale) { + std::optional scale) { // Used for tracking usage statistics C10_LOG_API_USAGE_ONCE("torch.sdpa.mem_efficient_attention"); // Query -> Query(Batch x Q_seq_len x Num_heads x Dim_per_head) @@ -817,7 +817,7 @@ std::tuple _scaled_dot_product_efficient_attenti } int64_t _fused_sdp_choice_cuda(const Tensor& query_, const Tensor& key, const Tensor& value, - const c10::optional& attn_mask_, double dropout_p, bool is_causal, c10::optional scale){ + const std::optional& attn_mask_, double dropout_p, bool is_causal, c10::optional scale){ sdp::sdp_params kernel_params{query_, key, value, attn_mask_, dropout_p, is_causal}; auto backend = select_sdp_backend(kernel_params); if (backend == sdp::SDPBackend::error) { @@ -834,23 +834,23 @@ _flash_attention_forward( const Tensor& query, const Tensor& key, const Tensor& value, - const c10::optional& cumulative_sequence_length_q, - const c10::optional& cumulative_sequence_length_k, + const std::optional& cumulative_sequence_length_q, + const std::optional& cumulative_sequence_length_k, int64_t max_seqlen_batch_q, int64_t max_seqlen_batch_k, double dropout_p, bool is_causal, bool return_debug_mask, - c10::optional scale) { + std::optional scale) { #if defined(USE_FLASH_ATTENTION) const auto softmax_scale = sdp::calculate_scale(query, scale).as_float_unchecked(); - c10::optional out = c10::nullopt; + std::optional out = c10::nullopt; // This can be used 
when your sequence length k is not the full extent // of the tensor. This is useful for kv cache scenarios but for now // we will not support in this PR. - c10::optional seqused_k = c10::nullopt; - c10::optional alibi_slopes = c10::nullopt; + std::optional seqused_k = c10::nullopt; + std::optional alibi_slopes = c10::nullopt; // We are going to have two paths: // 1. The standard MHA path for dense tensors @@ -937,23 +937,23 @@ std::tuple _efficient_ const at::Tensor& query, // [b, seqlen, num_heads, K] const at::Tensor& key, // [b, seqlen, num_heads, K] const at::Tensor& value, // [b, seqlen, num_heads, Kv] - const c10::optional& bias, // [b, num_heads, seqlen, seqlen] + const std::optional& bias, // [b, num_heads, seqlen, seqlen] // (Mode 1MHK only) [b+1]: cu_seqlens_q[b] contains the // position of the first query token for batch $b - const c10::optional& seqstart_q, + const std::optional& seqstart_q, // (Mode 1MHK only) [b+1]: cu_seqlen_k[b] contains the // position of the first key token for batch $b - const c10::optional& seqstart_k, + const std::optional& seqstart_k, // (Mode 1MHK only) Maximum sequence length across batches - const c10::optional max_seqlen_q_, - const c10::optional max_seqlen_k_, + const std::optional max_seqlen_q_, + const std::optional max_seqlen_k_, double dropout_p, // attention matrix dropout probability int64_t custom_mask_type, bool compute_logsumexp, - c10::optional scale, - const c10::optional& causal_diagonal, - const c10::optional& seqlen_k, - const c10::optional window_size) { + std::optional scale, + const std::optional& causal_diagonal, + const std::optional& seqlen_k, + const std::optional window_size) { #if defined(USE_MEM_EFF_ATTENTION) // TODO In theory it is possible to compile with _CUDA_ARCH < 5.0 and run on a // machine that is >= 5.0. 
In practice, this is not a problem but since diff --git a/aten/src/ATen/native/transformers/cuda/attention_backward.cu b/aten/src/ATen/native/transformers/cuda/attention_backward.cu index 0405b6d73329f..78c2d54fdc8a6 100644 --- a/aten/src/ATen/native/transformers/cuda/attention_backward.cu +++ b/aten/src/ATen/native/transformers/cuda/attention_backward.cu @@ -66,22 +66,22 @@ std::tuple _flash_attention_backward( bool is_causal, const Tensor& philox_seed, const Tensor& philox_offset, - c10::optional scale) { + std::optional scale) { #if defined(USE_FLASH_ATTENTION) const auto softmax_scale = sdp::calculate_scale(query, scale).as_float_unchecked(); // CUDA code assumes that dout is contiguous auto contiguous_grad_out = grad_out.contiguous(); auto contiguous_out = out.contiguous(); - c10::optional dq{c10::nullopt}; - c10::optional dk{c10::nullopt}; - c10::optional dv{c10::nullopt}; + std::optional dq{c10::nullopt}; + std::optional dk{c10::nullopt}; + std::optional dv{c10::nullopt}; // The kernel computes irregardless we will drop for this functions return Tensor grad_softmax; // Currently unused args: - c10::optional alibi_slopes{c10::nullopt}; + std::optional alibi_slopes{c10::nullopt}; bool determinisitic{false}; auto& ctx = at::globalContext(); @@ -167,7 +167,7 @@ std::tuple _scaled_dot_product_cudnn_attention_backward_ bool is_causal, const Tensor& philox_seed, const Tensor& philox_offset, - c10::optional scale) { + std::optional scale) { const int64_t batch_size = query.size(0); const int64_t num_heads = query.size(1); const int64_t head_dim = query.size(3); @@ -205,14 +205,14 @@ _efficient_attention_backward( const at::Tensor& query, const at::Tensor& key, const at::Tensor& value, - const c10::optional& kernel_bias, // additive attention bias + const std::optional& kernel_bias, // additive attention bias const at::Tensor& out, // (Mode 1MHK only) [b+1]: cu_seqlens_q[b] contains the // position of the first query token for batch $b - const c10::optional& cu_seqlens_q_dummy, + const std::optional& cu_seqlens_q_dummy, // (Mode 1MHK only) [b+1]: cu_seqlens_k[b] contains the // position of the first key token for batch $b - const c10::optional& cu_seqlens_k_dummy, + const std::optional& cu_seqlens_k_dummy, // (Mode 1MHK only) Maximum sequence length across batches int64_t max_seqlen_q, // (Mode 1MHK only) Maximum sequence length across batches @@ -223,9 +223,9 @@ _efficient_attention_backward( const at::Tensor& philox_offset, // offset into random number sequence int64_t custom_mask_type, const bool bias_requires_grad, - const c10::optional scale, - c10::optional num_splits_key, - const c10::optional window_size) { + const std::optional scale, + std::optional num_splits_key, + const std::optional window_size) { #if defined(USE_MEM_EFF_ATTENTION) if (!grad_out_.defined()) { return std::make_tuple(Tensor{}, Tensor{}, Tensor{}, Tensor{}); @@ -233,8 +233,8 @@ _efficient_attention_backward( // This path is used when we directly call _efficient_attention_forward // from python. // This is needed because SaveVariable automatically converts - // c10::optional to undefined tensor - c10::optional bias, cu_seqlens_q, cu_seqlens_k; + // std::optional to undefined tensor + std::optional bias, cu_seqlens_q, cu_seqlens_k; bias = kernel_bias.has_value() && !kernel_bias->defined() ? c10::nullopt : kernel_bias; cu_seqlens_q = cu_seqlens_q_dummy.has_value() && !cu_seqlens_q_dummy->defined() ? c10::nullopt : cu_seqlens_q_dummy; cu_seqlens_k = cu_seqlens_k_dummy.has_value() && !cu_seqlens_k_dummy->defined() ? 
c10::nullopt : cu_seqlens_k_dummy; @@ -603,7 +603,7 @@ std::tuple _scaled_dot_product_flash_attenti bool is_causal, const at::Tensor& philox_seed, const at::Tensor& philox_offset, - c10::optional scale){ + std::optional scale){ if (!grad_out_.defined()) { return std::make_tuple(Tensor{}, Tensor{}, Tensor{}); } @@ -653,7 +653,7 @@ std::tuple _scaled_dot_product_e double dropout_p, std::array grad_input_mask, bool causal, - c10::optional scale) { + std::optional scale) { if (!grad_out_.defined()) { return std::make_tuple(Tensor{}, Tensor{}, Tensor{}, Tensor{}); @@ -667,8 +667,8 @@ std::tuple _scaled_dot_product_e Tensor grad_q, grad_k, grad_v, grad_bias; // This is needed because SaveVariable automatically converts - // c10::optional to undefined tensor - c10::optional kernel_bias; + // std::optional to undefined tensor + std::optional kernel_bias; if (attn_bias.defined()) { kernel_bias = attn_bias; } diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/flash_api.cpp b/aten/src/ATen/native/transformers/cuda/flash_attn/flash_api.cpp index 8f6f7a9f357dc..5c7db42368931 100644 --- a/aten/src/ATen/native/transformers/cuda/flash_attn/flash_api.cpp +++ b/aten/src/ATen/native/transformers/cuda/flash_attn/flash_api.cpp @@ -322,7 +322,7 @@ void set_params_splitkv(Flash_fwd_params ¶ms, const int batch_size, } } -void set_params_alibi(Flash_fwd_params ¶ms, c10::optional &alibi_slopes_, int batch_size, int num_heads){ +void set_params_alibi(Flash_fwd_params ¶ms, std::optional &alibi_slopes_, int batch_size, int num_heads){ #ifdef FLASHATTENTION_DISABLE_ALIBI TORCH_CHECK(!alibi_slopes_.has_value(), "This flash attention build does not support alibi."); params.alibi_slopes_ptr = nullptr; @@ -346,15 +346,15 @@ std::tuple &out_, // batch_size x seqlen_q x num_heads x head_size - c10::optional &alibi_slopes_, // num_heads or batch_size x num_heads + std::optional &out_, // batch_size x seqlen_q x num_heads x head_size + std::optional &alibi_slopes_, // num_heads or batch_size x num_heads const float p_dropout, const float softmax_scale, bool is_causal, int window_size_left, int window_size_right, const bool return_softmax, - c10::optional gen_) { + std::optional gen_) { auto dprops = at::cuda::getCurrentDeviceProperties(); // bool is_sm75 = dprops->major == 7 && dprops->minor == 5; @@ -532,11 +532,11 @@ std::tuple &out_, // total_q x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i + std::optional &out_, // total_q x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i const at::Tensor &cu_seqlens_q, // b+1 const at::Tensor &cu_seqlens_k, // b+1 - c10::optional &seqused_k, // b. If given, only this many elements of each batch element's keys are used. - c10::optional &alibi_slopes_, // num_heads or b x num_heads + std::optional &seqused_k, // b. If given, only this many elements of each batch element's keys are used. 
+ std::optional &alibi_slopes_, // num_heads or b x num_heads int max_seqlen_q, const int max_seqlen_k, const float p_dropout, @@ -546,7 +546,7 @@ mha_varlen_fwd(const at::Tensor &q, // total_q x num_heads x head_size, total_q int window_size_left, int window_size_right, const bool return_softmax, - c10::optional gen_) { + std::optional gen_) { auto dprops = at::cuda::getCurrentDeviceProperties(); // bool is_sm75 = dprops->major == 7 && dprops->minor == 5; @@ -765,10 +765,10 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si const at::Tensor &v, // batch_size x seqlen_k x num_heads_k x head_size const at::Tensor &out, // batch_size x seqlen_q x num_heads x head_size const at::Tensor &softmax_lse, // b x h x seqlen_q - c10::optional &dq_, // batch_size x seqlen_q x num_heads x head_size - c10::optional &dk_, // batch_size x seqlen_k x num_heads_k x head_size - c10::optional &dv_, // batch_size x seqlen_k x num_heads_k x head_size - c10::optional &alibi_slopes_, // num_heads or batch_size x num_heads + std::optional &dq_, // batch_size x seqlen_q x num_heads x head_size + std::optional &dk_, // batch_size x seqlen_k x num_heads_k x head_size + std::optional &dv_, // batch_size x seqlen_k x num_heads_k x head_size + std::optional &alibi_slopes_, // num_heads or batch_size x num_heads const float p_dropout, // probability to drop const float softmax_scale, const bool is_causal, @@ -976,12 +976,12 @@ mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size const at::Tensor &v, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i const at::Tensor &out, // total_q x num_heads x head_size const at::Tensor &softmax_lse, // b x h x s softmax logsumexp - c10::optional &dq_, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i - c10::optional &dk_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i - c10::optional &dv_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + std::optional &dq_, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i + std::optional &dk_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + std::optional &dv_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i const at::Tensor &cu_seqlens_q, // b+1 const at::Tensor &cu_seqlens_k, // b+1 - c10::optional &alibi_slopes_, // num_heads or b x num_heads + std::optional &alibi_slopes_, // num_heads or b x num_heads const int max_seqlen_q, const int max_seqlen_k, // max sequence length to choose the kernel const float p_dropout, // probability to drop @@ -1208,15 +1208,15 @@ std::tuple mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size const at::Tensor &kcache, // batch_size_c x seqlen_k x num_heads_k x head_size or num_blocks x page_block_size x num_heads_k x head_size if there's a block_table. const at::Tensor &vcache, // batch_size_c x seqlen_k x num_heads_k x head_size or num_blocks x page_block_size x num_heads_k x head_size if there's a block_table. 
- c10::optional &k_, // batch_size x seqlen_knew x num_heads_k x head_size - c10::optional &v_, // batch_size x seqlen_knew x num_heads_k x head_size - c10::optional &seqlens_k_, // batch_size - c10::optional &rotary_cos_, // seqlen_ro x (rotary_dim / 2) - c10::optional &rotary_sin_, // seqlen_ro x (rotary_dim / 2) - c10::optional &cache_batch_idx_, // indices to index into the KV cache - c10::optional &block_table_, // batch_size x max_num_blocks_per_seq - c10::optional &alibi_slopes_, // num_heads or batch_size x num_heads - c10::optional &out_, // batch_size x seqlen_q x num_heads x head_size + std::optional &k_, // batch_size x seqlen_knew x num_heads_k x head_size + std::optional &v_, // batch_size x seqlen_knew x num_heads_k x head_size + std::optional &seqlens_k_, // batch_size + std::optional &rotary_cos_, // seqlen_ro x (rotary_dim / 2) + std::optional &rotary_sin_, // seqlen_ro x (rotary_dim / 2) + std::optional &cache_batch_idx_, // indices to index into the KV cache + std::optional &block_table_, // batch_size x max_num_blocks_per_seq + std::optional &alibi_slopes_, // num_heads or batch_size x num_heads + std::optional &out_, // batch_size x seqlen_q x num_heads x head_size const float softmax_scale, bool is_causal, int window_size_left, diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/flash_api.h b/aten/src/ATen/native/transformers/cuda/flash_attn/flash_api.h index 2745b28dca29b..a3aa8aaa7adff 100644 --- a/aten/src/ATen/native/transformers/cuda/flash_attn/flash_api.h +++ b/aten/src/ATen/native/transformers/cuda/flash_attn/flash_api.h @@ -11,25 +11,25 @@ std::tuple &out_, // batch_size x seqlen_q x num_heads x head_size - c10::optional &alibi_slopes_, // num_heads or batch_size x num_heads + std::optional &out_, // batch_size x seqlen_q x num_heads x head_size + std::optional &alibi_slopes_, // num_heads or batch_size x num_heads const float p_dropout, const float softmax_scale, bool is_causal, int window_size_left, int window_size_right, const bool return_softmax, - c10::optional gen_); + std::optional gen_); std::tuple mha_varlen_fwd(const at::Tensor &q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i const at::Tensor &k, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i const at::Tensor &v, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i - c10::optional &out_, // total_q x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i + std::optional &out_, // total_q x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i const at::Tensor &cu_seqlens_q, // b+1 const at::Tensor &cu_seqlens_k, // b+1 - c10::optional &seqused_k, // b. If given, only this many elements of each batch element's keys are used. - c10::optional &alibi_slopes_, // num_heads or b x num_heads + std::optional &seqused_k, // b. If given, only this many elements of each batch element's keys are used. 
+ std::optional &alibi_slopes_, // num_heads or b x num_heads int max_seqlen_q, const int max_seqlen_k, const float p_dropout, @@ -39,7 +39,7 @@ mha_varlen_fwd(const at::Tensor &q, // total_q x num_heads x head_size, total_q int window_size_left, int window_size_right, const bool return_softmax, - c10::optional gen_); + std::optional gen_); std::tuple @@ -49,10 +49,10 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si const at::Tensor &v, // batch_size x seqlen_k x num_heads_k x head_size const at::Tensor &out, // batch_size x seqlen_q x num_heads x head_size const at::Tensor &softmax_lse, // b x h x seqlen_q - c10::optional &dq_, // batch_size x seqlen_q x num_heads x head_size - c10::optional &dk_, // batch_size x seqlen_k x num_heads_k x head_size - c10::optional &dv_, // batch_size x seqlen_k x num_heads_k x head_size - c10::optional &alibi_slopes_, // num_heads or batch_size x num_heads + std::optional &dq_, // batch_size x seqlen_q x num_heads x head_size + std::optional &dk_, // batch_size x seqlen_k x num_heads_k x head_size + std::optional &dv_, // batch_size x seqlen_k x num_heads_k x head_size + std::optional &alibi_slopes_, // num_heads or batch_size x num_heads const float p_dropout, // probability to drop const float softmax_scale, const bool is_causal, @@ -69,12 +69,12 @@ mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size const at::Tensor &v, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i const at::Tensor &out, // total_q x num_heads x head_size const at::Tensor &softmax_lse, // b x h x s softmax logsumexp - c10::optional &dq_, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i - c10::optional &dk_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i - c10::optional &dv_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + std::optional &dq_, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i + std::optional &dk_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + std::optional &dv_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i const at::Tensor &cu_seqlens_q, // b+1 const at::Tensor &cu_seqlens_k, // b+1 - c10::optional &alibi_slopes_, // num_heads or b x num_heads + std::optional &alibi_slopes_, // num_heads or b x num_heads const int max_seqlen_q, const int max_seqlen_k, // max sequence length to choose the kernel const float p_dropout, // probability to drop diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp index 0debd93bf1d4f..d25c168fcbe88 100644 --- a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp +++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp @@ -491,6 +491,18 @@ bool check_runtime_enabled_cudnn(sdp_params const& params, bool debug) { return true; } +bool check_runtime_disabled_cudnn(sdp_params const& params, bool debug) { + // We check the global context to see if user has explicitly turned of cudnn + // sdp kernels + if (!at::globalContext().userEnabledCuDNNSDP()) { + if (debug) { + TORCH_WARN("CuDNN attention has been runtime disabled."); + } + return false; + } + return true; +} + bool check_cudnn_requires_grad(sdp_params const& params, bool debug) { // Check that the input is causal if (input_requires_grad(params)) { @@ -511,6 +523,7 @@ bool can_use_cudnn_attention(const sdp_params& params, bool debug) { constexpr auto general_constraints = array_of( check_runtime_enabled_cudnn, + 
check_runtime_disabled_cudnn, check_cudnn_hardware_support, check_all_tensors_on_device, check_cudnn_tensor_shapes, diff --git a/aten/src/ATen/native/transformers/sdp_utils_cpp.h b/aten/src/ATen/native/transformers/sdp_utils_cpp.h index 6e15a27fae542..7c56a1f617dbc 100644 --- a/aten/src/ATen/native/transformers/sdp_utils_cpp.h +++ b/aten/src/ATen/native/transformers/sdp_utils_cpp.h @@ -44,7 +44,7 @@ struct sdp_params { at::Tensor query; at::Tensor key; at::Tensor value; - c10::optional attn_mask; + std::optional attn_mask; double dropout; bool is_causal; }; @@ -53,7 +53,7 @@ SDPBackend select_sdp_backend_cpp(sdp_params const& kernel_params); inline c10::SymFloat calculate_scale( const at::Tensor& query, - c10::optional scale) { + std::optional scale) { const auto softmax_scale = scale.has_value() ? scale.value() : (c10::SymFloat(1.0) / (c10::SymFloat(query.sym_size(-1)).sqrt())); diff --git a/aten/src/ATen/native/transformers/transformer.cpp b/aten/src/ATen/native/transformers/transformer.cpp index 4f64c95b204b2..b551100555675 100644 --- a/aten/src/ATen/native/transformers/transformer.cpp +++ b/aten/src/ATen/native/transformers/transformer.cpp @@ -27,7 +27,7 @@ Tensor linear_for_ffn( const Tensor& bias, const Tensor& mat1, const Tensor& mat2, - c10::optional use_gelu) { + std::optional use_gelu) { if (mat1.is_nested()) { return NestedTensor_times_Tensor_plus_Tensor_addmm( bias, mat1, mat2.t(), 1, 1, use_gelu); @@ -91,8 +91,8 @@ Tensor transformer_encoder_layer_forward( const Tensor& ffn_bias_1, const Tensor& ffn_weight_2, const Tensor& ffn_bias_2, - const c10::optional& mask, - const c10::optional mask_type) { + const std::optional& mask, + const std::optional mask_type) { { const Tensor& check_for_empty = src.is_nested() ? get_nested_tensor_impl(src)->get_buffer() : src; if (check_for_empty.numel() == 0) { diff --git a/aten/src/ATen/native/utils/Factory.cpp b/aten/src/ATen/native/utils/Factory.cpp index ea6be4e017552..28ef6477e3335 100644 --- a/aten/src/ATen/native/utils/Factory.cpp +++ b/aten/src/ATen/native/utils/Factory.cpp @@ -12,7 +12,7 @@ Tensor empty_with_tail_padding( const IntArrayRef size, const caffe2::TypeMeta dtype, const c10::MemoryFormat memory_format, - c10::optional maybe_names) { + std::optional maybe_names) { auto* const allocator_ptr = c10::GetDefaultMobileCPUAllocator(); const int64_t nelements = c10::multiply_integers(size); size_t size_bytes = nelements * dtype.itemsize(); diff --git a/aten/src/ATen/native/utils/Factory.h b/aten/src/ATen/native/utils/Factory.h index bd153aaa67529..b0302417cdce0 100644 --- a/aten/src/ATen/native/utils/Factory.h +++ b/aten/src/ATen/native/utils/Factory.h @@ -17,7 +17,7 @@ at::Tensor empty_with_tail_padding( IntArrayRef size, const caffe2::TypeMeta dtype, c10::MemoryFormat memory_format, - c10::optional maybe_names); + std::optional maybe_names); } // namespace mobile } // namespace native diff --git a/aten/src/ATen/native/vulkan/ops/Batchnorm.cpp b/aten/src/ATen/native/vulkan/ops/Batchnorm.cpp index 3f583ddc3c4ae..e12e69c4ebec2 100644 --- a/aten/src/ATen/native/vulkan/ops/Batchnorm.cpp +++ b/aten/src/ATen/native/vulkan/ops/Batchnorm.cpp @@ -73,10 +73,10 @@ using namespace api::utils; Tensor batch_norm( const at::Tensor& input_arg, - const c10::optional& weight_opt /* optional */, - const c10::optional& bias_opt /* optional */, - const c10::optional& running_mean_opt /* optional */, - const c10::optional& running_var_opt /* optional */, + const std::optional& weight_opt /* optional */, + const std::optional& bias_opt /* optional */, + 
const std::optional& running_mean_opt /* optional */, + const std::optional& running_var_opt /* optional */, bool training, double /* momentum, not used in eval mode */, double eps, @@ -104,10 +104,10 @@ TORCH_LIBRARY_IMPL(aten, Vulkan, m) { } // namespace BatchNormPackedContext::BatchNormPackedContext( - const c10::optional& weight_opt, - const c10::optional& bias_opt, - const c10::optional& running_mean_opt, - const c10::optional& running_var_opt, + const std::optional& weight_opt, + const std::optional& bias_opt, + const std::optional& running_mean_opt, + const std::optional& running_var_opt, double eps) : unpacked_{c10::AnyType::get()} { packed_.reserve(ListArgs::kNumArgs); @@ -181,10 +181,10 @@ BatchNormPackedContext BatchNormPackedContext::pack( } c10::intrusive_ptr create_batchnorm_context( - c10::optional&& weight_opt, - c10::optional&& bias_opt, - c10::optional&& running_mean_opt, - c10::optional&& running_var_opt, + std::optional&& weight_opt, + std::optional&& bias_opt, + std::optional&& running_mean_opt, + std::optional&& running_var_opt, bool training, double /* momentum */, double eps, diff --git a/aten/src/ATen/native/vulkan/ops/Batchnorm.h b/aten/src/ATen/native/vulkan/ops/Batchnorm.h index 6afaeb6f243b3..4108b0d4e3201 100644 --- a/aten/src/ATen/native/vulkan/ops/Batchnorm.h +++ b/aten/src/ATen/native/vulkan/ops/Batchnorm.h @@ -18,10 +18,10 @@ class BatchNormPackedContext final : virtual public VulkanPackedContext, public: BatchNormPackedContext( - const c10::optional& weight_opt, - const c10::optional& bias_opt, - const c10::optional& running_mean_opt, - const c10::optional& running_var_opt, + const std::optional& weight_opt, + const std::optional& bias_opt, + const std::optional& running_mean_opt, + const std::optional& running_var_opt, double eps); /* @@ -47,10 +47,10 @@ class BatchNormPackedContext final : virtual public VulkanPackedContext, }; c10::intrusive_ptr create_batchnorm_context( - c10::optional&& weight_opt, - c10::optional&& bias_opt, - c10::optional&& running_mean_opt, - c10::optional&& running_var_opt, + std::optional&& weight_opt, + std::optional&& bias_opt, + std::optional&& running_mean_opt, + std::optional&& running_var_opt, bool training, double /* momentum */, double eps, diff --git a/aten/src/ATen/native/vulkan/ops/BinaryOp.cpp b/aten/src/ATen/native/vulkan/ops/BinaryOp.cpp index c08363a17f8eb..e1445f40ac5f8 100644 --- a/aten/src/ATen/native/vulkan/ops/BinaryOp.cpp +++ b/aten/src/ATen/native/vulkan/ops/BinaryOp.cpp @@ -15,7 +15,7 @@ using namespace api::utils; Tensor binary_op_scalar( const Tensor& self_arg, const Scalar& other, - const c10::optional& alpha_arg, + const std::optional& alpha_arg, const api::ShaderInfo& shader_descriptor) { api::Context* const context = api::context(); @@ -102,7 +102,7 @@ Tensor binary_op_preprocess_other_arg(const Tensor& other_arg) { Tensor& binary_op_scalar_( Tensor& self_arg, const Scalar& other, - const c10::optional& alpha_arg, + const std::optional& alpha_arg, const api::ShaderInfo& shader_descriptor) { TORCH_CHECK( self_arg.is_vulkan(), @@ -152,7 +152,7 @@ Tensor& binary_op_scalar_( Tensor binary_op_tensor( const Tensor& self_arg, const Tensor& other_arg, - const c10::optional& alpha_arg, + const std::optional& alpha_arg, const api::ShaderInfo& shader_descriptor) { utils::is_broadcastable(self_arg, other_arg); api::Context* const context = api::context(); @@ -313,7 +313,7 @@ Tensor quantized_binary_op_tensor( Tensor& binary_op_tensor_( Tensor& self_arg, const Tensor& other_arg, - const c10::optional& alpha_arg, 
+ const std::optional& alpha_arg, const api::ShaderInfo& shader_descriptor) { TORCH_CHECK( get_dim(self_arg) >= get_dim(other_arg) && @@ -389,12 +389,12 @@ Tensor add_scalar( const Scalar& other, const Scalar& alpha) { return binary_op_scalar( - self_arg, other, c10::optional(alpha), VK_KERNEL(add_scalar)); + self_arg, other, std::optional(alpha), VK_KERNEL(add_scalar)); } Tensor& add_scalar_(Tensor& self, const Scalar& other, const Scalar& alpha) { return binary_op_scalar_( - self, other, c10::optional(alpha), VK_KERNEL(add_scalar_inplace)); + self, other, std::optional(alpha), VK_KERNEL(add_scalar_inplace)); } Tensor quantized_add( @@ -438,7 +438,7 @@ Tensor add_tensor( const Tensor& other_arg, const Scalar& alpha) { return binary_op_tensor( - self_arg, other_arg, c10::optional(alpha), VK_KERNEL(add)); + self_arg, other_arg, std::optional(alpha), VK_KERNEL(add)); } Tensor& add_tensor_( @@ -446,7 +446,7 @@ Tensor& add_tensor_( const Tensor& other_arg, const Scalar& alpha) { return binary_op_tensor_( - self, other_arg, c10::optional(alpha), VK_KERNEL(add_inplace)); + self, other_arg, std::optional(alpha), VK_KERNEL(add_inplace)); } Tensor sub_scalar( @@ -456,7 +456,7 @@ Tensor sub_scalar( return binary_op_scalar( self_arg, other, - c10::optional(-1 * alpha.to()), + std::optional(-1 * alpha.to()), VK_KERNEL(add_scalar)); } @@ -464,7 +464,7 @@ Tensor& sub_scalar_(Tensor& self, const Scalar& other, const Scalar& alpha) { return binary_op_scalar_( self, other, - c10::optional(-1 * alpha.to()), + std::optional(-1 * alpha.to()), VK_KERNEL(add_scalar_inplace)); } @@ -473,7 +473,7 @@ Tensor sub_tensor( const Tensor& other_arg, const Scalar& alpha) { return binary_op_tensor( - self_arg, other_arg, c10::optional(alpha), VK_KERNEL(sub)); + self_arg, other_arg, std::optional(alpha), VK_KERNEL(sub)); } Tensor& sub_tensor_( @@ -481,34 +481,34 @@ Tensor& sub_tensor_( const Tensor& other_arg, const Scalar& alpha) { return binary_op_tensor_( - self, other_arg, c10::optional(alpha), VK_KERNEL(sub_inplace)); + self, other_arg, std::optional(alpha), VK_KERNEL(sub_inplace)); } Tensor mul_scalar(const Tensor& self_arg, const Scalar& other) { return binary_op_scalar( - self_arg, other, c10::optional(), VK_KERNEL(mul_scalar)); + self_arg, other, std::optional(), VK_KERNEL(mul_scalar)); } Tensor& mul_scalar_(Tensor& self, const Scalar& other) { return binary_op_scalar_( - self, other, c10::optional(), VK_KERNEL(mul_scalar_inplace)); + self, other, std::optional(), VK_KERNEL(mul_scalar_inplace)); } Tensor mul_tensor(const Tensor& self_arg, const Tensor& other_arg) { return binary_op_tensor( - self_arg, other_arg, c10::optional(), VK_KERNEL(mul)); + self_arg, other_arg, std::optional(), VK_KERNEL(mul)); } Tensor& mul_tensor_(Tensor& self, const Tensor& other_arg) { return binary_op_tensor_( - self, other_arg, c10::optional(), VK_KERNEL(mul_inplace)); + self, other_arg, std::optional(), VK_KERNEL(mul_inplace)); } Tensor div_scalar(const Tensor& self_arg, const Scalar& other) { return binary_op_scalar( self_arg, 1.0 / other.to(), - c10::optional(), + std::optional(), VK_KERNEL(mul_scalar)); } @@ -516,45 +516,45 @@ Tensor& div_scalar_(Tensor& self, const Scalar& other) { return binary_op_scalar_( self, 1.0 / other.to(), - c10::optional(), + std::optional(), VK_KERNEL(mul_scalar_inplace)); } Tensor div_tensor(const Tensor& self_arg, const Tensor& other_arg) { return binary_op_tensor( - self_arg, other_arg, c10::optional(), VK_KERNEL(div)); + self_arg, other_arg, std::optional(), VK_KERNEL(div)); } Tensor& 
div_tensor_(Tensor& self, const Tensor& other_arg) { return binary_op_tensor_( - self, other_arg, c10::optional(), VK_KERNEL(div_inplace)); + self, other_arg, std::optional(), VK_KERNEL(div_inplace)); } Tensor pow(const Tensor& self, const Tensor& other) { - return binary_op_tensor(self, other, c10::optional(), VK_KERNEL(pow)); + return binary_op_tensor(self, other, std::optional(), VK_KERNEL(pow)); } Tensor& pow_(Tensor& self, const Tensor& other) { return binary_op_tensor_( - self, other, c10::optional(), VK_KERNEL(pow_inplace)); + self, other, std::optional(), VK_KERNEL(pow_inplace)); } Tensor pow_tensor_scalar(const Tensor& self, const Scalar& other) { return binary_op_scalar( - self, other, c10::optional(), VK_KERNEL(pow_tensor_scalar)); + self, other, std::optional(), VK_KERNEL(pow_tensor_scalar)); } Tensor& pow_tensor_scalar_(Tensor& self, const Scalar& other) { return binary_op_scalar_( self, other, - c10::optional(), + std::optional(), VK_KERNEL(pow_tensor_scalar_inplace)); } Tensor pow_scalar_tensor(const Scalar& self, const Tensor& other) { return binary_op_scalar( - other, self, c10::optional(), VK_KERNEL(pow_scalar_tensor)); + other, self, std::optional(), VK_KERNEL(pow_scalar_tensor)); } Tensor floor_divide_scalar(const Tensor& self, const Scalar& other) { @@ -563,7 +563,7 @@ Tensor floor_divide_scalar(const Tensor& self, const Scalar& other) { return binary_op_scalar( self, 1.0 / other.to(), - c10::optional(), + std::optional(), VK_KERNEL(floor_mul_scalar)); } @@ -573,20 +573,20 @@ Tensor& floor_divide_scalar_(Tensor& self, const Scalar& other) { return binary_op_scalar_( self, 1.0 / other.to(), - c10::optional(), + std::optional(), VK_KERNEL(floor_mul_scalar_inplace)); } Tensor floor_divide_tensor(const Tensor& self, const Tensor& other) { return binary_op_tensor( - self, other, c10::optional(), VK_KERNEL(floor_divide)); + self, other, std::optional(), VK_KERNEL(floor_divide)); } Tensor& floor_divide_tensor_(Tensor& self, const Tensor& other_arg) { return binary_op_tensor_( self, other_arg, - c10::optional(), + std::optional(), VK_KERNEL(floor_divide_inplace)); } diff --git a/aten/src/ATen/native/vulkan/ops/Clamp.cpp b/aten/src/ATen/native/vulkan/ops/Clamp.cpp index 3cc4dd3d3c4bc..e336b01323666 100644 --- a/aten/src/ATen/native/vulkan/ops/Clamp.cpp +++ b/aten/src/ATen/native/vulkan/ops/Clamp.cpp @@ -11,8 +11,8 @@ using namespace api::utils; Tensor _clamp( const Tensor& self_arg, - const c10::optional& min, - const c10::optional& max, + const std::optional& min, + const std::optional& max, const api::ShaderInfo& shader_descriptor) { TORCH_CHECK(min || max, "At least one of 'min' or 'max' must not be None"); @@ -96,15 +96,15 @@ Tensor _clamp( Tensor clamp( const Tensor& self_arg, - const c10::optional& min, - const c10::optional& max) { + const std::optional& min, + const std::optional& max) { return _clamp(self_arg, min, max, VK_KERNEL(clamp)); } Tensor& _clamp_( Tensor& self_arg, - const c10::optional& min, - const c10::optional& max, + const std::optional& min, + const std::optional& max, const api::ShaderInfo& shader_descriptor) { TORCH_CHECK(min || max, "At least one of 'min' or 'max' must not be None"); @@ -186,8 +186,8 @@ Tensor threshold( Tensor& clamp_( Tensor& self, - const c10::optional& min, - const c10::optional& max) { + const std::optional& min, + const std::optional& max) { return _clamp_(self, min, max, VK_KERNEL(clamp_)); } diff --git a/aten/src/ATen/native/vulkan/ops/Clone.cpp b/aten/src/ATen/native/vulkan/ops/Clone.cpp index 2601d785ddb52..3e9e611717257 
100644 --- a/aten/src/ATen/native/vulkan/ops/Clone.cpp +++ b/aten/src/ATen/native/vulkan/ops/Clone.cpp @@ -16,7 +16,7 @@ namespace { Tensor clone( const Tensor& src, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { auto memory_format = optional_memory_format.value_or(MemoryFormat::Preserve); TORCH_CHECK( (c10::MemoryFormat::Preserve == memory_format) || diff --git a/aten/src/ATen/native/vulkan/ops/Common.h b/aten/src/ATen/native/vulkan/ops/Common.h index 83cb45b163a2a..c74483f793c52 100644 --- a/aten/src/ATen/native/vulkan/ops/Common.h +++ b/aten/src/ATen/native/vulkan/ops/Common.h @@ -76,18 +76,18 @@ uint32_t get_dim(const vTensor& v_in) { return get_dim(v_in.sizes()); } -inline c10::optional get_optional_tensor( +inline std::optional get_optional_tensor( const c10::impl::GenericList& gen_list, const uint32_t idx) { return gen_list.get(idx).isTensor() ? gen_list.get(idx).toTensor() - : c10::optional(); + : std::optional(); } -inline c10::optional get_optional_scalar( +inline std::optional get_optional_scalar( const c10::impl::GenericList& gen_list, const uint32_t idx) { return gen_list.get(idx).isScalar() ? gen_list.get(idx).toScalar() - : c10::optional(); + : std::optional(); } inline float roundevenf(float v) { diff --git a/aten/src/ATen/native/vulkan/ops/Convolution.cpp b/aten/src/ATen/native/vulkan/ops/Convolution.cpp index 01dccac003011..f210c253800b1 100644 --- a/aten/src/ATen/native/vulkan/ops/Convolution.cpp +++ b/aten/src/ATen/native/vulkan/ops/Convolution.cpp @@ -245,7 +245,7 @@ at::Tensor rearrange_weights_2d(const Tensor& weight_in, bool tconv) { * taking each texel and arranging them along the x axis. */ at::Tensor rearrange_bias( - const c10::optional& bias_in, + const std::optional& bias_in, const at::Tensor& weight_in, bool tconv) { // If optional is empty, just return zeros @@ -543,7 +543,7 @@ vTensor pack_weights( } vTensor pack_biases( - const c10::optional& bias, + const std::optional& bias, const Tensor& weight, const bool transposed, const bool quantized) { @@ -629,7 +629,7 @@ bool weight_valid(const Tensor& weight, const bool quantized) { } bool bias_valid( - const c10::optional& bias, + const std::optional& bias, const Tensor& weight, const bool transposed, const bool quantized) { @@ -656,7 +656,7 @@ bool bias_valid( bool available( const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const IntArrayRef stride, const IntArrayRef padding, const IntArrayRef dilation, @@ -664,8 +664,8 @@ bool available( const bool quantized, const IntArrayRef /* output_padding */, const int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max) { + const std::optional& output_min, + const std::optional& output_max) { if (!weight_valid(weight, quantized)) { return false; } @@ -765,7 +765,7 @@ static inline std::vector get_conv_transpose_output_size( Tensor convolution( const Tensor& input, const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const IntArrayRef stride, const IntArrayRef padding, const IntArrayRef dilation, @@ -790,7 +790,7 @@ Tensor convolution( Tensor quantized_convolution( const Tensor& input, const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const IntArrayRef stride, const IntArrayRef padding, const IntArrayRef dilation, @@ -865,7 +865,7 @@ vTensor pack_weights_using_width_packing(const Tensor& weight_arg) { Tensor run_conv1d_context_impl( const Tensor& input_arg, const Tensor& weight_arg, - const c10::optional& bias_arg_opt, 
+ const std::optional& bias_arg_opt, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, @@ -962,7 +962,7 @@ Tensor run_conv1d_context_impl( Conv2dPackedContext::Conv2dPackedContext( const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const IntArrayRef stride_arg, const IntArrayRef padding_arg, const IntArrayRef dilation_arg, @@ -970,8 +970,8 @@ Conv2dPackedContext::Conv2dPackedContext( const bool quantized, const IntArrayRef output_padding_arg, const int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max) + const std::optional& output_min, + const std::optional& output_max) : unpacked_{c10::AnyType::get()} { const auto stride = expand_param_if_needed(stride_arg, "stride", 2); const auto padding = expand_param_if_needed(padding_arg, "padding", 2); @@ -1058,13 +1058,13 @@ Conv2dPackedContext Conv2dPackedContext::pack(c10::impl::GenericList unpacked) { c10::intrusive_ptr create_conv2d_context( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& stride, std::vector&& padding, std::vector&& dilation, const int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max) { + const std::optional& output_min, + const std::optional& output_max) { return c10::make_intrusive(Conv2dPackedContext( weight, bias, @@ -1081,14 +1081,14 @@ c10::intrusive_ptr create_conv2d_context( c10::intrusive_ptr create_tconv2d_context( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& stride, std::vector&& padding, std::vector&& output_padding, std::vector&& dilation, const int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max) { + const std::optional& output_min, + const std::optional& output_max) { return c10::make_intrusive(Conv2dPackedContext( weight, bias, @@ -1105,13 +1105,13 @@ c10::intrusive_ptr create_tconv2d_context( c10::intrusive_ptr create_qconv2d_context( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& stride, std::vector&& padding, std::vector&& dilation, const int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max) { + const std::optional& output_min, + const std::optional& output_max) { return c10::make_intrusive(Conv2dPackedContext( weight, bias, @@ -1128,14 +1128,14 @@ c10::intrusive_ptr create_qconv2d_context( c10::intrusive_ptr create_qtconv2d_context( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& stride, std::vector&& padding, std::vector&& output_padding, std::vector&& dilation, const int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max) { + const std::optional& output_min, + const std::optional& output_max) { return c10::make_intrusive(Conv2dPackedContext( weight, bias, @@ -1294,7 +1294,7 @@ Tensor run_qconv2d_context( Tensor quantized_conv2d( const Tensor& input, const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, @@ -1321,15 +1321,15 @@ Conv2dOpContext::Conv2dOpContext(Conv2dPackedContext conv_context) Conv2dOpContext Conv2dOpContext::create( const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const IntArrayRef stride_arg, const IntArrayRef padding_arg, const IntArrayRef dilation_arg, const bool transposed, const IntArrayRef output_padding_arg, const int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max) { + const std::optional& output_min, + const 
std::optional& output_max) { return Conv2dOpContext{Conv2dPackedContext( weight, bias, @@ -1367,13 +1367,13 @@ Conv2dOpContext::State Conv2dOpContext::unpack() const { c10::intrusive_ptr conv2d_clamp_prepack( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& stride, std::vector&& padding, std::vector&& dilation, const int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max) { + const std::optional& output_min, + const std::optional& output_max) { return c10::make_intrusive(Conv2dOpContext::create( std::move(weight), std::move(bias), @@ -1395,7 +1395,7 @@ Tensor conv2d_clamp_run( Conv1dPackedContext::Conv1dPackedContext( const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const IntArrayRef stride_arg, const IntArrayRef padding_arg, const IntArrayRef dilation_arg, @@ -1435,7 +1435,7 @@ Conv1dPackedContext Conv1dPackedContext::pack(c10::impl::GenericList unpacked) { c10::intrusive_ptr create_conv1d_context( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& stride, std::vector&& padding, std::vector&& dilation, @@ -1447,7 +1447,7 @@ c10::intrusive_ptr create_conv1d_context( Tensor convolution1d( const Tensor& input, const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const IntArrayRef stride, const IntArrayRef padding, const IntArrayRef dilation, @@ -1464,7 +1464,7 @@ Tensor run_conv1d_context( const c10::intrusive_ptr& context) { const Tensor weight = context->get_val(Conv1dPackedContext::Packed::Weight).toTensor(); - const c10::optional& bias_opt = + const std::optional& bias_opt = context->get_val(Conv1dPackedContext::Packed::Bias).toTensor(); const auto stride = context->get_val(Conv1dPackedContext::Packed::Stride).toIntVector(); diff --git a/aten/src/ATen/native/vulkan/ops/Convolution.h b/aten/src/ATen/native/vulkan/ops/Convolution.h index 1d51190b8cab5..84ace9526bbfc 100644 --- a/aten/src/ATen/native/vulkan/ops/Convolution.h +++ b/aten/src/ATen/native/vulkan/ops/Convolution.h @@ -21,7 +21,7 @@ namespace conv2d { Tensor rearrange_weights_dw(const Tensor& weight_in); Tensor rearrange_weights_2d(const Tensor& weight_in, bool tconv); Tensor rearrange_bias( - const c10::optional& bias_in, + const std::optional& bias_in, const at::Tensor& weight_in, bool tconv); @@ -60,7 +60,7 @@ class Conv2dPackedContext final : virtual public VulkanPackedContext, public: Conv2dPackedContext( const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const IntArrayRef stride_arg, const IntArrayRef padding_arg, const IntArrayRef dilation_arg, @@ -68,8 +68,8 @@ class Conv2dPackedContext final : virtual public VulkanPackedContext, const bool quantized, const IntArrayRef output_padding_arg, const int64_t groups, - const c10::optional& output_min = c10::nullopt, - const c10::optional& output_max = c10::nullopt); + const std::optional& output_min = c10::nullopt, + const std::optional& output_max = c10::nullopt); /* * Assigns a name to each index in the unpacked list. 
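
Note on the Convolution hunks above: parameters retyped as std::optional keep c10::nullopt as their default argument (for example output_min/output_max on Conv2dPackedContext and the create_*conv2d_context helpers), which builds only because the two optional spellings interoperate in this codebase. The snippet below is a minimal sketch of that optional clamp-bound convention, using plain float in place of at::Scalar; it is an illustration, not the Vulkan implementation.

#include <algorithm>
#include <iostream>
#include <optional>

// Minimal sketch, not PyTorch code: an absent bound means "do not clamp on
// that side", mirroring the fused conv+clamp contexts in the diff above.
float clamp_output(float v,
                   const std::optional<float>& output_min = std::nullopt,
                   const std::optional<float>& output_max = std::nullopt) {
  if (output_min.has_value()) v = std::max(v, *output_min);
  if (output_max.has_value()) v = std::min(v, *output_max);
  return v;
}

int main() {
  std::cout << clamp_output(7.5f) << '\n';              // 7.5 (no bounds set)
  std::cout << clamp_output(7.5f, 0.0f, 6.0f) << '\n';  // 6   (ReLU6-style bounds)
}
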
@@ -127,13 +127,13 @@ class Conv2dPackedContext final : virtual public VulkanPackedContext, c10::intrusive_ptr create_conv2d_context( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& stride, std::vector&& padding, std::vector&& dilation, const int64_t groups, - const c10::optional& output_min = c10::nullopt, - const c10::optional& output_max = c10::nullopt); + const std::optional& output_min = c10::nullopt, + const std::optional& output_max = c10::nullopt); Tensor run_conv2d_context( const Tensor& input, @@ -141,14 +141,14 @@ Tensor run_conv2d_context( c10::intrusive_ptr create_tconv2d_context( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& stride, std::vector&& padding, std::vector&& output_padding, std::vector&& dilation, const int64_t groups, - const c10::optional& output_min = c10::nullopt, - const c10::optional& output_max = c10::nullopt); + const std::optional& output_min = c10::nullopt, + const std::optional& output_max = c10::nullopt); Tensor run_tconv2d_context( const Tensor& input, @@ -156,13 +156,13 @@ Tensor run_tconv2d_context( c10::intrusive_ptr create_qconv2d_context( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& stride, std::vector&& padding, std::vector&& dilation, const int64_t groups, - const c10::optional& output_min = c10::nullopt, - const c10::optional& output_max = c10::nullopt); + const std::optional& output_min = c10::nullopt, + const std::optional& output_max = c10::nullopt); Tensor run_qconv2d_context( const Tensor& input_arg, @@ -172,39 +172,39 @@ Tensor run_qconv2d_context( c10::intrusive_ptr create_qtconv2d_context( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& stride, std::vector&& padding, std::vector&& output_padding, std::vector&& dilation, const int64_t groups, - const c10::optional& output_min = c10::nullopt, - const c10::optional& output_max = c10::nullopt); + const std::optional& output_min = c10::nullopt, + const std::optional& output_max = c10::nullopt); // Backwards compatibility class Conv2dOpContext final : public torch::jit::CustomClassHolder { public: static Conv2dOpContext create( const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool transposed, IntArrayRef output_padding, int64_t groups, - const c10::optional& output_min = c10::nullopt, - const c10::optional& output_max = c10::nullopt); + const std::optional& output_min = c10::nullopt, + const std::optional& output_max = c10::nullopt); using State = std::tuple< Tensor, - c10::optional, + std::optional, std::vector, std::vector, std::vector, int64_t, - c10::optional, - c10::optional>; + std::optional, + std::optional>; Tensor run(const Tensor& input) const; State unpack() const; @@ -220,13 +220,13 @@ Tensor conv2d_clamp_run( c10::intrusive_ptr conv2d_clamp_prepack( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& stride, std::vector&& padding, std::vector&& dilation, const int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max); + const std::optional& output_min, + const std::optional& output_max); class Conv1dPackedContext final : virtual public VulkanPackedContext, public torch::jit::CustomClassHolder { @@ -237,7 +237,7 @@ class Conv1dPackedContext final : virtual public VulkanPackedContext, public: Conv1dPackedContext( const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const IntArrayRef 
stride_arg, const IntArrayRef padding_arg, const IntArrayRef dilation_arg, @@ -287,7 +287,7 @@ class Conv1dPackedContext final : virtual public VulkanPackedContext, c10::intrusive_ptr create_conv1d_context( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& stride, std::vector&& padding, std::vector&& dilation, diff --git a/aten/src/ATen/native/vulkan/ops/Factory.cpp b/aten/src/ATen/native/vulkan/ops/Factory.cpp index b746868c238fd..afe82caed8f19 100644 --- a/aten/src/ATen/native/vulkan/ops/Factory.cpp +++ b/aten/src/ATen/native/vulkan/ops/Factory.cpp @@ -8,10 +8,10 @@ namespace ops { Tensor _empty_affine_quantized( const IntArrayRef sizes, - const c10::optional dtype, - const c10::optional layout, - const c10::optional device, - const c10::optional pin_memory, + const std::optional dtype, + const std::optional layout, + const std::optional device, + const std::optional pin_memory, const double scale, const int64_t zero_point, const optional memory_format) { @@ -30,10 +30,10 @@ Tensor _empty_affine_quantized( Tensor empty_memory_format( const IntArrayRef sizes, - const c10::optional dtype, - const c10::optional layout, - const c10::optional device, - const c10::optional pin_memory, + const std::optional dtype, + const std::optional layout, + const std::optional device, + const std::optional pin_memory, const optional memory_format) { api::StorageType storage_type = api::StorageType::TEXTURE_3D; return convert(vTensor{ diff --git a/aten/src/ATen/native/vulkan/ops/Factory.h b/aten/src/ATen/native/vulkan/ops/Factory.h index 9dee6307bb85c..9839ba2d84319 100644 --- a/aten/src/ATen/native/vulkan/ops/Factory.h +++ b/aten/src/ATen/native/vulkan/ops/Factory.h @@ -7,10 +7,10 @@ namespace ops { Tensor _empty_affine_quantized( const IntArrayRef sizes, - const c10::optional dtype, - const c10::optional layout, - const c10::optional device, - const c10::optional pin_memory, + const std::optional dtype, + const std::optional layout, + const std::optional device, + const std::optional pin_memory, const double scale, const int64_t zero_point, const optional memory_format); diff --git a/aten/src/ATen/native/vulkan/ops/Layernorm.cpp b/aten/src/ATen/native/vulkan/ops/Layernorm.cpp index cdca77f95fcaf..6b6a4b866c700 100644 --- a/aten/src/ATen/native/vulkan/ops/Layernorm.cpp +++ b/aten/src/ATen/native/vulkan/ops/Layernorm.cpp @@ -19,8 +19,8 @@ namespace vulkan { namespace ops { LayernormPackedContext::LayernormPackedContext( - const c10::optional& weight, - const c10::optional& bias, + const std::optional& weight, + const std::optional& bias, double eps) : unpacked_{c10::AnyType::get()} { packed_.reserve(ListArgs::kNumArgs); @@ -48,8 +48,8 @@ LayernormPackedContext LayernormPackedContext::pack( } c10::intrusive_ptr create_layernorm_context( - c10::optional&& weight, - c10::optional&& bias, + std::optional&& weight, + std::optional&& bias, double eps) { return c10::make_intrusive( LayernormPackedContext(weight, bias, eps)); @@ -61,10 +61,10 @@ Tensor run_layernorm_context( const c10::intrusive_ptr& layernorm_context) { const Tensor input = input_arg.is_vulkan() ? 
input_arg : input_arg.vulkan(); - const c10::optional& weight_opt = + const std::optional& weight_opt = layernorm_context->get_val(LayernormPackedContext::ListArgs::kWeight) .toTensor(); - const c10::optional& bias_opt = + const std::optional& bias_opt = layernorm_context->get_val(LayernormPackedContext::ListArgs::kBias) .toTensor(); const float eps = api::utils::safe_downcast( @@ -81,8 +81,8 @@ Tensor run_layernorm_context( Tensor layer_norm( const at::Tensor& input_arg, IntArrayRef normalized_shape, - const c10::optional& weight_opt /* optional */, - const c10::optional& bias_opt /* optional */, + const std::optional& weight_opt /* optional */, + const std::optional& bias_opt /* optional */, double eps, bool /* cudnn_enable, deprecated */) { return run_layernorm_context( diff --git a/aten/src/ATen/native/vulkan/ops/Layernorm.h b/aten/src/ATen/native/vulkan/ops/Layernorm.h index 39518bf63bc9f..881fd6ba9b36c 100644 --- a/aten/src/ATen/native/vulkan/ops/Layernorm.h +++ b/aten/src/ATen/native/vulkan/ops/Layernorm.h @@ -18,8 +18,8 @@ class LayernormPackedContext final : virtual public VulkanPackedContext, public: LayernormPackedContext( - const c10::optional& weight, - const c10::optional& bias, + const std::optional& weight, + const std::optional& bias, double eps); /* @@ -43,8 +43,8 @@ class LayernormPackedContext final : virtual public VulkanPackedContext, }; c10::intrusive_ptr create_layernorm_context( - c10::optional&& weight, - c10::optional&& bias, + std::optional&& weight, + std::optional&& bias, double eps); Tensor run_layernorm_context( diff --git a/aten/src/ATen/native/vulkan/ops/Mm.cpp b/aten/src/ATen/native/vulkan/ops/Mm.cpp index e5893e8172875..c4f4d6d0a6342 100644 --- a/aten/src/ATen/native/vulkan/ops/Mm.cpp +++ b/aten/src/ATen/native/vulkan/ops/Mm.cpp @@ -149,7 +149,7 @@ vTensor pack_weights(const Tensor& weight_arg, const bool use_batch = false) { vTensor pack_biases( const Tensor& weight_arg, - const c10::optional& bias_arg, + const std::optional& bias_arg, const bool use_batch = false) { if (bias_arg) { Tensor bias = *bias_arg; @@ -166,7 +166,7 @@ vTensor pack_biases( // removed in the future. 
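
The Vulkan pack_biases / rearrange_bias hunks above take the bias as an optional tensor and synthesize a value when it is absent (the earlier comment reads "If optional is empty, just return zeros"). Below is a small sketch of that convention with std::vector<float> standing in for the packed bias; the real packing logic is considerably more involved.

#include <cstddef>
#include <iostream>
#include <optional>
#include <vector>

// Sketch only (stand-in types, not the Vulkan implementation): use the bias
// when the caller provides one, otherwise fall back to zeros of the right size.
std::vector<float> pack_biases_sketch(
    const std::optional<std::vector<float>>& bias_arg,
    std::size_t out_channels) {
  if (bias_arg.has_value()) {
    return *bias_arg;                              // caller-provided bias
  }
  return std::vector<float>(out_channels, 0.0f);   // "just return zeros"
}

int main() {
  std::cout << pack_biases_sketch(std::nullopt, 4).size() << '\n';                 // 4
  std::cout << pack_biases_sketch(std::vector<float>{1.f, 2.f}, 4).at(1) << '\n';  // 2
}
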
vTensor pack_biases_quantized_weights( const Tensor& weight_arg, - const c10::optional& bias_arg, + const std::optional& bias_arg, const bool use_batch = false) { TORCH_CHECK( weight_arg.is_quantized(), @@ -291,7 +291,7 @@ vTensor pack_biases_quantized_weights( bool available_check_with_batch( const Tensor& weight, - const c10::optional& bias) { + const std::optional& bias) { const bool weight_available = (3 == weight.ndimension()) && (weight.size(Layout::BatchMatrices::batch) > 0) && (weight.size(Layout::BatchMatrices::height) > 0) && @@ -345,7 +345,7 @@ bool available_check_with_batch( bool available( const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const bool use_batch = false) { if (!api::available()) { return false; @@ -897,7 +897,7 @@ Tensor mm(const Tensor& mat1_arg, const Tensor& mat2_arg) { 1.0f, 1.0f, c10::make_intrusive( - LinearPackedContext(mat2_arg, c10::optional())), + LinearPackedContext(mat2_arg, std::optional())), false, 0, 0); @@ -909,7 +909,7 @@ Tensor bmm(const Tensor& mat1_arg, const Tensor& mat2_arg) { 1.0f, 1.0f, c10::make_intrusive(LinearPackedContext( - mat2_arg, c10::optional(), true /*use batch*/))); + mat2_arg, std::optional(), true /*use batch*/))); } Tensor baddbmm( @@ -941,7 +941,7 @@ TORCH_LIBRARY_IMPL(aten, Vulkan, m) { LinearPackedContext::LinearPackedContext( const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const bool use_batch) : unpacked_{c10::AnyType::get()} { TORCH_CHECK( @@ -974,7 +974,7 @@ LinearPackedContext LinearPackedContext::pack(c10::impl::GenericList unpacked) { c10::intrusive_ptr create_linear_context( Tensor&& weight, - c10::optional&& bias) { + std::optional&& bias) { return c10::make_intrusive( LinearPackedContext(weight, bias)); } diff --git a/aten/src/ATen/native/vulkan/ops/Mm.h b/aten/src/ATen/native/vulkan/ops/Mm.h index b4fcb31bc315c..99862913a65a0 100644 --- a/aten/src/ATen/native/vulkan/ops/Mm.h +++ b/aten/src/ATen/native/vulkan/ops/Mm.h @@ -61,7 +61,7 @@ class LinearPackedContext final : virtual public VulkanPackedContext, public: LinearPackedContext( const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const bool use_batch = false); /* @@ -97,7 +97,7 @@ class LinearPackedContext final : virtual public VulkanPackedContext, c10::intrusive_ptr create_linear_context( Tensor&& weight, - c10::optional&& bias); + std::optional&& bias); Tensor run_linear_context( const Tensor& input, diff --git a/aten/src/ATen/native/vulkan/ops/NativeLayerNorm.cpp b/aten/src/ATen/native/vulkan/ops/NativeLayerNorm.cpp index ffeb8c27c52b5..94d155cc2f647 100644 --- a/aten/src/ATen/native/vulkan/ops/NativeLayerNorm.cpp +++ b/aten/src/ATen/native/vulkan/ops/NativeLayerNorm.cpp @@ -12,8 +12,8 @@ using namespace api::utils; void _check_layer_norm_inputs( const at::Tensor& input, IntArrayRef normalized_shape, - const c10::optional& weight /* optional */, - const c10::optional& bias /* optional */) { + const std::optional& weight /* optional */, + const std::optional& bias /* optional */) { const auto normalized_ndim = normalized_shape.size(); TORCH_CHECK( normalized_ndim >= 1, @@ -55,8 +55,8 @@ void _check_layer_norm_inputs( std::tuple native_layer_norm( const at::Tensor& input_arg, IntArrayRef normalized_shape, - const c10::optional& weight_opt /* optional */, - const c10::optional& bias_opt /* optional */, + const std::optional& weight_opt /* optional */, + const std::optional& bias_opt /* optional */, double eps) { _check_layer_norm_inputs(input_arg, normalized_shape, 
weight_opt, bias_opt); diff --git a/aten/src/ATen/native/vulkan/ops/Pool.cpp b/aten/src/ATen/native/vulkan/ops/Pool.cpp index fab4f05b4a98b..8730cf660a43b 100644 --- a/aten/src/ATen/native/vulkan/ops/Pool.cpp +++ b/aten/src/ATen/native/vulkan/ops/Pool.cpp @@ -232,7 +232,7 @@ Tensor avg_pool2d( const IntArrayRef padding_arg, const bool ceil_mode, const bool /* count_include_pad */, - const c10::optional /* divisor_override */) { + const std::optional /* divisor_override */) { return pool2d( self_arg, kernel_arg, diff --git a/aten/src/ATen/native/vulkan/ops/QuantizedFunctions.h b/aten/src/ATen/native/vulkan/ops/QuantizedFunctions.h index b22a3aa05b819..d72ad00321043 100644 --- a/aten/src/ATen/native/vulkan/ops/QuantizedFunctions.h +++ b/aten/src/ATen/native/vulkan/ops/QuantizedFunctions.h @@ -52,7 +52,7 @@ Tensor quantized_div( Tensor quantized_conv2d( const Tensor& input_, const Tensor& weight, - const c10::optional& bias_opt, + const std::optional& bias_opt, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, @@ -63,8 +63,8 @@ Tensor quantized_conv2d( Tensor quantized_upsample_nearest2d( const Tensor& input_arg, const IntArrayRef output_sizes, - const c10::optional scales_h, - const c10::optional scales_w); + const std::optional scales_h, + const std::optional scales_w); } // namespace ops } // namespace vulkan diff --git a/aten/src/ATen/native/vulkan/ops/Random.cpp b/aten/src/ATen/native/vulkan/ops/Random.cpp index c266b10417039..3103f7fe6f58d 100644 --- a/aten/src/ATen/native/vulkan/ops/Random.cpp +++ b/aten/src/ATen/native/vulkan/ops/Random.cpp @@ -16,7 +16,7 @@ Tensor& uniform_( Tensor& self, const double from, const double to, - const c10::optional /* not implemented */) { + const std::optional /* not implemented */) { TORCH_CHECK( self.is_vulkan(), "Vulkan: In-place operator is only supported on Vulkan tensors."); @@ -59,11 +59,11 @@ Tensor& uniform_( Tensor rand_like( const at::Tensor& input_arg, - const c10::optional /* not implemented */, - const c10::optional /* not implemented */, - const c10::optional /* not implemented */, - const c10::optional /* not implemented */, - const c10::optional /* not implemented */) { + const std::optional /* not implemented */, + const std::optional /* not implemented */, + const std::optional /* not implemented */, + const std::optional /* not implemented */, + const std::optional /* not implemented */) { // Returns a tensor with the same size as input that is filled with random // numbers from a uniform distribution on the interval [0,1). 
To match the CPU // implementation, we simplify the range to [0,1] and tolerate the small @@ -75,7 +75,7 @@ Tensor& normal_( Tensor& self, const double mean, const double std, - const c10::optional /* not implemented */) { + const std::optional /* not implemented */) { TORCH_CHECK( self.is_vulkan(), "Vulkan: In-place operator is only supported on Vulkan tensors."); @@ -120,11 +120,11 @@ Tensor& normal_( Tensor randn_like( const at::Tensor& input_arg, - const c10::optional /* not implemented */, - const c10::optional /* not implemented */, - const c10::optional /* not implemented */, - const c10::optional /* not implemented */, - const c10::optional /* not implemented */) { + const std::optional /* not implemented */, + const std::optional /* not implemented */, + const std::optional /* not implemented */, + const std::optional /* not implemented */, + const std::optional /* not implemented */) { // Returns a tensor with the same size as input that is filled with random // numbers from a normal distribution with mean 0 and standard deviation 1. return input_arg.clone().detach().normal_(0.0, 1.0); diff --git a/aten/src/ATen/native/vulkan/ops/Slice.cpp b/aten/src/ATen/native/vulkan/ops/Slice.cpp index 7d7721bcb7b15..dad391e9a5ddd 100644 --- a/aten/src/ATen/native/vulkan/ops/Slice.cpp +++ b/aten/src/ATen/native/vulkan/ops/Slice.cpp @@ -232,8 +232,8 @@ Tensor slice_height( Tensor slice( const Tensor& self, int64_t dim, - c10::optional start, - c10::optional end, + std::optional start, + std::optional end, const int64_t step) { TORCH_CHECK(step > 0, "slice step must be positive"); auto nDims = safe_downcast(self.dim()); diff --git a/aten/src/ATen/native/vulkan/ops/Sum.cpp b/aten/src/ATen/native/vulkan/ops/Sum.cpp index 56eed26448dd5..6d8331caff215 100644 --- a/aten/src/ATen/native/vulkan/ops/Sum.cpp +++ b/aten/src/ATen/native/vulkan/ops/Sum.cpp @@ -132,7 +132,7 @@ Tensor sum_dim_IntList( return self; } -Tensor sum(const Tensor& self, const c10::optional dtype) { +Tensor sum(const Tensor& self, const std::optional dtype) { std::vector dims; for (int64_t d = 0; d < self.dim(); d++) { // If any dimension has zero elements, we will shortcut to a zero-dim. 
diff --git a/aten/src/ATen/native/vulkan/ops/Upsample.cpp b/aten/src/ATen/native/vulkan/ops/Upsample.cpp index 776d1e79ce705..7e3a2ead2d632 100644 --- a/aten/src/ATen/native/vulkan/ops/Upsample.cpp +++ b/aten/src/ATen/native/vulkan/ops/Upsample.cpp @@ -12,8 +12,8 @@ using namespace api::utils; Tensor upsample_nearest2d( const Tensor& input_arg, const IntArrayRef output_sizes, - const c10::optional scales_h, - const c10::optional scales_w) { + const std::optional scales_h, + const std::optional scales_w) { api::Context* const context = api::context(); TORCH_CHECK( @@ -98,8 +98,8 @@ Tensor upsample_bilinear2d( const Tensor& input_arg, const IntArrayRef output_sizes, bool align_corners, - const c10::optional scales_h, - const c10::optional scales_w) { + const std::optional scales_h, + const std::optional scales_w) { api::Context* const context = api::context(); TORCH_CHECK( diff --git a/aten/src/ATen/native/vulkan/ops/Zero.cpp b/aten/src/ATen/native/vulkan/ops/Zero.cpp index 5ceaae07cdc3e..fc903ad3f1e19 100644 --- a/aten/src/ATen/native/vulkan/ops/Zero.cpp +++ b/aten/src/ATen/native/vulkan/ops/Zero.cpp @@ -43,10 +43,10 @@ Tensor& zero_(at::Tensor& self) { Tensor zeros( const IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { TORCH_CHECK(size.size() <= 4, "Vulkan zeros supports up to 4d tensors"); // Get the global Vulkan context diff --git a/aten/src/ATen/native/vulkan/ops/cumsum.cpp b/aten/src/ATen/native/vulkan/ops/cumsum.cpp index c0e8a0c09362d..e6537fcc5acd5 100644 --- a/aten/src/ATen/native/vulkan/ops/cumsum.cpp +++ b/aten/src/ATen/native/vulkan/ops/cumsum.cpp @@ -87,7 +87,7 @@ void set_cumsum_kernel_params( Tensor cumsum( const at::Tensor& input_arg, const int64_t dim_arg, - const c10::optional dtype) { + const std::optional dtype) { TORCH_CHECK( input_arg.dim() >= 1 && input_arg.dim() <= 4, "Vulkan cumsum expects 1 <= input dimension <= 4, Tensor input dimensions ", diff --git a/aten/src/ATen/native/xnnpack/Convolution.cpp b/aten/src/ATen/native/xnnpack/Convolution.cpp index aaf42ea3ed3d3..504c6a363816c 100644 --- a/aten/src/ATen/native/xnnpack/Convolution.cpp +++ b/aten/src/ATen/native/xnnpack/Convolution.cpp @@ -170,7 +170,7 @@ const Tensor reorder_weights_for_transpose_conv(const Tensor& weight_nhwc, ContextConv2D create( const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const IntArrayRef padding, const IntArrayRef output_padding, const IntArrayRef stride, @@ -396,13 +396,13 @@ Tensor run( c10::intrusive_ptr createConv2dClampPrePackOpContext( Tensor weight, - c10::optional bias, + std::optional bias, std::vector stride, std::vector padding, std::vector dilation, int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max) { + const std::optional& output_min, + const std::optional& output_max) { return xnnpack::XNNPackConv2dOpContext::create_context( std::move(weight), std::move(bias), @@ -417,14 +417,14 @@ c10::intrusive_ptr c10::intrusive_ptr createConv2dTransposeClampPrePackOpContext( Tensor weight, - c10::optional bias, + std::optional bias, std::vector stride, std::vector padding, std::vector output_padding, std::vector dilation, int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max) { + const std::optional& output_min, + const std::optional& output_max) { return xnnpack::XNNPackTransposeConv2dOpContext::create_context( 
std::move(weight), std::move(bias), diff --git a/aten/src/ATen/native/xnnpack/Convolution.h b/aten/src/ATen/native/xnnpack/Convolution.h index 0df4a6bcd483d..0ec3f01f36bb6 100644 --- a/aten/src/ATen/native/xnnpack/Convolution.h +++ b/aten/src/ATen/native/xnnpack/Convolution.h @@ -12,25 +12,25 @@ namespace internal::convolution2d { c10::intrusive_ptr createConv2dClampPrePackOpContext( Tensor weight, - c10::optional bias, + std::optional bias, std::vector stride, std::vector padding, std::vector dilation, int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max); + const std::optional& output_min, + const std::optional& output_max); c10::intrusive_ptr createConv2dTransposeClampPrePackOpContext( Tensor weight, - c10::optional bias, + std::optional bias, std::vector stride, std::vector padding, std::vector output_padding, std::vector dilation, int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max); + const std::optional& output_min, + const std::optional& output_max); Tensor conv2d_clamp_run( const Tensor& input, @@ -45,7 +45,7 @@ Tensor conv2d_transpose_clamp_run( ContextConv2D create( const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const IntArrayRef padding, const IntArrayRef output_padding, const IntArrayRef stride, diff --git a/aten/src/ATen/native/xnnpack/Linear.cpp b/aten/src/ATen/native/xnnpack/Linear.cpp index dcab40ec17cfd..b1f4936625828 100644 --- a/aten/src/ATen/native/xnnpack/Linear.cpp +++ b/aten/src/ATen/native/xnnpack/Linear.cpp @@ -14,7 +14,7 @@ namespace { // TODO: Decouple and improve error handling and messages. bool available( const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const float output_min, const float output_max) { // XNNPACK @@ -65,7 +65,7 @@ Tensor create_and_run( ContextLinear create( const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const float output_min, const float output_max) { const Tensor weight_contig = weight.contiguous(); @@ -173,9 +173,9 @@ Tensor run( c10::intrusive_ptr createLinearClampPrePackOpContext( Tensor weight, - c10::optional bias, - const c10::optional& output_min, - const c10::optional& output_max) { + std::optional bias, + const std::optional& output_min, + const std::optional& output_max) { return xnnpack::XNNPackLinearOpContext::create_context( std::move(weight), std::move(bias), output_min, output_max); } diff --git a/aten/src/ATen/native/xnnpack/Linear.h b/aten/src/ATen/native/xnnpack/Linear.h index 32c9d93bf4533..9a16918ca0a99 100644 --- a/aten/src/ATen/native/xnnpack/Linear.h +++ b/aten/src/ATen/native/xnnpack/Linear.h @@ -11,9 +11,9 @@ namespace internal::linear { c10::intrusive_ptr createLinearClampPrePackOpContext( Tensor weight, - c10::optional bias, - const c10::optional& output_min, - const c10::optional& output_max); + std::optional bias, + const std::optional& output_min, + const std::optional& output_max); Tensor linear_clamp_run(const Tensor& input, const c10::intrusive_ptr& op_context); @@ -22,7 +22,7 @@ unpack_prepacked_sizes_linear(const IValue& ivalue); ContextLinear create( const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const float output_min, const float output_max); diff --git a/aten/src/ATen/native/xnnpack/OpContext.cpp b/aten/src/ATen/native/xnnpack/OpContext.cpp index 07f926cd8add5..71c40d1dccd7b 100644 --- a/aten/src/ATen/native/xnnpack/OpContext.cpp +++ b/aten/src/ATen/native/xnnpack/OpContext.cpp @@ -10,9 +10,9 @@ namespace 
at::native::xnnpack { c10::intrusive_ptr XNNPackLinearOpContext::create_context( at::Tensor&& weight, - c10::optional&& bias, - const c10::optional& output_min, - const c10::optional& output_max) { + std::optional&& bias, + const std::optional& output_min, + const std::optional& output_max) { auto linear_op_context = c10::make_intrusive( std::move(weight), @@ -46,13 +46,13 @@ Tensor XNNPackLinearOpContext::run(const Tensor& input) { c10::intrusive_ptr XNNPackConv2dOpContext::create_context(at::Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& padding, std::vector&& stride, std::vector&& dilation, int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max) { + const std::optional& output_min, + const std::optional& output_max) { auto op_context = xnnpack::internal::convolution2d::create( weight, @@ -89,14 +89,14 @@ XNNPackConv2dOpContext::create_context(at::Tensor&& weight, c10::intrusive_ptr XNNPackTransposeConv2dOpContext::create_context(at::Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& padding, std::vector&& output_padding, std::vector&& stride, std::vector&& dilation, int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max) { + const std::optional& output_min, + const std::optional& output_max) { auto op_context = xnnpack::internal::convolution2d::create( weight, diff --git a/aten/src/ATen/native/xnnpack/OpContext.h b/aten/src/ATen/native/xnnpack/OpContext.h index eecc8b11fad13..0aec38b102ff5 100644 --- a/aten/src/ATen/native/xnnpack/OpContext.h +++ b/aten/src/ATen/native/xnnpack/OpContext.h @@ -10,37 +10,37 @@ namespace at::native::xnnpack { using SerializationTypeLinearPrePack = std::tuple< Tensor, - c10::optional, - c10::optional, - c10::optional>; + std::optional, + std::optional, + std::optional>; using SerializationTypeConv2dPrePack = std::tuple< Tensor, - c10::optional, + std::optional, std::vector, std::vector, std::vector, int64_t, - c10::optional, - c10::optional>; + std::optional, + std::optional>; using SerializationTypeTransposeConv2dPrePack = std::tuple< Tensor, - c10::optional, + std::optional, std::vector, std::vector, std::vector, std::vector, int64_t, - c10::optional, - c10::optional>; + std::optional, + std::optional>; class LinearOpContext : public torch::jit::CustomClassHolder { protected: Tensor orig_weight_; - c10::optional orig_bias_; - c10::optional output_min_; - c10::optional output_max_; + std::optional orig_bias_; + std::optional output_min_; + std::optional output_max_; bool orig_weight_and_bias_freed_; public: @@ -60,9 +60,9 @@ class XNNPackLinearOpContext final : public LinearOpContext { public: XNNPackLinearOpContext( Tensor&& weight, - c10::optional&& bias, - const c10::optional& min, - const c10::optional& max, + std::optional&& bias, + const std::optional& min, + const std::optional& max, ContextLinear&& op_context) : op_context_(std::move(op_context)) { orig_weight_ = std::move(weight); @@ -77,21 +77,21 @@ class XNNPackLinearOpContext final : public LinearOpContext { static c10::intrusive_ptr create_context( Tensor&& weight, - c10::optional&& bias, - const c10::optional& output_min, - const c10::optional& output_max); + std::optional&& bias, + const std::optional& output_min, + const std::optional& output_max); }; class Conv2dOpContext : public torch::jit::CustomClassHolder { protected: Tensor orig_weight_; - c10::optional orig_bias_; + std::optional orig_bias_; std::vector stride_; std::vector padding_; std::vector dilation_; int64_t 
groups_; - c10::optional output_min_; - c10::optional output_max_; + std::optional output_min_; + std::optional output_max_; bool orig_weight_and_bias_freed_; public: @@ -115,14 +115,14 @@ class Conv2dOpContext : public torch::jit::CustomClassHolder { class TransposeConv2dOpContext : public torch::jit::CustomClassHolder { protected: Tensor orig_weight_; - c10::optional orig_bias_; + std::optional orig_bias_; std::vector stride_; std::vector padding_; std::vector output_padding_; std::vector dilation_; int64_t groups_; - c10::optional output_min_; - c10::optional output_max_; + std::optional output_min_; + std::optional output_max_; bool orig_weight_and_bias_freed_; public: @@ -158,13 +158,13 @@ class XNNPackConv2dOpContext final : public Conv2dOpContext { public: XNNPackConv2dOpContext( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& padding, std::vector&& stride, std::vector&& dilation, uint64_t groups, - const c10::optional& min, - const c10::optional& max, + const std::optional& min, + const std::optional& max, ContextConv2D&& op_context) : op_context_(std::move(op_context)) { orig_weight_ = std::move(weight); @@ -183,13 +183,13 @@ class XNNPackConv2dOpContext final : public Conv2dOpContext { static c10::intrusive_ptr create_context( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& padding, std::vector&& stride, std::vector&& dilation, int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max); + const std::optional& output_min, + const std::optional& output_max); }; class XNNPackTransposeConv2dOpContext final : public TransposeConv2dOpContext { @@ -206,14 +206,14 @@ class XNNPackTransposeConv2dOpContext final : public TransposeConv2dOpContext { public: XNNPackTransposeConv2dOpContext( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& padding, std::vector&& output_padding, std::vector&& stride, std::vector&& dilation, uint64_t groups, - const c10::optional& min, - const c10::optional& max, + const std::optional& min, + const std::optional& max, ContextConv2D&& op_context) : op_context_(std::move(op_context)) { orig_weight_ = std::move(weight); @@ -233,14 +233,14 @@ class XNNPackTransposeConv2dOpContext final : public TransposeConv2dOpContext { static c10::intrusive_ptr create_context( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& padding, std::vector&& output_padding, std::vector&& stride, std::vector&& dilation, int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max); + const std::optional& output_min, + const std::optional& output_max); }; } // namespace at::native::xnnpack diff --git a/aten/src/ATen/ops/from_blob.h b/aten/src/ATen/ops/from_blob.h index 8ebc01a922029..88089092c1fd7 100644 --- a/aten/src/ATen/ops/from_blob.h +++ b/aten/src/ATen/ops/from_blob.h @@ -31,7 +31,7 @@ class TORCH_API TensorMaker { return *this; } - TensorMaker& storage_offset(c10::optional value) noexcept { + TensorMaker& storage_offset(std::optional value) noexcept { storage_offset_ = value; return *this; @@ -50,7 +50,7 @@ class TORCH_API TensorMaker { return *this; } - TensorMaker& target_device(c10::optional value) noexcept { + TensorMaker& target_device(std::optional value) noexcept { device_ = value; return *this; @@ -91,10 +91,10 @@ class TORCH_API TensorMaker { void* data_; IntArrayRef sizes_; OptionalIntArrayRef strides_{}; - c10::optional storage_offset_{}; + std::optional storage_offset_{}; std::function deleter_{}; 
std::unique_ptr ctx_{nullptr, detail::noopDelete}; - c10::optional device_{}; + std::optional device_{}; TensorOptions opts_{}; bool resizeable_{}; c10::Allocator* allocator_{}; @@ -110,7 +110,7 @@ inline Tensor from_blob( IntArrayRef strides, const std::function& deleter, const TensorOptions& options = {}, - const c10::optional target_device = c10::nullopt) { + const std::optional target_device = c10::nullopt) { return for_blob(data, sizes) .strides(strides) .deleter(deleter) @@ -126,7 +126,7 @@ inline Tensor from_blob( int64_t storage_offset, const std::function& deleter, const TensorOptions& options = {}, - const c10::optional target_device = c10::nullopt) { + const std::optional target_device = c10::nullopt) { return for_blob(data, sizes) .strides(strides) .storage_offset(storage_offset) @@ -141,7 +141,7 @@ inline Tensor from_blob( IntArrayRef sizes, std::function deleter, const TensorOptions& options = {}, - const c10::optional target_device = c10::nullopt) { + const std::optional target_device = c10::nullopt) { return for_blob(data, sizes) .deleter(std::move(deleter)) .options(options) diff --git a/aten/src/ATen/record_function.cpp b/aten/src/ATen/record_function.cpp index bc3a0ba517483..04743ff256ece 100644 --- a/aten/src/ATen/record_function.cpp +++ b/aten/src/ATen/record_function.cpp @@ -43,7 +43,7 @@ RecordFunctionCallbacks::iterator findCallback( return std::find_if(entries.begin(), entries.end(), match_handle); } -c10::optional extractCallback( +std::optional extractCallback( RecordFunctionCallbacks& entries, CallbackHandle handle) { auto it = findCallback(entries, handle); @@ -132,7 +132,7 @@ class CacheEntry { // The caller is expected to check `GlobalCallbackManager::get().version()' // and call CacheEntry::update() if necessary. StepCallbacks getActiveCallbacks(); - c10::optional getActiveCallbacksUnlessEmpty(); + std::optional getActiveCallbacksUnlessEmpty(); // Full rebuild. (E.g. 
during registration) void update(const std::vector& callbacks); @@ -174,7 +174,7 @@ class LocalCallbackManager { public: const RecordFunctionTLS& getTLS() const; StepCallbacks getActiveCallbacks(const RecordScope scope); - c10::optional getActiveCallbacksUnlessEmpty(const RecordScope scope); + std::optional getActiveCallbacksUnlessEmpty(const RecordScope scope); void setTLS(const RecordFunctionTLS& tls); void seed(uint32_t seed); @@ -310,7 +310,7 @@ StepCallbacks CacheEntry::getActiveCallbacks() { return active_callbacks_; } -c10::optional CacheEntry::getActiveCallbacksUnlessEmpty() { +std::optional CacheEntry::getActiveCallbacksUnlessEmpty() { getActiveCallbacksImpl(); if (C10_LIKELY(active_callbacks_.empty())) { return c10::nullopt; @@ -397,7 +397,7 @@ StepCallbacks LocalCallbackManager::getActiveCallbacks( return active_callbacks_[static_cast(scope)].getActiveCallbacks(); } -c10::optional LocalCallbackManager::getActiveCallbacksUnlessEmpty( +std::optional LocalCallbackManager::getActiveCallbacksUnlessEmpty( const RecordScope scope) { rebuildActiveCallbacksIfNeeded(); return active_callbacks_[static_cast(scope)].getActiveCallbacksUnlessEmpty(); @@ -585,25 +585,25 @@ size_t RecordFunction::num_outputs() const { fn_); } -c10::optional RecordFunction::operator_name() const { +std::optional RecordFunction::operator_name() const { return std::visit( c10::overloaded( - [&](const std::string&) -> c10::optional { + [&](const std::string&) -> std::optional { return c10::nullopt; }, - [](const schema_ref_t schema) -> c10::optional { + [](const schema_ref_t schema) -> std::optional { return schema.get().operator_name(); }), fn_); } -c10::optional RecordFunction::operator_schema() const { +std::optional RecordFunction::operator_schema() const { return std::visit( c10::overloaded( - [&](const std::string&) -> c10::optional { + [&](const std::string&) -> std::optional { return c10::nullopt; }, - [](const schema_ref_t schema) -> c10::optional { + [](const schema_ref_t schema) -> std::optional { return schema.get(); }), fn_); @@ -613,7 +613,7 @@ StepCallbacks getStepCallbacks(RecordScope scope) { return LocalCallbackManager::get().getActiveCallbacks(scope); } -c10::optional getStepCallbacksUnlessEmpty(RecordScope scope) { +std::optional getStepCallbacksUnlessEmpty(RecordScope scope) { return LocalCallbackManager::get().getActiveCallbacksUnlessEmpty(scope); } diff --git a/aten/src/ATen/record_function.h b/aten/src/ATen/record_function.h index c6f79289e6c21..014260fb220f8 100644 --- a/aten/src/ATen/record_function.h +++ b/aten/src/ATen/record_function.h @@ -433,10 +433,10 @@ struct TORCH_API RecordFunction { return handle_; } - c10::optional operator_name() const; + std::optional operator_name() const; // This method returns a copy of the FunctionSchema and can be expensive. 
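The operator_name()/operator_schema() accessors above return std::nullopt for the plain-string alternative and a value for the schema alternative, via std::visit over the stored variant. A small standard-library-only sketch of that shape, using stand-in types and a single generic lambda instead of c10::overloaded:

#include <optional>
#include <string>
#include <type_traits>
#include <variant>

// Stand-ins: the real code dispatches over an operator-name string vs. a
// FunctionSchema reference; here an int plays the "schema" role.
using NameOrSchema = std::variant<std::string, int>;

std::optional<int> maybe_schema_id(const NameOrSchema& fn) {
  return std::visit(
      [](const auto& alt) -> std::optional<int> {
        if constexpr (std::is_same_v<std::decay_t<decltype(alt)>, int>) {
          return alt;           // schema-like alternative: a value is available
        } else {
          return std::nullopt;  // plain-string alternative: nothing to return
        }
      },
      fn);
}
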
- c10::optional operator_schema() const; + std::optional operator_schema() const; void setHandle(RecordFunctionHandle handle) { handle_ = handle; @@ -521,7 +521,7 @@ struct TORCH_API RecordFunction { TORCH_API StepCallbacks getStepCallbacks(RecordScope scope); -TORCH_API c10::optional getStepCallbacksUnlessEmpty( +TORCH_API std::optional getStepCallbacksUnlessEmpty( RecordScope scope); namespace detail { diff --git a/aten/src/ATen/templates/RegisterBackendSelect.cpp b/aten/src/ATen/templates/RegisterBackendSelect.cpp index dcb5986ab69ed..3586e44da999b 100644 --- a/aten/src/ATen/templates/RegisterBackendSelect.cpp +++ b/aten/src/ATen/templates/RegisterBackendSelect.cpp @@ -23,7 +23,7 @@ namespace { ${backend_select_method_definitions} -bool is_pinned(const Tensor& self, c10::optional device) { +bool is_pinned(const Tensor& self, std::optional device) { // Only CPU tensors can be pinned if (!self.is_cpu()) { return false; @@ -33,7 +33,7 @@ bool is_pinned(const Tensor& self, c10::optional device) { return at::_ops::is_pinned::redispatch(_dk, self, device); } -at::Tensor _pin_memory(const Tensor& self, c10::optional device) { +at::Tensor _pin_memory(const Tensor& self, std::optional device) { TORCH_CHECK(self.device().is_cpu(), "cannot pin '", self.toString(), "' only dense CPU tensors can be pinned"); DispatchKeySet _dk = c10::DispatchKeySet(c10::computeDispatchKey(c10::nullopt, self.layout(), device.value_or(at::kCUDA))); if (self.is_nested()) { diff --git a/aten/src/ATen/templates/RegisterFunctionalization.cpp b/aten/src/ATen/templates/RegisterFunctionalization.cpp index fabc12a03fa9f..74d02be9f93d3 100644 --- a/aten/src/ATen/templates/RegisterFunctionalization.cpp +++ b/aten/src/ATen/templates/RegisterFunctionalization.cpp @@ -60,7 +60,7 @@ inline Tensor to_meta(const Tensor& t) { /*device=*/c10::make_optional(c10::Device(kMeta)), /*pin_memory=*/c10::nullopt); } -inline c10::optional to_meta(const c10::optional& t) { +inline std::optional to_meta(const c10::optional& t) { if (t.has_value()) { return c10::make_optional(to_meta(*t)); } diff --git a/aten/src/ATen/templates/TensorBody.h b/aten/src/ATen/templates/TensorBody.h index 010f12d4cfbce..1515442dd1f94 100644 --- a/aten/src/ATen/templates/TensorBody.h +++ b/aten/src/ATen/templates/TensorBody.h @@ -398,7 +398,7 @@ class TORCH_API Tensor: public TensorBase { /// // f requires grad, has no operation creating it /// @endcode - /// \fn void backward(const Tensor & gradient={}, c10::optional retain_graph=c10::nullopt, bool create_graph=false, c10::optional inputs=c10::nullopt) const; + /// \fn void backward(const Tensor & gradient={}, std::optional retain_graph=c10::nullopt, bool create_graph=false, c10::optional inputs=c10::nullopt) const; /// /// Computes the gradient of current tensor with respect to graph leaves. /// @@ -433,7 +433,7 @@ class TORCH_API Tensor: public TensorBase { /// the current implementation will call its grad_fn (even though it is not strictly needed to get this gradients). /// It is an implementation detail on which the user should not rely. /// See https://github.com/pytorch/pytorch/pull/60521#issuecomment-867061780 for more details. 
- void backward(const Tensor & gradient={}, c10::optional retain_graph=c10::nullopt, bool create_graph=false, c10::optional inputs=c10::nullopt) const { + void backward(const Tensor & gradient={}, std::optional retain_graph=c10::nullopt, bool create_graph=false, c10::optional inputs=c10::nullopt) const { // NB: Adding this wrapper to _backward here because we'd like our // 'backwards' api to accept the 'inputs' argument optionally. Since code gen // currently does not support optional of TensorList our approach is to replace @@ -626,7 +626,7 @@ class TORCH_API Tensor: public TensorBase { return TensorBase::data(); } - void _backward(TensorList inputs, const c10::optional& gradient, c10::optional keep_graph, bool create_graph) const; + void _backward(TensorList inputs, const std::optional& gradient, c10::optional keep_graph, bool create_graph) const; const Tensor& requires_grad_(bool _requires_grad=true) const { TensorBase::requires_grad_(_requires_grad); @@ -737,7 +737,7 @@ struct ExclusivelyOwnedTraits { namespace at { inline c10::MaybeOwned borrow_from_optional_tensor( - const c10::optional& opt) { + const std::optional& opt) { return opt.has_value() ? c10::MaybeOwned::borrowed(*opt) : c10::MaybeOwned::owned(std::in_place); diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt index a1a6249414dea..09579b2367206 100644 --- a/aten/src/ATen/test/CMakeLists.txt +++ b/aten/src/ATen/test/CMakeLists.txt @@ -123,16 +123,6 @@ list(APPEND ATen_XPU_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/xpu_generator_test.cpp ) -# Caffe2 specific tests -if(BUILD_CAFFE2) - list(APPEND ATen_CPU_TEST_SRCS - ${CMAKE_CURRENT_SOURCE_DIR}/ExclusivelyOwned_test.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/tensor_interop_test.cpp) - list(APPEND ATen_CUDA_TEST_SRCS - ${CMAKE_CURRENT_SOURCE_DIR}/cuda_tensor_interop_test.cpp) -endif() - - # ---[ Send the lists to the parent scope. 
set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE) set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE) diff --git a/aten/src/ATen/test/cpu_rng_test.cpp b/aten/src/ATen/test/cpu_rng_test.cpp index ebc3eee12f3f6..593d78d47887f 100644 --- a/aten/src/ATen/test/cpu_rng_test.cpp +++ b/aten/src/ATen/test/cpu_rng_test.cpp @@ -22,10 +22,10 @@ struct TestCPUGenerator : public c10::GeneratorImpl { ~TestCPUGenerator() override = default; uint32_t random() { return value_; } uint64_t random64() { return value_; } - c10::optional next_float_normal_sample() { return next_float_normal_sample_; } - c10::optional next_double_normal_sample() { return next_double_normal_sample_; } - void set_next_float_normal_sample(c10::optional randn) { next_float_normal_sample_ = randn; } - void set_next_double_normal_sample(c10::optional randn) { next_double_normal_sample_ = randn; } + std::optional next_float_normal_sample() { return next_float_normal_sample_; } + std::optional next_double_normal_sample() { return next_double_normal_sample_; } + void set_next_float_normal_sample(std::optional randn) { next_float_normal_sample_ = randn; } + void set_next_double_normal_sample(std::optional randn) { next_double_normal_sample_ = randn; } void set_current_seed(uint64_t seed) override { throw std::runtime_error("not implemented"); } void set_offset(uint64_t offset) override { throw std::runtime_error("not implemented"); } uint64_t get_offset() const override { throw std::runtime_error("not implemented"); } @@ -38,95 +38,95 @@ struct TestCPUGenerator : public c10::GeneratorImpl { static DeviceType device_type() { return DeviceType::CPU; } uint64_t value_; - c10::optional next_float_normal_sample_; - c10::optional next_double_normal_sample_; + std::optional next_float_normal_sample_; + std::optional next_double_normal_sample_; }; // ==================================================== Random ======================================================== -Tensor& random_(Tensor& self, c10::optional generator) { +Tensor& random_(Tensor& self, std::optional generator) { return at::native::templates::random_impl(self, generator); } -Tensor& random_from_to(Tensor& self, int64_t from, optional to, c10::optional generator) { +Tensor& random_from_to(Tensor& self, int64_t from, optional to, std::optional generator) { return at::native::templates::random_from_to_impl(self, from, to, generator); } -Tensor& random_to(Tensor& self, int64_t to, c10::optional generator) { +Tensor& random_to(Tensor& self, int64_t to, std::optional generator) { return random_from_to(self, 0, to, generator); } // ==================================================== Normal ======================================================== -Tensor& normal_(Tensor& self, double mean, double std, c10::optional gen) { +Tensor& normal_(Tensor& self, double mean, double std, std::optional gen) { return at::native::templates::normal_impl_(self, mean, std, gen); } -Tensor& normal_Tensor_float_out(const Tensor& mean, double std, c10::optional gen, Tensor& output) { +Tensor& normal_Tensor_float_out(const Tensor& mean, double std, std::optional gen, Tensor& output) { return at::native::templates::normal_out_impl(output, mean, std, gen); } -Tensor& normal_float_Tensor_out(double mean, const Tensor& std, c10::optional gen, Tensor& output) { +Tensor& normal_float_Tensor_out(double mean, const Tensor& std, std::optional gen, Tensor& output) { return at::native::templates::normal_out_impl(output, mean, std, gen); } -Tensor& normal_Tensor_Tensor_out(const Tensor& mean, const 
Tensor& std, c10::optional gen, Tensor& output) { +Tensor& normal_Tensor_Tensor_out(const Tensor& mean, const Tensor& std, std::optional gen, Tensor& output) { return at::native::templates::normal_out_impl(output, mean, std, gen); } -Tensor normal_Tensor_float(const Tensor& mean, double std, c10::optional gen) { +Tensor normal_Tensor_float(const Tensor& mean, double std, std::optional gen) { return at::native::templates::normal_impl(mean, std, gen); } -Tensor normal_float_Tensor(double mean, const Tensor& std, c10::optional gen) { +Tensor normal_float_Tensor(double mean, const Tensor& std, std::optional gen) { return at::native::templates::normal_impl(mean, std, gen); } -Tensor normal_Tensor_Tensor(const Tensor& mean, const Tensor& std, c10::optional gen) { +Tensor normal_Tensor_Tensor(const Tensor& mean, const Tensor& std, std::optional gen) { return at::native::templates::normal_impl(mean, std, gen); } // ==================================================== Uniform ======================================================= -Tensor& uniform_(Tensor& self, double from, double to, c10::optional generator) { +Tensor& uniform_(Tensor& self, double from, double to, std::optional generator) { return at::native::templates::uniform_impl_(self, from, to, generator); } // ==================================================== Cauchy ======================================================== -Tensor& cauchy_(Tensor& self, double median, double sigma, c10::optional generator) { +Tensor& cauchy_(Tensor& self, double median, double sigma, std::optional generator) { return at::native::templates::cauchy_impl_(self, median, sigma, generator); } // ================================================== LogNormal ======================================================= -Tensor& log_normal_(Tensor& self, double mean, double std, c10::optional gen) { +Tensor& log_normal_(Tensor& self, double mean, double std, std::optional gen) { return at::native::templates::log_normal_impl_(self, mean, std, gen); } // ================================================== Geometric ======================================================= -Tensor& geometric_(Tensor& self, double p, c10::optional gen) { +Tensor& geometric_(Tensor& self, double p, std::optional gen) { return at::native::templates::geometric_impl_(self, p, gen); } // ================================================== Exponential ===================================================== -Tensor& exponential_(Tensor& self, double lambda, c10::optional gen) { +Tensor& exponential_(Tensor& self, double lambda, std::optional gen) { return at::native::templates::exponential_impl_(self, lambda, gen); } // ================================================== Bernoulli ======================================================= -Tensor& bernoulli_Tensor(Tensor& self, const Tensor& p_, c10::optional gen) { +Tensor& bernoulli_Tensor(Tensor& self, const Tensor& p_, std::optional gen) { return at::native::templates::bernoulli_impl_(self, p_, gen); } -Tensor& bernoulli_float(Tensor& self, double p, c10::optional gen) { +Tensor& bernoulli_float(Tensor& self, double p, std::optional gen) { return at::native::templates::bernoulli_impl_(self, p, gen); } -Tensor& bernoulli_out(const Tensor& self, c10::optional gen, Tensor& result) { +Tensor& bernoulli_out(const Tensor& self, std::optional gen, Tensor& result) { return at::native::templates::bernoulli_out_impl(result, self, gen); } diff --git a/aten/src/ATen/test/cuda_distributions_test.cu b/aten/src/ATen/test/cuda_distributions_test.cu index 
82d3d7777bc23..dcb5c9cc19cf0 100644 --- a/aten/src/ATen/test/cuda_distributions_test.cu +++ b/aten/src/ATen/test/cuda_distributions_test.cu @@ -173,7 +173,7 @@ TEST(RandomPermutationTest, TestIslandShuffle) { bool shuffled2 = false; for (int i = 0; i < 100; i++) { cudaDeviceSynchronize(); - c10::optional gen = c10::nullopt; + std::optional gen = c10::nullopt; randperm_handle_duplicate_keys(keys, values, 8, 5, gen); cudaDeviceSynchronize(); std::vector slice1 = {values[0], values[1], values[2]}; diff --git a/aten/src/ATen/test/cuda_optional_test.cu b/aten/src/ATen/test/cuda_optional_test.cu index b35180d921e9f..be51a4cbe8c97 100644 --- a/aten/src/ATen/test/cuda_optional_test.cu +++ b/aten/src/ATen/test/cuda_optional_test.cu @@ -11,8 +11,8 @@ using namespace at; // optional in cuda files TEST(OptionalTest, OptionalTestCUDA) { if (!at::cuda::is_available()) return; - c10::optional trivially_destructible; - c10::optional> non_trivially_destructible; + std::optional trivially_destructible; + std::optional> non_trivially_destructible; ASSERT_FALSE(trivially_destructible.has_value()); ASSERT_FALSE(non_trivially_destructible.has_value()); diff --git a/aten/src/ATen/test/cuda_stream_test.cpp b/aten/src/ATen/test/cuda_stream_test.cpp index 77100482b5955..b6b3bf7f9e7de 100644 --- a/aten/src/ATen/test/cuda_stream_test.cpp +++ b/aten/src/ATen/test/cuda_stream_test.cpp @@ -408,7 +408,7 @@ TEST(TestStream, ExternalMultiThreadTest) { std::promise aToBProm; std::promise bToAProm; - c10::optional foundStream; + std::optional foundStream; std::thread threadA([&]() { at::cuda::CUDAGuard device_guard(0); diff --git a/aten/src/ATen/test/extension_backend_test.cpp b/aten/src/ATen/test/extension_backend_test.cpp index 4be68b1d0a710..3b2345f347d63 100644 --- a/aten/src/ATen/test/extension_backend_test.cpp +++ b/aten/src/ATen/test/extension_backend_test.cpp @@ -15,8 +15,8 @@ using namespace at; static int test_int; -Tensor empty_override(SymIntArrayRef size, c10::optional dtype, c10::optional layout, - c10::optional device, c10::optional pin_memory, c10::optional optional_memory_format) { +Tensor empty_override(SymIntArrayRef size, std::optional dtype, c10::optional layout, + std::optional device, c10::optional pin_memory, c10::optional optional_memory_format) { test_int = 1; auto tensor_impl = c10::make_intrusive( Storage( @@ -39,10 +39,10 @@ Tensor add_override(const Tensor & a, const Tensor & b , const Scalar& c) { Tensor empty_strided_override( IntArrayRef size, IntArrayRef stride, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return empty_override(fromIntArrayRefSlow(size), dtype, layout, device, pin_memory, c10::nullopt); } diff --git a/aten/src/ATen/test/operator_name_test.cpp b/aten/src/ATen/test/operator_name_test.cpp index 6d074572dd748..f670a434cb638 100644 --- a/aten/src/ATen/test/operator_name_test.cpp +++ b/aten/src/ATen/test/operator_name_test.cpp @@ -9,7 +9,7 @@ TEST(OperatorNameTest, SetNamespaceIfNotSetWithoutExistingNamespace) { EXPECT_TRUE(result); EXPECT_EQ(testName.name, "ns::operator"); EXPECT_EQ(testName.overload_name, "operator.overload"); - EXPECT_EQ(testName.getNamespace(), c10::optional("ns")); + EXPECT_EQ(testName.getNamespace(), std::optional("ns")); } TEST(OperatorNameTest, SetNamespaceIfNotSetWithExistingNamespace) { @@ -18,5 +18,5 @@ TEST(OperatorNameTest, SetNamespaceIfNotSetWithExistingNamespace) { EXPECT_FALSE(result); 
EXPECT_EQ(namespacedName.name, "already_namespaced::operator"); EXPECT_EQ(namespacedName.overload_name, "operator.overload"); - EXPECT_EQ(namespacedName.getNamespace(), c10::optional("already_namespaced")); + EXPECT_EQ(namespacedName.getNamespace(), std::optional("already_namespaced")); } diff --git a/aten/src/ATen/test/rng_test.h b/aten/src/ATen/test/rng_test.h index df04d340893fb..82b9c6d5a836e 100644 --- a/aten/src/ATen/test/rng_test.h +++ b/aten/src/ATen/test/rng_test.h @@ -68,14 +68,14 @@ void test_random_from_to(const at::Device& device) { constexpr auto uint64_max_val = std::numeric_limits::max(); std::vector froms; - std::vector> tos; + std::vector<::std::optional> tos; if constexpr (::std::is_same_v) { froms = { 0L }; tos = { 1L, - static_cast>(c10::nullopt) + static_cast<::std::optional>(c10::nullopt) }; } else if constexpr (::std::is_signed_v) { constexpr int64_t min_from = _min_from(); @@ -86,11 +86,11 @@ void test_random_from_to(const at::Device& device) { 42L }; tos = { - c10::optional(-42L), - c10::optional(0L), - c10::optional(42L), - c10::optional(max_to), - static_cast>(c10::nullopt) + ::std::optional(-42L), + ::std::optional(0L), + ::std::optional(42L), + ::std::optional(max_to), + static_cast<::std::optional>(c10::nullopt) }; } else { froms = { @@ -98,9 +98,9 @@ void test_random_from_to(const at::Device& device) { 42L }; tos = { - c10::optional(42L), - c10::optional(max_to), - static_cast>(c10::nullopt) + ::std::optional(42L), + ::std::optional(max_to), + static_cast<::std::optional>(c10::nullopt) }; } @@ -116,7 +116,7 @@ void test_random_from_to(const at::Device& device) { bool from_to_case_covered = false; bool from_case_covered = false; for (const int64_t from : froms) { - for (const c10::optional to : tos) { + for (const ::std::optional to : tos) { if (!to.has_value() || from < *to) { for (const uint64_t val : vals) { auto gen = at::make_generator(val); diff --git a/aten/src/ATen/test/type_test.cpp b/aten/src/ATen/test/type_test.cpp index 3ea64a4da2124..955d60c586c0f 100644 --- a/aten/src/ATen/test/type_test.cpp +++ b/aten/src/ATen/test/type_test.cpp @@ -9,7 +9,7 @@ namespace c10 { TEST(TypeCustomPrinter, Basic) { TypePrinter printer = - [](const Type& t) -> c10::optional { + [](const Type& t) -> std::optional { if (auto tensorType = t.cast()) { return "CustomTensor"; } @@ -29,7 +29,7 @@ TEST(TypeCustomPrinter, Basic) { TEST(TypeCustomPrinter, ContainedTypes) { TypePrinter printer = - [](const Type& t) -> c10::optional { + [](const Type& t) -> std::optional { if (auto tensorType = t.cast()) { return "CustomTensor"; } @@ -53,7 +53,7 @@ TEST(TypeCustomPrinter, ContainedTypes) { TEST(TypeCustomPrinter, NamedTuples) { TypePrinter printer = - [](const Type& t) -> c10::optional { + [](const Type& t) -> std::optional { if (auto tupleType = t.cast()) { // Rewrite only NamedTuples if (tupleType->name()) { diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp index 5b6a31e0b5147..687691a370bf4 100644 --- a/aten/src/ATen/test/vulkan_api_test.cpp +++ b/aten/src/ATen/test/vulkan_api_test.cpp @@ -177,8 +177,8 @@ static void gen_all_subsets( static void slice_test( const std::vector& size, int64_t dim, - c10::optional start, - c10::optional end, + std::optional start, + std::optional end, int64_t step) { // Arrange const auto in_cpu = at::rand(size, at::device(at::kCPU).dtype(at::kFloat)); @@ -212,7 +212,7 @@ static void slice_tests(const std::unordered_map>& } } -static void clone_test(const std::vector& size, c10::optional 
optional_memory_format) { +static void clone_test(const std::vector& size, std::optional optional_memory_format) { // Arrange const auto in_cpu = at::rand(size, at::device(at::kCPU).dtype(at::kFloat)); const auto in_vulkan = in_cpu.vulkan(); @@ -249,7 +249,7 @@ inline std::vector callOpByName( const char* func_name, const char* overload_name, Args... args) { - const c10::optional op_handle = + const std::optional op_handle = c10::Dispatcher::singleton().findSchema({func_name, overload_name}); assert(op_handle.has_value()); return callOpByHandle(op_handle.value(), std::forward(args)...); @@ -7120,7 +7120,7 @@ TEST_F(VulkanAPITest, zeros) { TEST_F(VulkanAPITest, clone_success) { // Arrange - std::multimap, std::vector> mem2sizes { + std::multimap, std::vector> mem2sizes { {c10::MemoryFormat::Preserve, {2, 3, 5, 161}}, // 4D tensors with MemoryFormat::Preserve {c10::MemoryFormat::Contiguous, {2, 3, 5, 161}}, // 4D tensors with MemoryFormat::Contiguous {{}, {2, 3, 5, 161}}, // 4D tensors with null diff --git a/aten/src/ATen/test/vulkan_quantized_api_test.cpp b/aten/src/ATen/test/vulkan_quantized_api_test.cpp index 031154de17f85..cf243d5ce50c9 100644 --- a/aten/src/ATen/test/vulkan_quantized_api_test.cpp +++ b/aten/src/ATen/test/vulkan_quantized_api_test.cpp @@ -136,7 +136,7 @@ inline std::vector callOpByName( const char* func_name, const char* overload_name, Args... args) { - const c10::optional op_handle = + const std::optional op_handle = c10::Dispatcher::singleton().findSchema({func_name, overload_name}); assert(op_handle.has_value()); return callOpByHandle(op_handle.value(), std::forward(args)...); diff --git a/aten/src/ATen/xpu/CachingHostAllocator.cpp b/aten/src/ATen/xpu/CachingHostAllocator.cpp index 13cd1b6124a9b..332114a8715b7 100644 --- a/aten/src/ATen/xpu/CachingHostAllocator.cpp +++ b/aten/src/ATen/xpu/CachingHostAllocator.cpp @@ -20,7 +20,7 @@ struct XPUCachingHostAllocatorImpl } void record_stream( - c10::optional>& events, + std::optional>& events, XPUStream stream) override { XPUEvent event; event.record(stream); diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py index 1987a60f64fbb..096dbc48ec7da 100644 --- a/benchmarks/dynamo/common.py +++ b/benchmarks/dynamo/common.py @@ -108,6 +108,7 @@ current_onnx_compiler = "" current_batch_size = None output_filename = None +disable_output = False MAX_DOWNLOAD_ATTEMPTS = 5 @@ -306,6 +307,9 @@ def load_model_from_path(path_and_class_str): def output_csv(filename, headers, row): + global disable_output + if disable_output: + return if os.path.exists(filename): with open(filename) as fd: lines = list(csv.reader(fd)) or [[]] @@ -3212,6 +3216,11 @@ def get_example_inputs(self): "--output-directory", help="Overrides the directory to place output files.", ) + parser.add_argument( + "--disable-output", + action="store_true", + help="Disable writing of output files, e.g., for warm-up runs", + ) parser.add_argument( "--baseline", help="Compare with a prior --output", @@ -3391,6 +3400,7 @@ def get_example_inputs(self): ) group_latency.add_argument( "--warm-start-latency", + "--warm_start_latency", action="store_true", help="Run model(s) twice and preseve caches in between to enable a 'warm start' on the 2nd run", ) @@ -3610,10 +3620,11 @@ def main(runner, original_dir=None, args=None): cmd = [sys.executable] + sys.argv cmd.remove("--warm-start-latency") - print(f"Executing cold-start run for {args.only}") - subprocess.check_call(cmd, timeout=args.timeout, env=env) + print(f"Performing cold-start run for {args.only}") + 
warmup_cmd = cmd + ["--repeat=1", "--disable-output"] + subprocess.check_call(warmup_cmd, timeout=args.timeout, env=env) - print(f"Executing warm-start run for {args.only}") + print(f"Performing warm-start run for {args.only}") subprocess.check_call(cmd, timeout=args.timeout, env=env) else: # single process path just uses the main process @@ -3666,7 +3677,7 @@ def run(runner, args, original_dir=None): if args.ci: if args.accuracy: # Run fewer iterations when checking accuracy - args.repeat = 2 + args.repeat = min(args.repeat, 2) # Set translation validation on by default on CI accuracy runs. torch.fx.experimental._config.translation_validation = True @@ -3820,9 +3831,12 @@ def run(runner, args, original_dir=None): runner.skip_models.clear() experiment = null_experiment - global current_name, current_device, current_batch_size, output_filename, optimize_ctx, current_onnx_compiler + global current_name, current_device, current_batch_size, output_filename, disable_output, optimize_ctx, current_onnx_compiler optimize_ctx = contextlib.nullcontext() + if args.disable_output: + disable_output = True + if args.overhead: optimize_ctx = torch._dynamo.optimize(dummy_fx_compile, nopython=args.nopython) experiment = speedup_experiment diff --git a/benchmarks/transformer/score_mod.py b/benchmarks/transformer/score_mod.py index 0e9e8d11a35b9..2c5f41502f7ea 100644 --- a/benchmarks/transformer/score_mod.py +++ b/benchmarks/transformer/score_mod.py @@ -1,3 +1,4 @@ +import argparse import itertools from collections import defaultdict from dataclasses import asdict, dataclass @@ -98,7 +99,7 @@ def generate_inputs( return query, key, value -def run_single_experiment(config: ExperimentConfig) -> ExperimentResults: +def run_single_experiment(config: ExperimentConfig, dynamic=False) -> ExperimentResults: device = torch.device("cuda") query, key, value = generate_inputs( config.batch_size, @@ -113,7 +114,7 @@ def run_single_experiment(config: ExperimentConfig) -> ExperimentResults: def eager_sdpa(query, key, value, _): return F.scaled_dot_product_attention(query, key, value) - compiled_sdpa = torch.compile(_flex_attention) + compiled_sdpa = torch.compile(_flex_attention, dynamic=dynamic) score_mod = config.score_mod @@ -242,16 +243,26 @@ def generate_experiment_configs() -> List[ExperimentConfig]: return all_configs -def main(): +def main(dynamic=False): seed = 123 np.random.seed(seed) torch.manual_seed(seed) results = [] for config in tqdm(generate_experiment_configs()): - results.append(Experiment(config, run_single_experiment(config))) + results.append( + Experiment(config, run_single_experiment(config, dynamic=dynamic)) + ) print_results(results) if __name__ == "__main__": - main() + parser = argparse.ArgumentParser() + parser.add_argument( + "--dynamic", + action="store_true", + help="Runs a dynamic shapes version of compiled flex attention.", + ) + + args = parser.parse_args() + main(args.dynamic) diff --git a/binaries/CMakeLists.txt b/binaries/CMakeLists.txt index 70b235e43e7d7..273353128baaf 100644 --- a/binaries/CMakeLists.txt +++ b/binaries/CMakeLists.txt @@ -7,16 +7,6 @@ if(INTERN_BUILD_MOBILE) return() endif() -if(BUILD_CAFFE2) - caffe2_binary_target("at_launch_benchmark.cc") - target_include_directories(at_launch_benchmark PUBLIC - ${CMAKE_BINARY_DIR}/aten/src) - - caffe2_binary_target("intra_inter_benchmark.cc") - target_include_directories(intra_inter_benchmark PUBLIC - ${CMAKE_BINARY_DIR}/aten/src) -endif() - caffe2_binary_target("parallel_info.cc") target_include_directories(parallel_info PUBLIC 
${CMAKE_BINARY_DIR}/aten/src) # provides "ATen/TypeExtendedInterface.h" to ATen.h diff --git a/binaries/compare_models_torch.cc b/binaries/compare_models_torch.cc index 5e90445560bc7..c8338fe546a59 100644 --- a/binaries/compare_models_torch.cc +++ b/binaries/compare_models_torch.cc @@ -305,7 +305,7 @@ int main(int argc, char** argv) { torch::jit::GraphOptimizerEnabledGuard no_optimizer_guard(false); c10::CPUCachingAllocator caching_allocator; - c10::optional caching_allocator_guard; + std::optional caching_allocator_guard; if (FLAGS_use_caching_allocator) { caching_allocator_guard.emplace(&caching_allocator); } diff --git a/binaries/speed_benchmark_torch.cc b/binaries/speed_benchmark_torch.cc index b2c521e569b16..00b17ddd47488 100644 --- a/binaries/speed_benchmark_torch.cc +++ b/binaries/speed_benchmark_torch.cc @@ -294,7 +294,7 @@ int main(int argc, char** argv) { } c10::CPUCachingAllocator caching_allocator; - c10::optional caching_allocator_guard; + std::optional caching_allocator_guard; if (FLAGS_use_caching_allocator) { caching_allocator_guard.emplace(&caching_allocator); } diff --git a/buckbuild.bzl b/buckbuild.bzl index 89707dd9bc3f0..4c4fc9a89a280 100644 --- a/buckbuild.bzl +++ b/buckbuild.bzl @@ -279,7 +279,6 @@ def get_pt_preprocessor_flags(): "-D_THP_CORE", "-DUSE_SCALARS", "-DNO_CUDNN_DESTROY_HANDLE", - "-DBUILD_CAFFE2", ] if _is_build_mode_dev(): diff --git a/build_variables.bzl b/build_variables.bzl index d0d5857c2b3c9..6fd04b7701157 100644 --- a/build_variables.bzl +++ b/build_variables.bzl @@ -106,6 +106,7 @@ libtorch_profiler_sources = [ "torch/csrc/profiler/standalone/execution_trace_observer.cpp", "torch/csrc/profiler/standalone/itt_observer.cpp", "torch/csrc/profiler/standalone/nvtx_observer.cpp", + "torch/csrc/profiler/standalone/privateuse1_observer.cpp", "torch/csrc/profiler/stubs/base.cpp", "torch/csrc/profiler/orchestration/vulkan.cpp", "torch/csrc/profiler/perf.cpp", @@ -825,6 +826,7 @@ libtorch_python_core_sources = [ "torch/csrc/mtia/Module.cpp", "torch/csrc/inductor/aoti_runner/pybind.cpp", "torch/csrc/inductor/aoti_eager/kernel_holder.cpp", + "torch/csrc/inductor/aoti_eager/kernel_meta_info.cpp", "torch/csrc/jit/backends/backend_init.cpp", "torch/csrc/jit/python/init.cpp", "torch/csrc/jit/passes/onnx.cpp", @@ -1171,7 +1173,6 @@ aten_native_source_codegen_list = [ "aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp", "aten/src/ATen/native/cpu/FusedAdamKernel.cpp", "aten/src/ATen/native/cpu/FusedSGDKernel.cpp", - "aten/src/ATen/native/cpu/FusedAdagradKernel.cpp", ] # This aten native source file list will not go through aten codegen process @@ -1408,7 +1409,6 @@ aten_native_source_non_codegen_list = [ "aten/src/ATen/native/xnnpack/Shim.cpp", "aten/src/ATen/native/FusedAdam.cpp", "aten/src/ATen/native/FusedSGD.cpp", - "aten/src/ATen/native/FusedAdagrad.cpp", # Files not in native, but depends on native symbols # "aten/src/ATen/TensorIndexing.cpp", "aten/src/ATen/TensorIterator.cpp", diff --git a/c10/core/ConstantSymNodeImpl.h b/c10/core/ConstantSymNodeImpl.h index 4df1d1010f807..3c0fb66f7469f 100644 --- a/c10/core/ConstantSymNodeImpl.h +++ b/c10/core/ConstantSymNodeImpl.h @@ -69,14 +69,14 @@ class C10_API ConstantSymNodeImpl : public SymNodeImpl { return ::std::get(value_) ? 
"true" : "false"; } } - c10::optional constant_int() override { + std::optional constant_int() override { if constexpr (is_int_()) { return ::std::get(value_); } else { return c10::nullopt; } } - c10::optional constant_bool() override { + std::optional constant_bool() override { if constexpr (is_bool_()) { return ::std::get(value_); } else { diff --git a/c10/core/StorageImpl.cpp b/c10/core/StorageImpl.cpp index dc36064ddca4e..9dd6f5f431316 100644 --- a/c10/core/StorageImpl.cpp +++ b/c10/core/StorageImpl.cpp @@ -68,7 +68,7 @@ c10::intrusive_ptr make_storage_impl( c10::DataPtr data_ptr, c10::Allocator* allocator, bool resizable, - c10::optional device_opt) { + std::optional device_opt) { // This will be non-nullptr only when there is a custom StorageImpl // constructor for the given device c10::StorageImplCreateHelper fptr = nullptr; diff --git a/c10/core/StorageImpl.h b/c10/core/StorageImpl.h index 4ee9f62e620f5..abe6218fbc941 100644 --- a/c10/core/StorageImpl.h +++ b/c10/core/StorageImpl.h @@ -325,6 +325,6 @@ C10_API c10::intrusive_ptr make_storage_impl( c10::DataPtr data_ptr, c10::Allocator* allocator, bool resizable, - c10::optional device_opt); + std::optional device_opt); } // namespace c10 diff --git a/c10/core/SymBool.h b/c10/core/SymBool.h index cf984611e2340..9f9f141293a37 100644 --- a/c10/core/SymBool.h +++ b/c10/core/SymBool.h @@ -34,7 +34,7 @@ class C10_API SymBool { SymNode wrap_node(const SymNode& base) const; bool expect_bool() const { - c10::optional c = maybe_as_bool(); + std::optional c = maybe_as_bool(); TORCH_CHECK(c.has_value()); return *c; } @@ -66,7 +66,7 @@ class C10_API SymBool { return data_; } - c10::optional maybe_as_bool() const { + std::optional maybe_as_bool() const { if (!is_heap_allocated()) { return c10::make_optional(data_); } diff --git a/c10/core/SymInt.h b/c10/core/SymInt.h index 79ce4054b8640..025c351334a01 100644 --- a/c10/core/SymInt.h +++ b/c10/core/SymInt.h @@ -229,7 +229,7 @@ class C10_API SymInt { return data_; } - c10::optional maybe_as_int() const { + std::optional maybe_as_int() const { if (!is_heap_allocated()) { return c10::make_optional(data_); } diff --git a/c10/core/SymIntArrayRef.h b/c10/core/SymIntArrayRef.h index 76137aa47bdbb..760f4ba4e79a2 100644 --- a/c10/core/SymIntArrayRef.h +++ b/c10/core/SymIntArrayRef.h @@ -19,7 +19,7 @@ inline at::IntArrayRef asIntArrayRefUnchecked(c10::SymIntArrayRef ar) { // allocate another buffer and write the integers into it. If you need it, // we can do it. But I don't think you need it. 
-inline c10::optional asIntArrayRefSlowOpt( +inline std::optional asIntArrayRefSlowOpt( c10::SymIntArrayRef ar) { for (const c10::SymInt& sci : ar) { if (sci.is_heap_allocated()) { diff --git a/c10/core/SymNodeImpl.h b/c10/core/SymNodeImpl.h index 0413b9ff28482..9ffab5065109e 100644 --- a/c10/core/SymNodeImpl.h +++ b/c10/core/SymNodeImpl.h @@ -30,61 +30,61 @@ class C10_API SymNodeImpl : public c10::intrusive_ptr_target { // these could be pure virtual when we implement LTC versions virtual bool is_int() { TORCH_CHECK(false, "NYI"); - }; + } virtual bool is_bool() { TORCH_CHECK(false, "NYI"); - }; + } virtual bool is_float() { TORCH_CHECK(false, "NYI"); - }; + } virtual bool is_nested_int() const { return false; - }; + } virtual SymNode add(const SymNode& other) { TORCH_CHECK(false, "NYI"); - }; + } virtual SymNode sub(const SymNode& other) { TORCH_CHECK(false, "NYI"); - }; + } virtual SymNode mul(const SymNode& other) { TORCH_CHECK(false, "NYI"); - }; + } virtual SymNode truediv(const SymNode& other) { TORCH_CHECK(false, "NYI"); - }; + } virtual SymNode pow(const SymNode& other) { TORCH_CHECK(false, "NYI"); - }; + } virtual SymNode floordiv(const SymNode& other) { TORCH_CHECK(false, "NYI"); - }; + } virtual SymNode mod(const SymNode& other) { TORCH_CHECK(false, "NYI"); - }; + } virtual SymNode eq(const SymNode& other) { TORCH_CHECK(false, "NYI"); - }; + } virtual SymNode ne(const SymNode& other) { TORCH_CHECK(false, "NYI"); - }; + } virtual SymNode gt(const SymNode& other) { TORCH_CHECK(false, "NYI"); - }; + } virtual SymNode lt(const SymNode& other) { TORCH_CHECK(false, "NYI"); - }; + } virtual SymNode le(const SymNode& other) { TORCH_CHECK(false, "NYI"); - }; + } virtual SymNode ge(const SymNode& other) { TORCH_CHECK(false, "NYI"); - }; + } virtual SymNode ceil() { TORCH_CHECK(false, "NYI"); - }; + } virtual SymNode floor() { TORCH_CHECK(false, "NYI"); - }; + } virtual SymNode neg() { TORCH_CHECK(false, "NYI"); }; @@ -188,19 +188,19 @@ class C10_API SymNodeImpl : public c10::intrusive_ptr_target { virtual std::string str() { TORCH_CHECK(false, "NYI"); }; - virtual c10::optional nested_int() { + virtual std::optional nested_int() { return c10::nullopt; } - virtual c10::optional nested_int_coeff() { + virtual std::optional nested_int_coeff() { return c10::nullopt; } - virtual c10::optional constant_int() { + virtual std::optional constant_int() { return c10::nullopt; } - virtual c10::optional constant_bool() { + virtual std::optional constant_bool() { return c10::nullopt; } - virtual c10::optional maybe_as_int() { + virtual std::optional maybe_as_int() { return c10::nullopt; } virtual bool is_constant() { diff --git a/c10/core/SymbolicShapeMeta.cpp b/c10/core/SymbolicShapeMeta.cpp index 04b2f8da832f4..62b03d36ec71c 100644 --- a/c10/core/SymbolicShapeMeta.cpp +++ b/c10/core/SymbolicShapeMeta.cpp @@ -28,7 +28,7 @@ SymbolicShapeMeta::SymbolicShapeMeta(const SymbolicShapeMeta& other) } // base, sizes, strides -static c10::optional< +static std::optional< std::tuple, std::vector>> normalize_sym_sizes_strides(SymIntArrayRef sizes, SymIntArrayRef strides) { // Look for a SymNode to dispatch on diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp index 320dc7796877e..47f83c78e5789 100644 --- a/c10/core/TensorImpl.cpp +++ b/c10/core/TensorImpl.cpp @@ -127,7 +127,7 @@ TensorImpl::TensorImpl( TensorImpl::TensorImpl( DispatchKeySet key_set, const caffe2::TypeMeta data_type, - c10::optional device_opt) + std::optional device_opt) : TensorImpl({}, key_set, data_type, device_opt) {} // 
NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) @@ -135,7 +135,7 @@ TensorImpl::TensorImpl( Storage&& storage, DispatchKeySet key_set, const caffe2::TypeMeta data_type, - c10::optional device_opt) + std::optional device_opt) : storage_(std::move(storage)), numel_(0), @@ -846,7 +846,7 @@ static void clone_symvec(SymIntArrayRef src, SymDimVector& dst) { void TensorImpl::set_sizes_and_strides( c10::SymIntArrayRef sizes, c10::SymIntArrayRef strides, - c10::optional storage_offset) { + std::optional storage_offset) { auto int_sizes = asIntArrayRefSlowOpt(sizes); auto int_strides = asIntArrayRefSlowOpt(strides); if (int_sizes && int_strides && diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index 3a74c8936297e..e49a66c916ffb 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -233,8 +233,8 @@ struct C10_API ExtraMeta { std::unique_ptr symbolic_shape_meta_ = nullptr; std::unique_ptr named_tensor_meta_ = nullptr; intrusive_ptr backend_meta_ = nullptr; - c10::optional custom_data_ptr_error_msg_ = c10::nullopt; - c10::optional custom_storage_error_msg_ = c10::nullopt; + std::optional custom_data_ptr_error_msg_ = c10::nullopt; + std::optional custom_storage_error_msg_ = c10::nullopt; ExtraMeta() = default; ExtraMeta(const ExtraMeta& other) { @@ -260,8 +260,8 @@ struct C10_API ExtraMeta { std::unique_ptr symbolic_shape_meta, std::unique_ptr named_tensor_meta, intrusive_ptr backend_meta, - c10::optional custom_data_ptr_error_msg = c10::nullopt, - c10::optional custom_storage_access_error_msg = c10::nullopt) + std::optional custom_data_ptr_error_msg = c10::nullopt, + std::optional custom_storage_access_error_msg = c10::nullopt) : symbolic_shape_meta_(std::move(symbolic_shape_meta)), named_tensor_meta_(std::move(named_tensor_meta)), backend_meta_(std::move(backend_meta)), @@ -528,7 +528,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { TensorImpl( DispatchKeySet, const caffe2::TypeMeta data_type, - c10::optional device_opt); + std::optional device_opt); // Legacy constructors so I don't have to go update call sites. 
// TODO: When Variable is added, delete these constructors @@ -543,7 +543,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { TensorImpl( DispatchKey dispatch_key, const caffe2::TypeMeta data_type, - c10::optional device_opt) + std::optional device_opt) : TensorImpl(DispatchKeySet(dispatch_key), data_type, device_opt) {} private: @@ -555,7 +555,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { Storage&& storage, DispatchKeySet, const caffe2::TypeMeta data_type, - c10::optional); + std::optional); public: TensorImpl(const TensorImpl&) = delete; @@ -1253,7 +1253,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { protected: c10::Device device_default() const { TORCH_CHECK(device_opt_.has_value(), "tensor does not have a device"); - // See NOTE [c10::optional operator usage in CUDA] + // See NOTE [std::optional operator usage in CUDA] return *device_opt_; } @@ -1687,7 +1687,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { } void release_storage_and_set_meta_custom_data_ptr_error_msg_( - c10::optional s) { + std::optional s) { storage_ = {}; set_storage_access_should_throw(); get_extra_meta().custom_data_ptr_error_msg_ = s; @@ -1737,7 +1737,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { void set_sizes_and_strides( c10::SymIntArrayRef sizes, c10::SymIntArrayRef strides, - c10::optional storage_offset = c10::nullopt); + std::optional storage_offset = c10::nullopt); // This is renamed to avoid breaking overload BC void generic_set_sizes_contiguous(c10::SymIntArrayRef sizes); void generic_set_sizes_contiguous(c10::IntArrayRef sizes) { @@ -1834,7 +1834,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { void set_sizes_and_strides( IntArrayRef new_size, IntArrayRef new_stride, - c10::optional storage_offset = c10::nullopt) { + std::optional storage_offset = c10::nullopt) { TORCH_CHECK( allow_tensor_metadata_change(), "set_sizes_and_strides ", @@ -2129,10 +2129,10 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { } private: - // See NOTE [c10::optional operator usage in CUDA] + // See NOTE [std::optional operator usage in CUDA] // We probably don't want to expose this publicly until // the note is addressed. - c10::optional device_opt() const { + std::optional device_opt() const { return device_opt_; } @@ -2146,7 +2146,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { TORCH_CHECK( device_opt_.has_value(), "device_type cannot be run on undefined Tensor"); - // See NOTE [c10::optional operator usage in CUDA] + // See NOTE [std::optional operator usage in CUDA] return (*device_opt_).type(); } @@ -2875,7 +2875,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // agree with the type meta in storage caffe2::TypeMeta data_type_; - // NOTE [c10::optional operator usage in CUDA] + // NOTE [std::optional operator usage in CUDA] // Our optional definition doesn't compile in .cu file if `value()` or // `operator->` are used. Instead, we always use `operator*`. // See https://github.com/pytorch/pytorch/issues/18496 for more info. @@ -2887,7 +2887,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // // INVARIANT: device_opt_ is only nullopt for undefined tensors // (which do not have a device.) 
- c10::optional device_opt_; + std::optional device_opt_; // default member initializers for bit-fields only available with -std=c++2a // or -std=gnu++2a diff --git a/c10/core/TensorOptions.h b/c10/core/TensorOptions.h index 765f474702ef7..d99005d3d28f8 100644 --- a/c10/core/TensorOptions.h +++ b/c10/core/TensorOptions.h @@ -24,28 +24,28 @@ namespace c10 { DispatchKey computeDispatchKey( - c10::optional dtype, - c10::optional layout, - c10::optional device); + std::optional dtype, + std::optional layout, + std::optional device); -inline ScalarType dtype_or_default(c10::optional dtype) { +inline ScalarType dtype_or_default(std::optional dtype) { return value_or_else(dtype, [] { return get_default_dtype_as_scalartype(); }); } inline caffe2::TypeMeta dtype_or_default( - c10::optional dtype) { + std::optional dtype) { return value_or_else(dtype, [] { return get_default_dtype(); }); } -inline Layout layout_or_default(c10::optional layout) { +inline Layout layout_or_default(std::optional layout) { return layout.value_or(kStrided); } -inline Device device_or_default(c10::optional device) { +inline Device device_or_default(std::optional device) { return value_or_else(device, [] { return Device(kCPU); }); } -inline bool pinned_memory_or_default(c10::optional pinned_memory) { +inline bool pinned_memory_or_default(std::optional pinned_memory) { return pinned_memory.value_or(false); } @@ -193,19 +193,19 @@ struct C10_API TensorOptions { /// Return a copy of `TensorOptions` with `device` set to the given one, or /// cleared if `device` is `nullopt`. C10_NODISCARD TensorOptions - device(c10::optional device) const noexcept { + device(std::optional device) const noexcept { TensorOptions r = *this; r.set_device(device); return r; } /// Return a copy of `TensorOptions` with `device` set to the given one. - /// (This overload ensures that variadic template c10::optional constructor + /// (This overload ensures that variadic template std::optional constructor /// for Device work correctly.) template C10_NODISCARD TensorOptions device(Args&&... args) const noexcept { return device( - c10::optional(std::in_place, std::forward(args)...)); + std::optional(std::in_place, std::forward(args)...)); } /// Return a copy of `TensorOptions`, but with device set to CUDA, and the @@ -220,7 +220,7 @@ struct C10_API TensorOptions { /// Return a copy of `TensorOptions` with `dtype` set to the given one. C10_NODISCARD TensorOptions - dtype(c10::optional dtype) const noexcept { + dtype(std::optional dtype) const noexcept { TensorOptions r = *this; r.set_dtype(dtype); return r; @@ -228,7 +228,7 @@ struct C10_API TensorOptions { // legacy function to support ScalarType C10_NODISCARD TensorOptions - dtype(c10::optional dtype) const noexcept { + dtype(std::optional dtype) const noexcept { TensorOptions r = *this; r.set_dtype(dtype); return r; @@ -244,7 +244,7 @@ struct C10_API TensorOptions { /// Sets the layout of the `TensorOptions`. C10_NODISCARD TensorOptions - layout(c10::optional layout) const noexcept { + layout(std::optional layout) const noexcept { TensorOptions r = *this; r.set_layout(layout); return r; @@ -252,7 +252,7 @@ struct C10_API TensorOptions { /// Sets the `requires_grad` property of the `TensorOptions`. 
C10_NODISCARD TensorOptions - requires_grad(c10::optional requires_grad) const noexcept { + requires_grad(std::optional requires_grad) const noexcept { TensorOptions r = *this; r.set_requires_grad(requires_grad); return r; @@ -260,7 +260,7 @@ struct C10_API TensorOptions { /// Sets the `pinned_memory` property on the `TensorOptions`. C10_NODISCARD TensorOptions - pinned_memory(c10::optional pinned_memory) const noexcept { + pinned_memory(std::optional pinned_memory) const noexcept { TensorOptions r = *this; r.set_pinned_memory(pinned_memory); return r; @@ -268,7 +268,7 @@ struct C10_API TensorOptions { /// Sets the `memory_format` property on `TensorOptions`. C10_NODISCARD TensorOptions - memory_format(c10::optional memory_format) const noexcept { + memory_format(std::optional memory_format) const noexcept { TensorOptions r = *this; r.set_memory_format(memory_format); return r; @@ -286,7 +286,7 @@ struct C10_API TensorOptions { /// Returns the device of the `TensorOptions`, or `c10::nullopt` if /// device is not specified. - c10::optional device_opt() const noexcept { + std::optional device_opt() const noexcept { return has_device_ ? c10::make_optional(device_) : c10::nullopt; } @@ -307,7 +307,7 @@ struct C10_API TensorOptions { /// Returns the dtype of the `TensorOptions`, or `c10::nullopt` if /// device is not specified. - c10::optional dtype_opt() const noexcept { + std::optional dtype_opt() const noexcept { return has_dtype_ ? c10::make_optional(dtype_) : c10::nullopt; } @@ -323,7 +323,7 @@ struct C10_API TensorOptions { /// Returns the layout of the `TensorOptions`, or `c10::nullopt` if /// layout is not specified. - c10::optional layout_opt() const noexcept { + std::optional layout_opt() const noexcept { return has_layout_ ? c10::make_optional(layout_) : c10::nullopt; } @@ -339,7 +339,7 @@ struct C10_API TensorOptions { /// Returns the `requires_grad` property of the `TensorOptions`, or /// `c10::nullopt` if `requires_grad` is not specified. - c10::optional requires_grad_opt() const noexcept { + std::optional requires_grad_opt() const noexcept { return has_requires_grad_ ? c10::make_optional(requires_grad_) : c10::nullopt; } @@ -379,7 +379,7 @@ struct C10_API TensorOptions { /// Returns the `pinned_memory` property of the `TensorOptions`, or /// `c10::nullopt` if `pinned_memory` is not specified. - c10::optional pinned_memory_opt() const noexcept { + std::optional pinned_memory_opt() const noexcept { return has_pinned_memory_ ? c10::make_optional(pinned_memory_) : c10::nullopt; } @@ -394,7 +394,7 @@ struct C10_API TensorOptions { /// Returns the `memory_layout` property of `TensorOptions, or /// `c10::nullopt` if `memory_format` is not specified. - c10::optional memory_format_opt() const noexcept { + std::optional memory_format_opt() const noexcept { return has_memory_format_ ? c10::make_optional(memory_format_) : c10::nullopt; } @@ -435,7 +435,7 @@ struct C10_API TensorOptions { // TODO remove after TensorOptions rationalization TensorOptions merge_memory_format( - c10::optional optional_memory_format) const noexcept { + std::optional optional_memory_format) const noexcept { TensorOptions merged = *this; if (optional_memory_format.has_value()) { merged.set_memory_format(*optional_memory_format); @@ -466,7 +466,7 @@ struct C10_API TensorOptions { // on temporaries.) /// Mutably set the device of `TensorOptions`. 
- void set_device(c10::optional device) & noexcept { + void set_device(std::optional device) & noexcept { if (device) { device_ = *device; has_device_ = true; @@ -476,7 +476,7 @@ struct C10_API TensorOptions { } /// Mutably set the dtype of `TensorOptions`. - void set_dtype(c10::optional dtype) & noexcept { + void set_dtype(std::optional dtype) & noexcept { if (dtype) { dtype_ = *dtype; has_dtype_ = true; @@ -486,7 +486,7 @@ struct C10_API TensorOptions { } // legacy function to support ScalarType - void set_dtype(c10::optional dtype) & noexcept { + void set_dtype(std::optional dtype) & noexcept { if (dtype) { dtype_ = scalarTypeToTypeMeta(*dtype); has_dtype_ = true; @@ -496,7 +496,7 @@ struct C10_API TensorOptions { } /// Mutably set the layout of `TensorOptions`. - void set_layout(c10::optional layout) & noexcept { + void set_layout(std::optional layout) & noexcept { if (layout) { layout_ = *layout; has_layout_ = true; @@ -506,7 +506,7 @@ struct C10_API TensorOptions { } /// Mutably set the `requires_grad` property of `TensorOptions`. - void set_requires_grad(c10::optional requires_grad) & noexcept { + void set_requires_grad(std::optional requires_grad) & noexcept { if (requires_grad) { requires_grad_ = *requires_grad; has_requires_grad_ = true; @@ -516,7 +516,7 @@ struct C10_API TensorOptions { } /// Mutably set the `pinned_memory` property of `TensorOptions`. - void set_pinned_memory(c10::optional pinned_memory) & noexcept { + void set_pinned_memory(std::optional pinned_memory) & noexcept { if (pinned_memory) { pinned_memory_ = *pinned_memory; has_pinned_memory_ = true; @@ -526,7 +526,7 @@ struct C10_API TensorOptions { } /// Mutably set the `memory_Format` property of `TensorOptions`. - void set_memory_format(c10::optional memory_format) & noexcept { + void set_memory_format(std::optional memory_format) & noexcept { if (memory_format) { memory_format_ = *memory_format; has_memory_format_ = true; @@ -544,7 +544,7 @@ struct C10_API TensorOptions { // // TODO: MemoryFormat is not implemented in this way - // NB: We didn't use c10::optional here, because then we can't pack + // NB: We didn't use std::optional here, because then we can't pack // the has_***_ boolean fields. Device device_ = at::kCPU; // 16-bit @@ -632,9 +632,9 @@ inline std::string toString(const TensorOptions& options) { // This is intended to be a centralized location by which we can determine // what an appropriate DispatchKey for a tensor is. inline DispatchKey computeDispatchKey( - c10::optional dtype, - c10::optional layout, - c10::optional device) { + std::optional dtype, + std::optional layout, + std::optional device) { const auto layout_ = layout_or_default(layout); const auto device_ = device_or_default(device); switch (layout_) { diff --git a/c10/core/impl/PyObjectSlot.h b/c10/core/impl/PyObjectSlot.h index b3a4b85f05e8e..518b0e63e4921 100644 --- a/c10/core/impl/PyObjectSlot.h +++ b/c10/core/impl/PyObjectSlot.h @@ -93,8 +93,8 @@ struct C10_API PyObjectSlot { // be properly treated as a nonhermetic PyObject. 
// // NB: this lives in header so that we can avoid actually creating the - // c10::optional - c10::optional check_pyobj( + // std::optional + std::optional check_pyobj( PyInterpreter* self_interpreter, bool ignore_hermetic_tls = false) const { // Note [Memory ordering on Python interpreter tag] diff --git a/c10/core/impl/TorchDispatchModeTLS.cpp b/c10/core/impl/TorchDispatchModeTLS.cpp index e558a70522aca..f1847cb005b4c 100644 --- a/c10/core/impl/TorchDispatchModeTLS.cpp +++ b/c10/core/impl/TorchDispatchModeTLS.cpp @@ -121,7 +121,7 @@ int64_t TorchDispatchModeTLS::stack_len() { return stack_len + infra_modes_len; } -const c10::optional> +const std::optional> TorchDispatchModeTLS::get_mode(TorchDispatchModeKey mode_key) { return torchDispatchModeState.infra_modes_[static_cast(mode_key)]; } @@ -145,7 +145,7 @@ void TorchDispatchModeTLS::set_mode( torchDispatchModeState.infra_modes_[static_cast(mode_key)] = mode; } -const c10::optional> +const std::optional> TorchDispatchModeTLS::unset_mode(TorchDispatchModeKey mode_key) { auto out = torchDispatchModeState.infra_modes_[static_cast(mode_key)]; torchDispatchModeState.infra_modes_[static_cast(mode_key)] = diff --git a/c10/core/impl/TorchDispatchModeTLS.h b/c10/core/impl/TorchDispatchModeTLS.h index d9ac8d8449b49..7179d52c35162 100644 --- a/c10/core/impl/TorchDispatchModeTLS.h +++ b/c10/core/impl/TorchDispatchModeTLS.h @@ -35,9 +35,9 @@ struct C10_API TorchDispatchModeTLS { int64_t idx); static int64_t stack_len(); - static const c10::optional> + static const std::optional> get_mode(TorchDispatchModeKey mode_key); - static const c10::optional> + static const std::optional> unset_mode(TorchDispatchModeKey mode_key); static void set_mode( const std::shared_ptr& mode, @@ -55,7 +55,7 @@ struct C10_API TorchDispatchModeTLS { // However, we only allow a single FakeTensorMode onto the stack at a time // (Pushing additional FakeTensorModes onto the stack is a no-op) std::array< - c10::optional>, + std::optional>, static_cast(TorchDispatchModeKey::NUM_MODE_KEYS)> infra_modes_; }; diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index 8af2c41dfab7e..2479f96ab30b5 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -550,7 +550,7 @@ struct ExpandableSegment { CUdeviceptr ptr_{}; size_t max_handles_{0}; size_t segment_size_; - std::vector> handles_; + std::vector> handles_; // devices on which this memory should be mapped in addition // to the device where the physical memory lives (device_). std::vector peers_; @@ -1107,6 +1107,26 @@ class DeviceCachingAllocator { .current; auto observers_local = oom_observers_; + size_t allocated_in_private_pools = 0; + auto get_size_block = [](const BlockPool& pool) { + size_t res = 0; + for (const auto& block : pool.blocks) { + res += block->size; + } + return res; + }; + for (const auto& p : graph_pools) { + allocated_in_private_pools += get_size_block(p.second->large_blocks); + allocated_in_private_pools += get_size_block(p.second->small_blocks); + } + + std::string private_pool_msg; + + if (allocated_in_private_pools > 0) { + private_pool_msg = "with " + format_size(allocated_in_private_pools) + + " allocated in private pools (e.g., CUDA Graphs), "; + } + // Make sure we do not have the device lock before calling our // observers which might need hold the GIL // It is safe to release at this point because will no longer @@ -1153,9 +1173,12 @@ class DeviceCachingAllocator { " is free. 
", proc_info, "Of the allocated memory ", - format_size(allocated_bytes), - " is allocated by PyTorch, and ", - format_size(reserved_bytes - allocated_bytes), + format_size(allocated_bytes + allocated_in_private_pools), + " is allocated by PyTorch, ", + private_pool_msg, + "and ", + format_size( + reserved_bytes - allocated_bytes - allocated_in_private_pools), " is reserved by PyTorch but unallocated.", " If reserved but unallocated memory is large try setting", " PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid" diff --git a/c10/cuda/CUDAFunctions.cpp b/c10/cuda/CUDAFunctions.cpp index 652f222385465..2b53eb4d7c7cb 100644 --- a/c10/cuda/CUDAFunctions.cpp +++ b/c10/cuda/CUDAFunctions.cpp @@ -151,7 +151,7 @@ void warn_or_error_on_sync() { } } -c10::optional getDeviceIndexWithPrimaryContext() { +std::optional getDeviceIndexWithPrimaryContext() { // check current device first auto current_device_index = current_device(); if (current_device_index >= 0) { diff --git a/c10/cuda/CUDAFunctions.h b/c10/cuda/CUDAFunctions.h index 72fdfc6fd692f..192fafbad10f4 100644 --- a/c10/cuda/CUDAFunctions.h +++ b/c10/cuda/CUDAFunctions.h @@ -111,6 +111,6 @@ C10_CUDA_API void __inline__ stream_synchronize(cudaStream_t stream) { } C10_CUDA_API bool hasPrimaryContext(DeviceIndex device_index); -C10_CUDA_API c10::optional getDeviceIndexWithPrimaryContext(); +C10_CUDA_API std::optional getDeviceIndexWithPrimaryContext(); } // namespace c10::cuda diff --git a/c10/cuda/impl/CUDAGuardImpl.h b/c10/cuda/impl/CUDAGuardImpl.h index 113f896c6fa29..ec50c8152b33e 100644 --- a/c10/cuda/impl/CUDAGuardImpl.h +++ b/c10/cuda/impl/CUDAGuardImpl.h @@ -40,7 +40,7 @@ struct CUDAGuardImpl final : public c10::impl::DeviceGuardImplInterface { C10_CUDA_CHECK(c10::cuda::GetDevice(&device)); return Device(DeviceType::CUDA, device); } - c10::optional uncheckedGetDevice() const noexcept { + std::optional uncheckedGetDevice() const noexcept { DeviceIndex device{-1}; const auto err = C10_CUDA_ERROR_HANDLED(c10::cuda::GetDevice(&device)); C10_CUDA_CHECK_WARN(err); diff --git a/c10/test/util/optional_test.cpp b/c10/test/util/optional_test.cpp index f17cc61c51b1c..f95fc864d812c 100644 --- a/c10/test/util/optional_test.cpp +++ b/c10/test/util/optional_test.cpp @@ -22,7 +22,7 @@ using testing::Not; template class OptionalTest : public ::testing::Test { public: - using optional = c10::optional; + using optional = std::optional; }; template @@ -96,10 +96,10 @@ TYPED_TEST(OptionalTest, Initialized) { } } -class SelfCompareTest : public testing::TestWithParam> {}; +class SelfCompareTest : public testing::TestWithParam> {}; TEST_P(SelfCompareTest, SelfCompare) { - c10::optional x = GetParam(); + std::optional x = GetParam(); EXPECT_THAT(x, Eq(x)); EXPECT_THAT(x, Le(x)); EXPECT_THAT(x, Ge(x)); @@ -118,7 +118,7 @@ INSTANTIATE_TEST_SUITE_P( testing::Values(c10::make_optional(2))); TEST(OptionalTest, Nullopt) { - c10::optional x = 2; + std::optional x = 2; EXPECT_THAT(c10::nullopt, Not(Eq(x))); EXPECT_THAT(x, Not(Eq(c10::nullopt))); @@ -142,17 +142,17 @@ TEST(OptionalTest, Nullopt) { // Ensure comparisons work... 
using CmpTestTypes = testing::Types< // between two optionals - std::pair, c10::optional>, + std::pair, c10::optional>, // between an optional and a value - std::pair, int>, + std::pair, int>, // between a value and an optional - std::pair>, + std::pair>, // between an optional and a differently typed value - std::pair, long>, + std::pair, long>, // between a differently typed value and an optional - std::pair>>; + std::pair>>; template class CmpTest : public testing::Test {}; TYPED_TEST_SUITE(CmpTest, CmpTestTypes); diff --git a/c10/util/ArrayRef.h b/c10/util/ArrayRef.h index 59ea43f8d959c..2a56e60832993 100644 --- a/c10/util/ArrayRef.h +++ b/c10/util/ArrayRef.h @@ -61,7 +61,7 @@ class ArrayRef final { void debugCheckNullptrInvariant() { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( Data != nullptr || Length == 0, - "created ArrayRef with nullptr and non-zero length! c10::optional relies on this being illegal"); + "created ArrayRef with nullptr and non-zero length! std::optional relies on this being illegal"); } public: diff --git a/c10/util/BFloat16.h b/c10/util/BFloat16.h index 95bc5f91838b6..badde3681f341 100644 --- a/c10/util/BFloat16.h +++ b/c10/util/BFloat16.h @@ -99,7 +99,7 @@ struct alignas(2) BFloat16 { } constexpr C10_HOST_DEVICE BFloat16(unsigned short bits, from_bits_t) - : x(bits){}; + : x(bits) {} inline C10_HOST_DEVICE BFloat16(float value); inline C10_HOST_DEVICE operator float() const; diff --git a/c10/util/Backtrace.cpp b/c10/util/Backtrace.cpp index bbad1c879b7a4..7d0fedbb335a2 100644 --- a/c10/util/Backtrace.cpp +++ b/c10/util/Backtrace.cpp @@ -31,7 +31,30 @@ namespace c10 { -#if SUPPORTS_BACKTRACE && defined(C10_ANDROID) +namespace { + +#ifdef FBCODE_CAFFE2 + +// For some reason, the stacktrace implementation in fbcode is better than ours, +// see https://github.com/pytorch/pytorch/issues/56399 When it's available, just +// use that. 
+class GetBacktraceImpl { + public: + C10_ALWAYS_INLINE GetBacktraceImpl( + size_t frames_to_skip, + size_t /* maximum_number_of_frames */, + bool /* skip_python_frames */) + : st_(/*skipFrames=*/frames_to_skip) {} + + std::string symbolize() const { + return st_.toString(); + } + + private: + facebook::process::StackTrace st_; +}; + +#elif SUPPORTS_BACKTRACE && defined(C10_ANDROID) struct AndroidBacktraceState { std::vector buffer; @@ -48,44 +71,49 @@ _Unwind_Reason_Code android_unwind_callback( return _URC_NO_REASON; } -void dump_stack( - std::ostream& os, - size_t frames_to_skip, - size_t maximum_number_of_frames) { - AndroidBacktraceState state; - - _Unwind_Backtrace(android_unwind_callback, &state); +class GetBacktraceImpl { + public: + C10_ALWAYS_INLINE GetBacktraceImpl( + size_t /* frames_to_skip */, + size_t /* maximum_number_of_frames */, + bool /* skip_python_frames */) { + _Unwind_Backtrace(android_unwind_callback, &state_); + } - int idx = 0; - char* demangled = nullptr; - size_t length = 0; + std::string symbolize() const { + std::ostringstream os; + int idx = 0; + char* demangled = nullptr; + size_t length = 0; - for (const void* addr : state.buffer) { - const char* symbol = ""; + for (const void* addr : state_.buffer) { + const char* symbol = ""; - Dl_info info; - if (dladdr(addr, &info) && info.dli_sname) { - symbol = info.dli_sname; - } + Dl_info info; + if (dladdr(addr, &info) && info.dli_sname) { + symbol = info.dli_sname; + } - int status = 0; - demangled = __cxxabiv1::__cxa_demangle( - /*mangled_name*/ symbol, - /*output_buffer*/ demangled, - /*length*/ &length, - /*status*/ &status); + int status = 0; + demangled = __cxxabiv1::__cxa_demangle( + /*mangled_name*/ symbol, + /*output_buffer*/ demangled, + /*length*/ &length, + /*status*/ &status); - os << " frame #" << idx++ << "\t" - << ((demangled != NULL && status == 0) ? demangled : symbol) << "[" - << addr << "]\t" << std::endl; + os << " frame #" << idx++ << "\t" + << ((demangled != NULL && status == 0) ? demangled : symbol) << "[" + << addr << "]\t" << std::endl; + } + free(demangled); + return os.str(); } - free(demangled); -} -#endif /* SUPPORTS_BACKTRACE && defined(C10_ANDROID) */ + private: + AndroidBacktraceState state_; +}; -#if SUPPORTS_BACKTRACE -namespace { +#elif SUPPORTS_BACKTRACE // !defined(C10_ANDROID) struct FrameInformation { /// If available, the demangled name of the function at this frame, else @@ -101,13 +129,12 @@ struct FrameInformation { std::string object_file; }; -#ifndef C10_ANDROID bool is_python_frame(const FrameInformation& frame) { return frame.object_file == "python" || frame.object_file == "python3" || (frame.object_file.find("libpython") != std::string::npos); } -c10::optional parse_frame_information( +std::optional parse_frame_information( const std::string& frame_string) { FrameInformation frame; @@ -173,10 +200,89 @@ c10::optional parse_frame_information( frame.function_name = demangle(mangled_function_name.c_str()); return frame; } -#endif /* !defined(C10_ANDROID) */ -} // anonymous namespace -#elif defined(_MSC_VER) -namespace { + +class GetBacktraceImpl { + public: + C10_ALWAYS_INLINE GetBacktraceImpl( + size_t frames_to_skip, + size_t maximum_number_of_frames, + bool skip_python_frames) + : skip_python_frames_(skip_python_frames), + callstack_(frames_to_skip + maximum_number_of_frames, nullptr) { + // We always skip this frame (backtrace). + frames_to_skip += 1; + + // backtrace() gives us a list of return addresses in the current call + // stack. 
NOTE: As per man (3) backtrace it can never fail + // (http://man7.org/linux/man-pages/man3/backtrace.3.html). + auto number_of_frames = static_cast( + ::backtrace(callstack_.data(), static_cast(callstack_.size()))); + + // Skip as many frames as requested. + frames_to_skip = std::min(frames_to_skip, number_of_frames); + number_of_frames -= frames_to_skip; + callstack_.erase( + callstack_.begin(), + callstack_.begin() + static_cast(frames_to_skip)); + callstack_.resize(number_of_frames); + } + + std::string symbolize() const { + // `backtrace_symbols` takes the return addresses obtained from + // `backtrace()` and fetches string representations of each stack. + // Unfortunately it doesn't return a struct of individual pieces of + // information but a concatenated string, so we'll have to parse the string + // after. NOTE: The array returned by `backtrace_symbols` is malloc'd and + // must be manually freed, but not the strings inside the array. + std::unique_ptr> raw_symbols( + ::backtrace_symbols( + callstack_.data(), static_cast(callstack_.size())), + /*deleter=*/free); + const std::vector symbols( + raw_symbols.get(), raw_symbols.get() + callstack_.size()); + + // The backtrace string goes into here. + std::ostringstream stream; + + // Toggles to true after the first skipped python frame. + bool has_skipped_python_frames = false; + + for (const auto frame_number : c10::irange(callstack_.size())) { + const auto frame = parse_frame_information(symbols[frame_number]); + + if (skip_python_frames_ && frame && is_python_frame(*frame)) { + if (!has_skipped_python_frames) { + stream << "\n"; + has_skipped_python_frames = true; + } + continue; + } + + // frame #: + stream << "frame #" << frame_number << ": "; + + if (frame) { + // + ( in ) + stream << frame->function_name << " + " << frame->offset_into_function + << " (" << callstack_[frame_number] << " in " + << frame->object_file << ")\n"; + } else { + // In the edge-case where we couldn't parse the frame string, we can + // just use it directly (it may have a different format). + stream << symbols[frame_number] << "\n"; + } + } + + return stream.str(); + } + + private: + const bool skip_python_frames_; + std::vector callstack_; +}; + +#elif defined(_MSC_VER) // !SUPPORTS_BACKTRACE + const int max_name_len = 256; std::string get_module_base_name(void* addr) { HMODULE h_module; @@ -225,180 +331,144 @@ class SymbolHelper { SymbolHelper(SymbolHelper const&) = delete; void operator=(SymbolHelper const&) = delete; }; -} // anonymous namespace -#endif // SUPPORTS_BACKTRACE -std::string get_backtrace( - size_t frames_to_skip, - size_t maximum_number_of_frames, - bool skip_python_frames) { -#ifdef FBCODE_CAFFE2 - // For some reason, the stacktrace implementation in fbcode is - // better than ours, see https://github.com/pytorch/pytorch/issues/56399 - // When it's available, just use that. - facebook::process::StackTrace st; - return st.toString(); - -#elif SUPPORTS_BACKTRACE && !defined(C10_ANDROID) - - // We always skip this frame (backtrace). - frames_to_skip += 1; - - std::vector callstack( - frames_to_skip + maximum_number_of_frames, nullptr); - // backtrace() gives us a list of return addresses in the current call stack. - // NOTE: As per man (3) backtrace it can never fail - // (http://man7.org/linux/man-pages/man3/backtrace.3.html). - auto number_of_frames = - ::backtrace(callstack.data(), static_cast(callstack.size())); - - // Skip as many frames as requested. 
This is not efficient, but the sizes here - // are small and it makes the code nicer and safer. - for (; frames_to_skip > 0 && number_of_frames > 0; - --frames_to_skip, --number_of_frames) { - callstack.erase(callstack.begin()); +// This backtrace retrieval is implemented on Windows via the Windows API using +// `CaptureStackBackTrace`, `SymFromAddr` and `SymGetLineFromAddr64`. +// https://stackoverflow.com/questions/5693192/win32-backtrace-from-c-code +// https://stackoverflow.com/questions/26398064/counterpart-to-glibcs-backtrace-and-backtrace-symbols-on-windows +// https://docs.microsoft.com/en-us/windows/win32/debug/capturestackbacktrace +// https://docs.microsoft.com/en-us/windows/win32/api/dbghelp/nf-dbghelp-symfromaddr +// https://docs.microsoft.com/en-us/windows/win32/api/dbghelp/nf-dbghelp-symgetlinefromaddr64 +// TODO: Support skipping python frames +class GetBacktraceImpl { + public: + C10_ALWAYS_INLINE GetBacktraceImpl( + size_t frames_to_skip, + size_t maximum_number_of_frames, + bool /* skip_python_frames */) + : back_trace_(new void*[maximum_number_of_frames]) { + // We always skip this frame (backtrace). + frames_to_skip += 1; + + // Get the frames + n_frame_ = CaptureStackBackTrace( + static_cast(frames_to_skip), + static_cast(maximum_number_of_frames), + back_trace_.get(), + NULL); } - // `number_of_frames` is strictly less than the current capacity of - // `callstack`, so this is just a pointer subtraction and makes the subsequent - // code safer. - callstack.resize(static_cast(number_of_frames)); - - // `backtrace_symbols` takes the return addresses obtained from `backtrace()` - // and fetches string representations of each stack. Unfortunately it doesn't - // return a struct of individual pieces of information but a concatenated - // string, so we'll have to parse the string after. NOTE: The array returned - // by `backtrace_symbols` is malloc'd and must be manually freed, but not the - // strings inside the array. - std::unique_ptr> raw_symbols( - ::backtrace_symbols(callstack.data(), static_cast(callstack.size())), - /*deleter=*/free); - const std::vector symbols( - raw_symbols.get(), raw_symbols.get() + callstack.size()); - - // The backtrace string goes into here. - std::ostringstream stream; - - // Toggles to true after the first skipped python frame. - bool has_skipped_python_frames = false; - - for (const auto frame_number : c10::irange(callstack.size())) { - const auto frame = parse_frame_information(symbols[frame_number]); - - if (skip_python_frames && frame && is_python_frame(*frame)) { - if (!has_skipped_python_frames) { - stream << "\n"; - has_skipped_python_frames = true; + std::string symbolize() const { + DWORD64 displacement; + DWORD disp; + std::unique_ptr line; + + char buffer[sizeof(SYMBOL_INFO) + MAX_SYM_NAME * sizeof(TCHAR)]; + PSYMBOL_INFO p_symbol = (PSYMBOL_INFO)buffer; + + bool with_symbol = false; + bool with_line = false; + + // The backtrace string goes into here. 
+ std::ostringstream stream; + + // Initialize symbols if necessary + SymbolHelper& sh = SymbolHelper::getInstance(); + + for (USHORT i_frame = 0; i_frame < n_frame_; ++i_frame) { + // Get the address and the name of the symbol + if (sh.inited) { + p_symbol->SizeOfStruct = sizeof(SYMBOL_INFO); + p_symbol->MaxNameLen = MAX_SYM_NAME; + with_symbol = SymFromAddr( + sh.process, (ULONG64)back_trace_[i_frame], &displacement, p_symbol); } - continue; - } - // frame #: - stream << "frame #" << frame_number << ": "; - - if (frame) { - // + ( in ) - stream << frame->function_name << " + " << frame->offset_into_function - << " (" << callstack[frame_number] << " in " << frame->object_file - << ")\n"; - } else { - // In the edge-case where we couldn't parse the frame string, we can - // just use it directly (it may have a different format). - stream << symbols[frame_number] << "\n"; + // Get the line number and the module + if (sh.inited) { + line.reset(new IMAGEHLP_LINE64()); + line->SizeOfStruct = sizeof(IMAGEHLP_LINE64); + with_line = SymGetLineFromAddr64( + sh.process, (ULONG64)back_trace_[i_frame], &disp, line.get()); + } + + // Get the module basename + std::string module = get_module_base_name(back_trace_[i_frame]); + + // The pattern on Windows is + // ` + // ! [ @ ] + stream << std::setfill('0') << std::setw(16) << std::uppercase << std::hex + << back_trace_[i_frame] << std::dec; + if (with_symbol) { + stream << std::setfill('0') << std::setw(16) << std::uppercase + << std::hex << p_symbol->Address << std::dec << " " << module + << "!" << p_symbol->Name; + } else { + stream << " " << module << "!"; + } + stream << " ["; + if (with_line) { + stream << line->FileName << " @ " << line->LineNumber; + } else { + stream << " @ "; + } + stream << "]" << std::endl; } + + return stream.str(); } - return stream.str(); + private: + std::unique_ptr back_trace_; + USHORT n_frame_; +}; -#elif SUPPORTS_BACKTRACE && defined(C10_ANDROID) +#else - std::ostringstream oss; - dump_stack(oss, frames_to_skip, maximum_number_of_frames); - return oss.str().c_str(); +class GetBacktraceImpl { + public: + C10_ALWAYS_INLINE GetBacktraceImpl( + size_t /* frames_to_skip */, + size_t /* maximum_number_of_frames */, + bool /* skip_python_frames */) {} -#elif defined(_MSC_VER) // !SUPPORTS_BACKTRACE - // This backtrace retrieval is implemented on Windows via the Windows - // API using `CaptureStackBackTrace`, `SymFromAddr` and - // `SymGetLineFromAddr64`. - // https://stackoverflow.com/questions/5693192/win32-backtrace-from-c-code - // https://stackoverflow.com/questions/26398064/counterpart-to-glibcs-backtrace-and-backtrace-symbols-on-windows - // https://docs.microsoft.com/en-us/windows/win32/debug/capturestackbacktrace - // https://docs.microsoft.com/en-us/windows/win32/api/dbghelp/nf-dbghelp-symfromaddr - // https://docs.microsoft.com/en-us/windows/win32/api/dbghelp/nf-dbghelp-symgetlinefromaddr64 - // TODO: Support skipping python frames - - // We always skip this frame (backtrace). - frames_to_skip += 1; - - DWORD64 displacement; - DWORD disp; - std::unique_ptr line; - - char buffer[sizeof(SYMBOL_INFO) + MAX_SYM_NAME * sizeof(TCHAR)]; - PSYMBOL_INFO p_symbol = (PSYMBOL_INFO)buffer; - - std::unique_ptr back_trace(new void*[maximum_number_of_frames]); - bool with_symbol = false; - bool with_line = false; - - // The backtrace string goes into here. 
- std::ostringstream stream; - - // Get the frames - const USHORT n_frame = CaptureStackBackTrace( - static_cast(frames_to_skip), - static_cast(maximum_number_of_frames), - back_trace.get(), - NULL); - - // Initialize symbols if necessary - SymbolHelper& sh = SymbolHelper::getInstance(); - - for (USHORT i_frame = 0; i_frame < n_frame; ++i_frame) { - // Get the address and the name of the symbol - if (sh.inited) { - p_symbol->SizeOfStruct = sizeof(SYMBOL_INFO); - p_symbol->MaxNameLen = MAX_SYM_NAME; - with_symbol = SymFromAddr( - sh.process, (ULONG64)back_trace[i_frame], &displacement, p_symbol); - } + std::string symbolize() const { + return "(no backtrace available)"; + } +}; - // Get the line number and the module - if (sh.inited) { - line.reset(new IMAGEHLP_LINE64()); - line->SizeOfStruct = sizeof(IMAGEHLP_LINE64); - with_line = SymGetLineFromAddr64( - sh.process, (ULONG64)back_trace[i_frame], &disp, line.get()); - } +#endif - // Get the module basename - std::string module = get_module_base_name(back_trace[i_frame]); +} // namespace - // The pattern on Windows is - // ` - // ! [ @ ] - stream << std::setfill('0') << std::setw(16) << std::uppercase << std::hex - << back_trace[i_frame] << std::dec; - if (with_symbol) { - stream << std::setfill('0') << std::setw(16) << std::uppercase << std::hex - << p_symbol->Address << std::dec << " " << module << "!" - << p_symbol->Name; - } else { - stream << " " << module << "!"; - } - stream << " ["; - if (with_line) { - stream << line->FileName << " @ " << line->LineNumber; - } else { - stream << " @ "; +std::string get_backtrace( + size_t frames_to_skip, + size_t maximum_number_of_frames, + bool skip_python_frames) { + return GetBacktraceImpl{ + frames_to_skip, maximum_number_of_frames, skip_python_frames} + .symbolize(); +} + +Backtrace get_lazy_backtrace( + size_t frames_to_skip, + size_t maximum_number_of_frames, + bool skip_python_frames) { + class LazyBacktrace : public OptimisticLazyValue { + public: + LazyBacktrace(GetBacktraceImpl&& impl) : impl_(std::move(impl)) {} + + private: + std::string compute() const override { + return impl_.symbolize(); } - stream << "]" << std::endl; - } - return stream.str(); -#else // !SUPPORTS_BACKTRACE && !_WIN32 - return "(no backtrace available)"; -#endif // SUPPORTS_BACKTRACE + GetBacktraceImpl impl_; + }; + + return std::make_shared(GetBacktraceImpl{ + frames_to_skip, maximum_number_of_frames, skip_python_frames}); } } // namespace c10 diff --git a/c10/util/Backtrace.h b/c10/util/Backtrace.h index 75691286d9019..500bf4cf407b2 100644 --- a/c10/util/Backtrace.h +++ b/c10/util/Backtrace.h @@ -2,16 +2,30 @@ #define C10_UTIL_BACKTRACE_H_ #include +#include #include #include #include +#include namespace c10 { + +// Symbolizing the backtrace can be expensive; pass it around as a lazy string +// so it is symbolized only if actually needed. +using Backtrace = std::shared_ptr>; + +// DEPRECATED: Prefer get_lazy_backtrace(). 
C10_API std::string get_backtrace( size_t frames_to_skip = 0, size_t maximum_number_of_frames = 64, bool skip_python_frames = true); + +C10_API Backtrace get_lazy_backtrace( + size_t frames_to_skip = 0, + size_t maximum_number_of_frames = 64, + bool skip_python_frames = true); + } // namespace c10 #endif // C10_UTIL_BACKTRACE_H_ diff --git a/c10/util/Exception.cpp b/c10/util/Exception.cpp index a0b9fa1e72ec8..76083cd14a838 100644 --- a/c10/util/Exception.cpp +++ b/c10/util/Exception.cpp @@ -58,7 +58,7 @@ std::string Error::compute_what(bool include_backtrace) const { return oss.str(); } -const Error::Backtrace& Error::backtrace() const { +const Backtrace& Error::backtrace() const { return backtrace_; } @@ -142,7 +142,7 @@ namespace { WarningHandler* getBaseHandler() { static WarningHandler base_warning_handler_ = WarningHandler(); return &base_warning_handler_; -}; +} class ThreadWarningHandler { public: diff --git a/c10/util/Exception.h b/c10/util/Exception.h index 750e978059ba9..d75c6a8cd30c3 100644 --- a/c10/util/Exception.h +++ b/c10/util/Exception.h @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -27,11 +28,6 @@ namespace c10 { /// NB: c10::Error is handled specially by the default torch to suppress the /// backtrace, see torch/csrc/Exceptions.h class C10_API Error : public std::exception { - public: - // Symbolizing the backtrace can be expensive; pass it around as a lazy string - // so it is symbolized only if actually needed. - using Backtrace = std::shared_ptr>; - private: // The actual error message. std::string msg_; diff --git a/c10/util/Float8_e4m3fn.h b/c10/util/Float8_e4m3fn.h index d51feabcc8c4d..8e05e2e43bb01 100644 --- a/c10/util/Float8_e4m3fn.h +++ b/c10/util/Float8_e4m3fn.h @@ -233,7 +233,7 @@ struct alignas(1) Float8_e4m3fn { Float8_e4m3fn() = default; constexpr C10_HOST_DEVICE Float8_e4m3fn(uint8_t bits, from_bits_t) - : x(bits){}; + : x(bits) {} inline C10_HOST_DEVICE Float8_e4m3fn(float value); inline C10_HOST_DEVICE operator float() const; inline C10_HOST_DEVICE bool isnan() const; diff --git a/c10/util/Float8_e4m3fnuz.h b/c10/util/Float8_e4m3fnuz.h index bed29891749a3..86ece9ebdadbb 100644 --- a/c10/util/Float8_e4m3fnuz.h +++ b/c10/util/Float8_e4m3fnuz.h @@ -121,7 +121,7 @@ struct alignas(1) Float8_e4m3fnuz { Float8_e4m3fnuz() = default; constexpr C10_HOST_DEVICE Float8_e4m3fnuz(uint8_t bits, from_bits_t) - : x(bits){}; + : x(bits) {} inline C10_HOST_DEVICE Float8_e4m3fnuz(float value); inline C10_HOST_DEVICE operator float() const; inline C10_HOST_DEVICE bool isnan() const; diff --git a/c10/util/Logging.cpp b/c10/util/Logging.cpp index 27feb9346f880..66a24980a44b4 100644 --- a/c10/util/Logging.cpp +++ b/c10/util/Logging.cpp @@ -3,7 +3,6 @@ #include #include #ifdef FBCODE_CAFFE2 -#include #include #endif @@ -26,30 +25,15 @@ C10_DEFINE_bool( namespace c10 { namespace { -std::function<::c10::Error::Backtrace()>& GetFetchStackTrace() { - static std::function<::c10::Error::Backtrace()> func = []() { -#ifdef FBCODE_CAFFE2 - // Same implementation as get_backtrace() in fbcode, but with lazy - // symbolization. 
- class LazyBacktrace : public OptimisticLazyValue { - facebook::process::StackTrace st_; - - std::string compute() const override { - return st_.toString(); - } - }; - - return std::make_shared(); -#else - return std::make_shared>( - get_backtrace(/*frames_to_skip=*/1)); -#endif +std::function<::c10::Backtrace()>& GetFetchStackTrace() { + static std::function<::c10::Backtrace()> func = []() { + return get_lazy_backtrace(/*frames_to_skip=*/1); }; return func; -}; +} } // namespace -void SetStackTraceFetcher(std::function<::c10::Error::Backtrace()> fetcher) { +void SetStackTraceFetcher(std::function<::c10::Backtrace()> fetcher) { GetFetchStackTrace() = std::move(fetcher); } @@ -116,7 +100,7 @@ class PyTorchStyleBacktrace : public OptimisticLazyValue { backtrace_->get()); } - ::c10::Error::Backtrace backtrace_; + ::c10::Backtrace backtrace_; SourceLocation source_location_; }; @@ -150,19 +134,19 @@ APIUsageLoggerType* GetAPIUsageLogger() { static APIUsageLoggerType func = IsAPIUsageDebugMode() ? &APIUsageDebug : [](const string&) {}; return &func; -}; +} APIUsageMetadataLoggerType* GetAPIUsageMetadataLogger() { static APIUsageMetadataLoggerType func = [](const std::string&, const std::map& metadata_map) {}; return &func; -}; +} DDPUsageLoggerType* GetDDPUsageLogger() { static DDPUsageLoggerType func = [](const DDPLoggingData&) {}; return &func; -}; +} } // namespace void SetAPIUsageLogger(std::function logger) { diff --git a/c10/util/Logging.h b/c10/util/Logging.h index caab50c8e0cda..a2349e423d013 100644 --- a/c10/util/Logging.h +++ b/c10/util/Logging.h @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -126,8 +127,7 @@ constexpr bool IsUsingGoogleLogging() { */ C10_API void ShowLogInfoToStderr(); -C10_API void SetStackTraceFetcher( - std::function<::c10::Error::Backtrace()> fetcher); +C10_API void SetStackTraceFetcher(std::function<::c10::Backtrace()> fetcher); /** * Convenience function for non-lazy stack trace fetchers. The Backtrace diff --git a/c10/util/OptionalArrayRef.h b/c10/util/OptionalArrayRef.h index 2c2b88722d4d7..98237bba92f56 100644 --- a/c10/util/OptionalArrayRef.h +++ b/c10/util/OptionalArrayRef.h @@ -1,11 +1,11 @@ // This file defines OptionalArrayRef, a class that has almost the same -// exact functionality as c10::optional>, except that its +// exact functionality as std::optional>, except that its // converting constructor fixes a dangling pointer issue. // -// The implicit converting constructor of both c10::optional> and +// The implicit converting constructor of both std::optional> and // std::optional> can cause the underlying ArrayRef to store // a dangling pointer. OptionalArrayRef prevents this by wrapping -// a c10::optional> and fixing the constructor implementation. +// a std::optional> and fixing the constructor implementation. // // See https://github.com/pytorch/pytorch/issues/63645 for more on this. 
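The OptionalArrayRef comment above points at the dangling-pointer hazard tracked in https://github.com/pytorch/pytorch/issues/63645. A minimal sketch of that hazard, assuming only that c10::ArrayRef is a non-owning view implicitly constructible from std::vector; the helper name below is illustrative and is not part of this diff:

#include <optional>
#include <vector>
#include <c10/util/ArrayRef.h>

// Illustrative sketch only (not part of this diff): returns a view whose
// backing storage has already been freed.
std::optional<c10::ArrayRef<int>> make_view() {
  std::vector<int> owner = {1, 2, 3};
  // The converting constructor of std::optional<c10::ArrayRef<int>> wraps a
  // pointer into owner's heap buffer; nothing extends owner's lifetime.
  std::optional<c10::ArrayRef<int>> view = owner;
  return view;  // owner is destroyed here, so the returned view dangles
}

c10::OptionalArrayRef exists to make this class of bug harder to write by constraining its converting constructors, as the comment above describes.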
diff --git a/c10/xpu/test/impl/XPUStreamTest.cpp b/c10/xpu/test/impl/XPUStreamTest.cpp index 16f6e20c2163e..01a1dbb62621b 100644 --- a/c10/xpu/test/impl/XPUStreamTest.cpp +++ b/c10/xpu/test/impl/XPUStreamTest.cpp @@ -82,7 +82,7 @@ TEST(XPUStreamTest, StreamBehavior) { EXPECT_NE(stream.device_index(), c10::xpu::current_device()); } -void thread_fun(c10::optional& cur_thread_stream) { +void thread_fun(std::optional& cur_thread_stream) { auto new_stream = c10::xpu::getStreamFromPool(); c10::xpu::setCurrentXPUStream(new_stream); cur_thread_stream = {c10::xpu::getCurrentXPUStream()}; @@ -94,7 +94,7 @@ TEST(XPUStreamTest, MultithreadStreamBehavior) { if (!has_xpu()) { return; } - c10::optional s0, s1; + std::optional s0, s1; std::thread t0{thread_fun, std::ref(s0)}; std::thread t1{thread_fun, std::ref(s1)}; diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 13282063d9078..bd2588b5aef35 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -110,21 +110,11 @@ endif() add_subdirectory(core) add_subdirectory(serialize) add_subdirectory(utils) -if(BUILD_CAFFE2 OR (NOT USE_FBGEMM)) +if(NOT USE_FBGEMM) add_subdirectory(perfkernels) endif() -# Skip modules that are not used by libtorch mobile yet. -if(BUILD_CAFFE2 AND NOT INTERN_BUILD_MOBILE) - add_subdirectory(core/nomnigraph) - if(USE_NVRTC) - add_subdirectory(cuda_rtc) - endif() - if(BUILD_CAFFE2_OPS) - endif() - add_subdirectory(proto) -endif() -if(NOT BUILD_CAFFE2 AND NOT INTERN_BUILD_MOBILE) +if(NOT INTERN_BUILD_MOBILE) add_subdirectory(proto) endif() @@ -585,17 +575,10 @@ if(NOT INTERN_BUILD_MOBILE AND NOT BUILD_LITE_INTERPRETER) ${TORCH_SRC_DIR}/csrc/utils/byte_order.cpp ) - # Disable legacy import of building without Caffe2 support - if(BUILD_CAFFE2) - list(APPEND TORCH_SRCS - ${TORCH_SRC_DIR}/csrc/jit/serialization/import_legacy.cpp - ) - else() - set_source_files_properties( - ${TORCH_SRC_DIR}/csrc/jit/serialization/import.cpp - PROPERTIES COMPILE_FLAGS "-DC10_DISABLE_LEGACY_IMPORT" - ) - endif() + set_source_files_properties( + ${TORCH_SRC_DIR}/csrc/jit/serialization/import.cpp + PROPERTIES COMPILE_FLAGS "-DC10_DISABLE_LEGACY_IMPORT" + ) if(USE_DISTRIBUTED) append_filelist("libtorch_distributed_base_sources" TORCH_SRCS) if(NOT WIN32) @@ -809,11 +792,6 @@ if(HAVE_SOVERSION) VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION}) endif() torch_compile_options(torch_cpu) # see cmake/public/utils.cmake -if(BUILD_CAFFE2 AND NOT MSVC) - # Caffe2 has too many signed-unsigned violation, but the framework is dead - # So no point in fixing those - target_compile_options(torch_cpu PRIVATE "-Wno-sign-compare") -endif() # Ignore Wdeprecated-XXX errors from third-party libraries if(NOT MSVC) @@ -1921,14 +1899,6 @@ if(BUILD_TEST) endif() endforeach() endif() - - # For special tests that explicitly uses dependencies, we add them here - if(BUILD_CAFFE2 AND USE_MPI) - target_link_libraries(mpi_test MPI::MPI_CXX) - if(USE_CUDA) - target_link_libraries(mpi_gpu_test MPI::MPI_CXX) - endif() - endif() endif() if(MSVC) @@ -1998,11 +1968,6 @@ if(BUILD_PYTHON) set_source_files_properties(${TORCH_SRC_DIR}/../caffe2/operators/box_with_nms_limit_op.cc PROPERTIES COMPILE_FLAGS -Wno-attributes) endif() - # ---[ Python. 
- if(BUILD_CAFFE2) - target_compile_definitions(torch PRIVATE BUILD_CAFFE2) - endif() - # generated pb files are copied from build/caffe2 to caffe2 # if we copied them back to build this would create a build cycle # consider removing the need for globs diff --git a/caffe2/__init__.py b/caffe2/__init__.py index 4096a98283857..f319e8e2dc15b 100644 --- a/caffe2/__init__.py +++ b/caffe2/__init__.py @@ -2,5 +2,4 @@ from torch.onnx import _CAFFE2_ATEN_FALLBACK if not _CAFFE2_ATEN_FALLBACK: - warnings.warn("Caffe2 support is not fully enabled in this PyTorch build. " - "Please enable Caffe2 by building PyTorch from source with `BUILD_CAFFE2=1` flag.") + warnings.warn("Caffe2 support is no longer present in PyTorch.") diff --git a/caffe2/core/CMakeLists.txt b/caffe2/core/CMakeLists.txt index f59c0e703edf7..371d2216b50ea 100644 --- a/caffe2/core/CMakeLists.txt +++ b/caffe2/core/CMakeLists.txt @@ -1,68 +1,4 @@ -if(NOT BUILD_CAFFE2 OR INTERN_BUILD_MOBILE) - list(APPEND Caffe2_CPU_SRCS - "${CMAKE_CURRENT_SOURCE_DIR}/common.cc" - ) - set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE) - return() -endif() - -# ---[ GPU files -# ------[ cuDNN -if(USE_CUDNN) - file(GLOB tmp *_cudnn.cc) - set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${tmp}) -endif() -# ------[ general GPU -file(GLOB tmp *_gpu.cc) -set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${tmp}) -# ------[ CUDA sources -file(GLOB tmp *.cu) -set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${tmp}) -# exclude test files -file(GLOB tmp *_test.cc) -exclude(Caffe2_GPU_SRCS "${Caffe2_GPU_SRCS}" ${tmp}) - -# ---[ general HIP files -file(GLOB tmp hip/*.cc) -set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} ${tmp}) -# ------[ HIP sources -file(GLOB tmp hip/*.hip) -set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} ${tmp}) -# exclude test files -file(GLOB tmp hip/*_test.cc) -exclude(Caffe2_HIP_SRCS "${Caffe2_HIP_SRCS}" ${tmp}) - -# ---[ CPU files. -file(GLOB tmp *.cc) -# Manually remove the cudnn files since we might be using USE_CUDNN=OFF -# TODO: when we move to explicit file list, this would not be needed. -file(GLOB tmp_cudnn *_cudnn.cc) -exclude(tmp "${tmp}" ${tmp_cudnn}) -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp}) -# exclude test files and gpu files -file(GLOB tmp *_test.cc) -exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${tmp}) -exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${Caffe2_GPU_SRCS}) -exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${Caffe2_HIP_SRCS}) - -# ---[ GPU test files -file(GLOB tmp *_gpu_test.cc) -set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} ${tmp}) - -# ---[ HIP test files -file(GLOB tmp hip/*_test.cc) -set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} ${tmp}) - -# ---[ CPU test files -file(GLOB tmp *_test.cc) -set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} ${tmp}) -exclude(Caffe2_CPU_TEST_SRCS "${Caffe2_CPU_TEST_SRCS}" ${Caffe2_GPU_TEST_SRCS}) -exclude(Caffe2_CPU_TEST_SRCS "${Caffe2_CPU_TEST_SRCS}" ${Caffe2_HIP_TEST_SRCS}) - -# ---[ Send the lists to the parent scope. 
+list(APPEND Caffe2_CPU_SRCS + "${CMAKE_CURRENT_SOURCE_DIR}/common.cc" +) set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE) -set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE) -set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} PARENT_SCOPE) -set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE) -set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE) -set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} PARENT_SCOPE) diff --git a/caffe2/core/blob_serialization.cc b/caffe2/core/blob_serialization.cc index 5cd01ba7cc59c..fca3e63f72182 100644 --- a/caffe2/core/blob_serialization.cc +++ b/caffe2/core/blob_serialization.cc @@ -1,5 +1,6 @@ #include "caffe2/core/blob_serialization.h" +#include #include #include #include @@ -83,8 +84,7 @@ Range GetMutableTensorDataRange( size_t start, size_t numElements) { CAFFE_ENFORCE( - // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - start + numElements <= tensor.numel(), + static_cast(start + numElements) <= tensor.numel(), "Requested invalid mutable tensor range [", start, ", ", @@ -100,8 +100,7 @@ c10::ArrayRef GetTensorDataRange( size_t start, size_t numElements) { CAFFE_ENFORCE( - // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - start + numElements <= tensor.numel(), + static_cast(start + numElements) <= tensor.numel(), "Requested invalid tensor range [", start, ", ", @@ -390,8 +389,7 @@ void TensorSerializer::SerializeWithOptions( // Poorman's IOBound ThreadPool SimpleQueue chunkQueue; auto task = [&]() { - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - size_t chunkStart; + size_t chunkStart = std::numeric_limits::max(); while (chunkQueue.Pop(&chunkStart)) { processChunk(chunkStart); } @@ -409,8 +407,7 @@ void TensorSerializer::SerializeWithOptions( VLOG(1) << "Serializing blob " << name; // Serialize whole vector. If vector is empty, it's shape still needs to be // serialized in empty proto - for (size_t chunkBegin = 0; - // NOLINTNEXTLINE(clang-diagnostic-sign-compare) + for (int64_t chunkBegin = 0; chunkBegin < std::max(tensor.numel(), static_cast(1)); chunkBegin += chunk_size) { VLOG(2) << "Starting a chunk at " << chunkBegin; @@ -582,8 +579,7 @@ void SerializeTensorData(const SerializeParams& params) { BlobSerializationOptions_FloatFormat_FLOAT_BFLOAT16) { // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays) std::unique_ptr tmp_buffer; - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - const float* src; + const float* src = nullptr; if (params.context.device() == CPU) { src = params.input.data(); } else { @@ -653,14 +649,12 @@ void TensorSerializer::Serialize( size_t chunkBegin, int32_t chunkSize) { CAFFE_ENFORCE( - // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - chunkBegin <= input.numel(), + static_cast(chunkBegin) <= input.numel(), "Chunk begin is out of tensor: ", chunkBegin, ' ', input.numel()); - // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - if (chunkBegin + chunkSize > input.numel()) { + if (static_cast(chunkBegin + chunkSize) > input.numel()) { chunkSize = input.numel() - chunkBegin; } @@ -1029,8 +1023,7 @@ DESERIALIZE_IMPL(float, FMT_BFLOAT16) { params.tensor_proto.raw_data().data()); // If we are on a big-endian machine, byte-swap the serialized data. 
- // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - const fbgemm::bfloat16* src; + const fbgemm::bfloat16* src = nullptr; // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays) std::unique_ptr bswap_buffer; if (kIsLittleEndian) { @@ -1045,8 +1038,7 @@ DESERIALIZE_IMPL(float, FMT_BFLOAT16) { // bfloat16 to float conversion. // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays) std::unique_ptr tmp_buffer; - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - float* dest; + float* dest = nullptr; if (params.context.device() == CPU) { dest = params.dest.data(); } else { diff --git a/caffe2/core/context.h b/caffe2/core/context.h index 36fd4e400fe8c..eb46f78f8b0d9 100644 --- a/caffe2/core/context.h +++ b/caffe2/core/context.h @@ -63,23 +63,23 @@ class TORCH_API CPUContext final : public BaseContext { return (static_cast(random1) << 32) | random2; } - c10::optional next_float_normal_sample() { + std::optional next_float_normal_sample() { return next_float_normal_sample_; } - c10::optional next_double_normal_sample() { + std::optional next_double_normal_sample() { return next_double_normal_sample_; } - void set_next_float_normal_sample(c10::optional randn) { + void set_next_float_normal_sample(std::optional randn) { next_float_normal_sample_ = randn; } - void set_next_double_normal_sample(c10::optional randn) { + void set_next_double_normal_sample(std::optional randn) { next_double_normal_sample_ = randn; } private: at::mt19937 engine_; - c10::optional next_float_normal_sample_; - c10::optional next_double_normal_sample_; + std::optional next_float_normal_sample_; + std::optional next_double_normal_sample_; }; #else typedef std::mt19937 rand_gen_type; diff --git a/caffe2/core/export_c10_op_to_caffe2.h b/caffe2/core/export_c10_op_to_caffe2.h index b8bbfda84a50e..f03da90c1b861 100644 --- a/caffe2/core/export_c10_op_to_caffe2.h +++ b/caffe2/core/export_c10_op_to_caffe2.h @@ -185,7 +185,7 @@ class C10OperatorWrapper final : public Operator { template IValue get_nontensor_argument_( const std::string& name, - const c10::optional& default_value) { + const std::optional& default_value) { if (default_value.has_value()) { return this->template GetSingleArgument(name, default_value->to()); } else { diff --git a/caffe2/core/export_caffe2_op_to_c10.h b/caffe2/core/export_caffe2_op_to_c10.h index 216d3833648bf..7e803e545e212 100644 --- a/caffe2/core/export_caffe2_op_to_c10.h +++ b/caffe2/core/export_caffe2_op_to_c10.h @@ -126,7 +126,7 @@ void call_caffe2_op_from_c10( inline FunctionSchema make_function_schema_for_c10( const char* schema_str, - c10::optional optional_alias_analysis_kind) { + std::optional optional_alias_analysis_kind) { #if !defined(EXPOSE_C2_OPS) && \ (defined(CAFFE2_IS_XPLAT_BUILD) || defined(C10_MOBILE)) throw std::logic_error( diff --git a/caffe2/core/operator.cc b/caffe2/core/operator.cc index a978cfd164ce8..7cf1ef909f18b 100644 --- a/caffe2/core/operator.cc +++ b/caffe2/core/operator.cc @@ -825,7 +825,7 @@ std::function GetOperatorLogger() { return OperatorLogger; } -c10::optional OperatorBase::argumentIndexWithName( +std::optional OperatorBase::argumentIndexWithName( c10::string_view name) const { #if defined(EXPOSE_C2_OPS) || \ !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h index 0dbf31e5932b0..3277357b4f34c 100644 --- a/caffe2/core/operator.h +++ b/caffe2/core/operator.h @@ -605,7 +605,7 @@ class TORCH_API OperatorBase : public Observable { std::string type_; vector 
inputs_; vector outputs_; - // Preferably use c10::optional, but nvcc doesn't work + // Preferably use std::optional, but nvcc doesn't work #if defined(EXPOSE_C2_OPS) || \ !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) std::unique_ptr fn_schema_; @@ -649,7 +649,7 @@ class TORCH_API OperatorBase : public Observable { } } - c10::optional argumentIndexWithName(c10::string_view name) const; + std::optional argumentIndexWithName(c10::string_view name) const; // An event used by asynchronous execution. std::unique_ptr event_; diff --git a/caffe2/proto/CMakeLists.txt b/caffe2/proto/CMakeLists.txt index ba6b696dde4ba..bdbc045afb3d7 100644 --- a/caffe2/proto/CMakeLists.txt +++ b/caffe2/proto/CMakeLists.txt @@ -1,8 +1,4 @@ -if(BUILD_CAFFE2) - file(GLOB Caffe2_PROTOBUF_FILES "${CMAKE_CURRENT_SOURCE_DIR}/*.proto") -else() - set(Caffe2_PROTOBUF_FILES "${CMAKE_CURRENT_SOURCE_DIR}/torch.proto;${CMAKE_CURRENT_SOURCE_DIR}/caffe2.proto") -endif() +set(Caffe2_PROTOBUF_FILES "${CMAKE_CURRENT_SOURCE_DIR}/torch.proto;${CMAKE_CURRENT_SOURCE_DIR}/caffe2.proto") caffe2_protobuf_generate_cpp_py(Caffe2_PROTO_SRCS Caffe2_PROTO_HEADERS Caffe2_PROTO_PY ${Caffe2_PROTOBUF_FILES}) diff --git a/caffe2/proto/__init__.py b/caffe2/proto/__init__.py index ce54a1aee5745..c40ca97189d1b 100644 --- a/caffe2/proto/__init__.py +++ b/caffe2/proto/__init__.py @@ -14,8 +14,7 @@ try: from caffe2.proto import caffe2_pb2, metanet_pb2, torch_pb2 except ImportError: - warnings.warn('Caffe2 support is not enabled in this PyTorch build. ' - 'Please enable Caffe2 by building PyTorch from source with `BUILD_CAFFE2=1` flag.') + warnings.warn('Caffe2 support is no longer present in PyTorch.') raise try: diff --git a/caffe2/python/__init__.py b/caffe2/python/__init__.py index 888d286458a3a..1e44baf28153f 100644 --- a/caffe2/python/__init__.py +++ b/caffe2/python/__init__.py @@ -6,8 +6,7 @@ try: from caffe2.proto import caffe2_pb2 except ImportError: - warnings.warn('Caffe2 support is not enabled in this PyTorch build. ' - 'Please enable Caffe2 by building PyTorch from source with `BUILD_CAFFE2=1` flag.') + warnings.warn('Caffe2 support is no longer present in PyTorch.') raise # TODO: refactor & remove the following alias diff --git a/caffe2/serialize/inline_container.cc b/caffe2/serialize/inline_container.cc index 00d922f356dfc..83415da0a4f77 100644 --- a/caffe2/serialize/inline_container.cc +++ b/caffe2/serialize/inline_container.cc @@ -610,7 +610,8 @@ size_t ostream_write_func( // Get the CRC32 of uncompressed data from the data descriptor, if the written // data is identified as the data descriptor block. - if (n >= 8 && MZ_READ_LE32(pBuf) == MZ_ZIP_DATA_DESCRIPTOR_ID) { + // See [Note: write_record_metadata] for why we check for non-null pBuf here + if (pBuf && n >= 8 && MZ_READ_LE32(pBuf) == MZ_ZIP_DATA_DESCRIPTOR_ID) { const int8_t* pInt8Buf = (const int8_t*)pBuf; const uint32_t uncomp_crc32 = MZ_READ_LE32(pInt8Buf + 4); self->combined_uncomp_crc32_ = @@ -654,7 +655,12 @@ void PyTorchStreamWriter::setup(const string& file_name) { } TORCH_CHECK(file_stream_, "File ", file_name, " cannot be opened."); writer_func_ = [this](const void* buf, size_t nbytes) -> size_t { - file_stream_.write(static_cast(buf), nbytes); + if (!buf) { + // See [Note: write_record_metadata] + file_stream_.seekp(nbytes, std::ios_base::cur); + } else { + file_stream_.write(static_cast(buf), nbytes); + } return !file_stream_ ? 
0 : nbytes; }; } @@ -690,20 +696,20 @@ void PyTorchStreamWriter::writeRecord( detail::getPadding(ar_->m_archive_size, full_name.size(), size, padding_); uint32_t flags = compress ? MZ_BEST_COMPRESSION : 0; mz_zip_writer_add_mem_ex_v2( - ar_.get(), - full_name.c_str(), - data, - size, - nullptr, - 0, - flags, - 0, - 0, - nullptr, - padding_.c_str(), - padding_size, - nullptr, - 0); + /*pZip=*/ar_.get(), + /*pArchive_name=*/full_name.c_str(), + /*pBuf=*/data, + /*buf_size=*/size, + /*pComment=*/nullptr, + /*comment_size=*/0, + /*level_and_flags=*/flags, + /*uncomp_size=*/0, + /*uncomp_crc32=*/0, + /*last_modified=*/nullptr, + /*user_extra_data=*/padding_.c_str(), + /*user_extra_data_len=*/padding_size, + /*user_extra_data_central=*/nullptr, + /*user_extra_data_central_len=*/0); valid("writing file ", name.c_str()); files_written_.insert(name); } diff --git a/caffe2/utils/CMakeLists.txt b/caffe2/utils/CMakeLists.txt index 181d164d81327..e168eb595feb2 100644 --- a/caffe2/utils/CMakeLists.txt +++ b/caffe2/utils/CMakeLists.txt @@ -1,100 +1,18 @@ -if(NOT BUILD_CAFFE2 OR INTERN_BUILD_MOBILE) - list(APPEND Caffe2_CPU_SRCS - utils/string_utils.cc - utils/threadpool/ThreadPool.cc - ) - - if(USE_PTHREADPOOL AND NOT USE_INTERNAL_PTHREADPOOL_IMPL) - list(APPEND Caffe2_CPU_SRCS - utils/threadpool/pthreadpool-cpp.cc - utils/threadpool/thread_pool_guard.cpp - ) - endif() - - if(NOT BUILD_CAFFE2 AND NOT INTERN_BUILD_MOBILE) - list(APPEND Caffe2_CPU_SRCS - utils/proto_wrap.cc - ) - endif() - set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE) - return() -endif() - list(APPEND Caffe2_CPU_SRCS - utils/bench_utils.cc - utils/cpuid.cc - utils/math/broadcast.cc - utils/math/elementwise.cc - utils/math/reduce.cc - utils/math/transpose.cc - utils/math/utils.cc - utils/math_cpu.cc - utils/murmur_hash3.cc - utils/proto_utils.cc - utils/proto_wrap.cc + utils/string_utils.cc utils/threadpool/ThreadPool.cc - utils/signal_handler.cc - utils/smart_tensor_printer.cc - utils/string_utils.cc) +) -if(USE_PTHREADPOOL) +if(USE_PTHREADPOOL AND NOT USE_INTERNAL_PTHREADPOOL_IMPL) list(APPEND Caffe2_CPU_SRCS utils/threadpool/pthreadpool-cpp.cc - utils/threadpool/thread_pool_guard.cpp) - if(USE_INTERNAL_PTHREADPOOL_IMPL) - list(APPEND Caffe2_CPU_SRCS - utils/threadpool/pthreadpool.cc - utils/threadpool/pthreadpool_impl.cc) - endif() + utils/threadpool/thread_pool_guard.cpp + ) endif() -set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} - utils/math/broadcast.cu - utils/math/elementwise.cu - utils/math/reduce.cu - utils/math/transpose.cu - utils/math_gpu.cu - ) - -set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} - utils/math/hip/broadcast.hip - utils/math/hip/elementwise.hip - utils/math/hip/reduce.hip - utils/math/hip/transpose.hip - utils/hip/math_gpu.hip - ) - -set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} - utils/fixed_divisor_test.cc - utils/math_test.cc - utils/fatal_signal_asan_no_sig_test.cc - utils/simple_queue_test.cc - utils/proto_utils_test.cc - utils/smart_tensor_printer_test.cc - utils/cast_test.cc - ) - -if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "s390x") - set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} - utils/cpuid_test.cc - ) +if(NOT INTERN_BUILD_MOBILE) + list(APPEND Caffe2_CPU_SRCS + utils/proto_wrap.cc + ) endif() - -set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} - utils/math_gpu_test.cc - ) - -set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} - utils/hip/math_gpu_test.cc - utils/hip/math_blas_gpu_test.cc - ) - -# TODO Once all source files are defined inside the local c10_utils_xxx targets, -# it should be the job of the parent 
CMakeLists.txt to decide what to do with the target (i.e. link it to caffe2) -# instead of us locally adding it to Caffe2_xxx variables. set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE) -set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE) -set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} PARENT_SCOPE) -set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE) -set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE) -set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} PARENT_SCOPE) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 73dba4061dced..a9a3aab8c5107 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1597,23 +1597,24 @@ if(NOT INTERN_BUILD_MOBILE) set(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF) - if(USE_MAGMA) - find_package(MAGMA) - endif() - if((USE_CUDA OR USE_ROCM) AND MAGMA_FOUND) - set(USE_MAGMA 1) - message(STATUS "Compiling with MAGMA support") - message(STATUS "MAGMA INCLUDE DIRECTORIES: ${MAGMA_INCLUDE_DIR}") - message(STATUS "MAGMA LIBRARIES: ${MAGMA_LIBRARIES}") - message(STATUS "MAGMA V2 check: ${MAGMA_V2}") + if(USE_CUDA OR USE_ROCM) + if(USE_MAGMA) + find_package(MAGMA) + if(MAGMA_FOUND) + message(STATUS "Compiling with MAGMA support") + message(STATUS "MAGMA INCLUDE DIRECTORIES: ${MAGMA_INCLUDE_DIR}") + message(STATUS "MAGMA LIBRARIES: ${MAGMA_LIBRARIES}") + message(STATUS "MAGMA V2 check: ${MAGMA_V2}") + else() + message(STATUS "MAGMA not found. Compiling without MAGMA support") + caffe2_update_option(USE_MAGMA OFF) + endif() + endif() elseif(USE_MAGMA) message(WARNING "Not compiling with MAGMA. Suppress this warning with " "-DUSE_MAGMA=OFF.") caffe2_update_option(USE_MAGMA OFF) - else() - message(STATUS "MAGMA not found. Compiling without MAGMA support") - caffe2_update_option(USE_MAGMA OFF) endif() # ARM specific flags @@ -1685,9 +1686,6 @@ if(NOT INTERN_BUILD_MOBILE) if(MKLDNN_FOUND) set(AT_MKLDNN_ENABLED 1) include_directories(AFTER SYSTEM ${MKLDNN_INCLUDE_DIR}) - if(BUILD_CAFFE2_OPS) - list(APPEND Caffe2_DEPENDENCY_LIBS caffe2::mkldnn) - endif(BUILD_CAFFE2_OPS) else() message(WARNING "MKLDNN could not be found.") caffe2_update_option(USE_MKLDNN OFF) diff --git a/cmake/Modules/FindLAPACK.cmake b/cmake/Modules/FindLAPACK.cmake index fc8bf50d7d5d6..dbe47d6cdcf19 100644 --- a/cmake/Modules/FindLAPACK.cmake +++ b/cmake/Modules/FindLAPACK.cmake @@ -26,6 +26,7 @@ ENDIF(LAPACK_FIND_QUIETLY OR NOT LAPACK_FIND_REQUIRED) # Old search lapack script include(CheckFortranFunctionExists) +include(CheckFunctionExists) macro(Check_Lapack_Libraries LIBRARIES _prefix _name _flags _list _blas) # This macro checks for the existence of the combination of fortran libraries diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 72c1243c24ea9..09af98d0bc066 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -23,8 +23,6 @@ function(caffe2_print_configuration_summary) message(STATUS "") message(STATUS " TORCH_VERSION : ${TORCH_VERSION}") - message(STATUS " BUILD_CAFFE2 : ${BUILD_CAFFE2}") - message(STATUS " BUILD_CAFFE2_OPS : ${BUILD_CAFFE2_OPS}") message(STATUS " BUILD_STATIC_RUNTIME_BENCHMARK: ${BUILD_STATIC_RUNTIME_BENCHMARK}") message(STATUS " BUILD_BINARY : ${BUILD_BINARY}") message(STATUS " BUILD_CUSTOM_PROTOBUF : ${BUILD_CUSTOM_PROTOBUF}") diff --git a/cmake/TorchConfig.cmake.in b/cmake/TorchConfig.cmake.in index 6d518a1489626..02e313285297b 100644 --- a/cmake/TorchConfig.cmake.in +++ b/cmake/TorchConfig.cmake.in @@ -80,9 +80,6 @@ else() # shared library. # TODO: this list might be incomplete. 
append_torchlib_if_found(c10) - if(@BUILD_CAFFE2@) - append_torchlib_if_found(Caffe2_perfkernels_avx512 Caffe2_perfkernels_avx2 Caffe2_perfkernels_avx) - endif() if(@USE_NNPACK@) append_torchlib_if_found(nnpack) diff --git a/docs/source/conf.py b/docs/source/conf.py index 0f89d2799fa52..fe548737b3137 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -2796,6 +2796,7 @@ "ConstraintViolationError", "DynamicDimConstraintPrinter", "GuardOnDataDependentSymNode", + "PendingUnbackedSymbolNotFound", "LoggingShapeGuardPrinter", "RelaxedUnspecConstraint", "RuntimeAssert", diff --git a/docs/source/torch.compiler_troubleshooting.rst b/docs/source/torch.compiler_troubleshooting.rst index f98a4dc779b63..7158149c09e19 100644 --- a/docs/source/torch.compiler_troubleshooting.rst +++ b/docs/source/torch.compiler_troubleshooting.rst @@ -727,3 +727,11 @@ and C++ backtrace whenever this symbol was created. ``TORCHDYNAMO_EXTENDED_DEBUG_CPP`` - provides extended debug information (C++ backtrace) for all extended debug settings as well as errors. For example, set this to "1". The C++ backtrace is slow and very spammy so it is not included by default with extended debugging. + +Cold Start Timing and Cache Corruption Debugging +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In order to measure the cold start compilation time or debug cache corruption, +it is possible to pass ``TORCHINDUCTOR_FORCE_DISABLE_CACHES=1`` or set +``torch._inductor.config.force_disable_caches = True``, which will override any +other caching config option and disable all compile time caching. diff --git a/functorch/csrc/dim/arena.h b/functorch/csrc/dim/arena.h index 3251321f998b2..fa68e67268d53 100644 --- a/functorch/csrc/dim/arena.h +++ b/functorch/csrc/dim/arena.h @@ -55,7 +55,7 @@ struct Slice { T& operator[](int i) const { return begin_[i]; } - c10::optional index(const T& value) { + std::optional index(const T& value) { for (int i : enumerate()) { if (begin_[i] == value) { return i; diff --git a/functorch/csrc/dim/dim.cpp b/functorch/csrc/dim/dim.cpp index e25b8d0e5731a..066f9517acefd 100644 --- a/functorch/csrc/dim/dim.cpp +++ b/functorch/csrc/dim/dim.cpp @@ -1123,7 +1123,7 @@ int64_t _Tensor_ndim(mpy::handle h) { mpy::handle handle_from_tensor(Arena& A, TensorRef t) { // fast case: tensor is live in python - c10::optional mb_obj = + std::optional mb_obj = t->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(getPyInterpreter(), /*ignore_hermetic_tls=*/false); if (mb_obj.has_value() && !t->unsafeGetTensorImpl()->pyobj_slot()->owns_pyobj()) { return *mb_obj; diff --git a/modules/CMakeLists.txt b/modules/CMakeLists.txt deleted file mode 100644 index 598cac60bdbad..0000000000000 --- a/modules/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -project(modules CXX C) -add_subdirectory(detectron) -add_subdirectory(module_test) -add_subdirectory(observers) - -# Finally, set Caffe2_MODULES to parent scope. -set(Caffe2_MODULES ${Caffe2_MODULES} PARENT_SCOPE) diff --git a/modules/detectron/CMakeLists.txt b/modules/detectron/CMakeLists.txt deleted file mode 100644 index 7c9a2d7ff4f4a..0000000000000 --- a/modules/detectron/CMakeLists.txt +++ /dev/null @@ -1,57 +0,0 @@ -file(GLOB Detectron_CPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cc) -file(GLOB Detectron_GPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cu) -file(GLOB_RECURSE Detectron_HIP_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.hip) - -if(BUILD_CAFFE2_OPS) - # Note(ilijar): Since Detectron ops currently have no - # CPU implementation, we only build GPU ops for now.
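A minimal sketch of how the cache-disabling switch documented in the torch.compiler_troubleshooting.rst hunk above might be exercised to time a cold start; the toy function and timing harness here are illustrative assumptions, not part of this patch, and only ``TORCHINDUCTOR_FORCE_DISABLE_CACHES`` and ``torch._inductor.config.force_disable_caches`` come from the change itself:

    import os
    import time

    # Disable all of Inductor's compile-time caches so the first call is a true cold start.
    # Set the environment variable before importing torch; the config flag below is the
    # in-process equivalent documented in the hunk above (either switch is sufficient).
    os.environ["TORCHINDUCTOR_FORCE_DISABLE_CACHES"] = "1"

    import torch
    import torch._inductor.config as inductor_config

    inductor_config.force_disable_caches = True  # equivalent to the environment variable

    @torch.compile
    def toy(x):
        # hypothetical workload, only here to trigger compilation
        return torch.sin(x) + torch.cos(x)

    x = torch.randn(1024)
    start = time.time()
    toy(x)  # first call compiles with no cache reuse
    print(f"cold-start compile + run: {time.time() - start:.2f}s")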
- if(USE_CUDA) - add_library( - caffe2_detectron_ops_gpu SHARED - ${Detectron_CPU_SRCS} - ${Detectron_GPU_SRCS}) - - target_link_libraries(caffe2_detectron_ops_gpu PRIVATE torch) - if(USE_OPENMP) - target_link_libraries(caffe2_detectron_ops_gpu PRIVATE caffe2::openmp) - endif() - - if(USE_MKLDNN) - target_link_libraries(caffe2_detectron_ops_gpu PRIVATE caffe2::mkldnn) - endif() - install(TARGETS caffe2_detectron_ops_gpu DESTINATION lib) - if(MSVC) - install(FILES $ DESTINATION lib OPTIONAL) - endif() - elseif(USE_ROCM) - hip_include_directories(${Caffe2_HIP_INCLUDES}) - set_source_files_properties(${Detectron_HIP_SRCS} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) - HIP_ADD_LIBRARY( - caffe2_detectron_ops_hip SHARED - ${Detectron_CPU_SRCS} - ${Detectron_HIP_SRCS}) - target_compile_options(caffe2_detectron_ops_hip PRIVATE ${HIP_CXX_FLAGS}) - if(USE_MKLDNN) - target_link_libraries(caffe2_detectron_ops_hip PRIVATE caffe2::mkldnn) - endif() - target_link_libraries(caffe2_detectron_ops_hip PRIVATE torch) - install(TARGETS caffe2_detectron_ops_hip DESTINATION lib) - elseif(NOT IOS_PLATFORM) - add_library(caffe2_detectron_ops SHARED ${Detectron_CPU_SRCS}) - if(HAVE_SOVERSION) - set_target_properties(caffe2_detectron_ops PROPERTIES - VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION}) - endif() - target_link_libraries(caffe2_detectron_ops PRIVATE torch) - if(USE_OPENMP) - target_link_libraries(caffe2_detectron_ops PRIVATE caffe2::openmp) - endif() - if(USE_MKLDNN) - target_link_libraries(caffe2_detectron_ops PRIVATE caffe2::mkldnn) - endif() - install(TARGETS caffe2_detectron_ops DESTINATION lib) - if(MSVC) - install(FILES $ DESTINATION lib OPTIONAL) - endif() - endif() -endif() diff --git a/modules/detectron/group_spatial_softmax_op.cc b/modules/detectron/group_spatial_softmax_op.cc deleted file mode 100644 index 8b1fc052ef39b..0000000000000 --- a/modules/detectron/group_spatial_softmax_op.cc +++ /dev/null @@ -1,83 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "modules/detectron/group_spatial_softmax_op.h" - -#include "caffe2/operators/softmax_utils.h" - -namespace caffe2 { - -REGISTER_CPU_OPERATOR( - GroupSpatialSoftmax, - GroupSpatialSoftmaxOp); -REGISTER_CPU_OPERATOR( - GroupSpatialSoftmaxGradient, - GroupSpatialSoftmaxGradientOp); - -OPERATOR_SCHEMA(GroupSpatialSoftmax) - .NumInputs(1) - .NumOutputs(1) - .SetDoc(R"DOC( -RetinaNet specific form of spatial softmax. - -The input is assumed to be unnormalized scores (sometimes called 'logits') -arranged in a 4D tensor with shape (N, C, H, W), where N is the number of -elements in the batch, H and W are the height and width, and C = num_anchors * -num_classes defines num_anchors 'groups' of softmax inputs, each of length -num_classes. The softmax is applied to each group independently. - -See: https://arxiv.org/abs/1708.02002 for details. 
-)DOC") - .Arg( - "num_classes", - "(int) default 81; number of classes in each softmax group.") - .Input( - 0, - "scores", - "4D tensor of softmax inputs (called 'scores' or 'logits') with shape " - "(N, C, H, W), where C = num_anchors * num_classes defines num_anchors " - "groups of contiguous num_classes softmax inputs.") - .Output( - 0, - "probabilities", - "4D tensor of softmax probabilities with shape (N, C, H, W), where " - "C = num_anchors * num_classes, and softmax was applied to each of the " - "num_anchors groups; within a group the num_classes values sum to 1."); - -OPERATOR_SCHEMA(GroupSpatialSoftmaxGradient) - .NumInputs(2) - .NumOutputs(1) - .Input(0, "scores", "See GroupSpatialSoftmax") - .Input( - 1, - "d_probabilities", - "Gradient of forward output 0 (probabilities).") - .Output(0, "d_scores", "Gradient of forward input 0 (scores)."); - -class GetGroupSpatialSoftmaxGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - return SingleGradientDef( - "GroupSpatialSoftmaxGradient", - "", - vector{O(0), GO(0)}, - vector{GI(0)}); - } -}; - -REGISTER_GRADIENT(GroupSpatialSoftmax, GetGroupSpatialSoftmaxGradient); - -} // namespace caffe2 diff --git a/modules/detectron/group_spatial_softmax_op.cu b/modules/detectron/group_spatial_softmax_op.cu deleted file mode 100644 index 741da27f59d2b..0000000000000 --- a/modules/detectron/group_spatial_softmax_op.cu +++ /dev/null @@ -1,181 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include "caffe2/core/context_gpu.h" -#include "modules/detectron/group_spatial_softmax_op.h" - -namespace caffe2 { - -namespace { - -__global__ void GroupSpatialSoftmaxKernel(const int num, const int A, const int W, - const int H, const float* Xdata, float* Pdata, const int num_classes) { - // Loop through labels (N x A x H x W) - CUDA_1D_KERNEL_LOOP(index, num * A * H * W) { - int D = num_classes * A; - int x = index % W; - int y = (index / W) % H; - int a = (index / (W * H)) % A; - int i = index / W / H / A; - - // Subtract max on each cell for numerical reasons - float max_val = -FLT_MAX; - for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) { - int idx = i * (H * W * D) + c * (H * W) + y * W + x; - max_val = max(max_val, Xdata[idx]); - } - // Exponentiate - float expsum = 0.0f; - for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) { - int idx = i * (H * W * D) + c * (H * W) + y * W + x; - float expx = exp(Xdata[idx] - max_val); - Pdata[idx] = expx; - expsum += expx; - } - - // Normalize - for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) { - int idx = i * (H * W * D) + c * (H * W) + y * W + x; - Pdata[idx] /= expsum; - } - - } -} - -__global__ void SumProbsKernel(const int N, const int A, const int W, - const int H, const float* Ydata, const float* dYdata, - float* sum_probs_data, const int num_classes) { - CUDA_1D_KERNEL_LOOP(i, N * A * W * H) { - int D = num_classes * A; - int x = i % W; - int y = (i / W) % H; - int a = (i / (W * H)) % A; - int n = i / (W * H * A); - - sum_probs_data[i] = 0.0; - for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) { - int idx = n * (H * W * D) + c * (H * W) + y * W + x; - sum_probs_data[i] += (Ydata[idx] * dYdata[idx]); - } - } -} - -__global__ void SubSumKernel( - const int N, const int A, const int W, const int H, - const float* sum_probs_data, float* dXdata, const int num_classes) { - CUDA_1D_KERNEL_LOOP(i, N * (A * num_classes) * W * H) { - int D = num_classes * A; - int x = i % W; - int y = (i / W) % H; - int a = ((i / (W * H)) % D) / num_classes; - int n = i / W / H / D; - int idx = n * (H * W * A) + a * (H * W) + y * W + x; - dXdata[i] = (dXdata[i] - sum_probs_data[idx]); - } -} - -} // namespace - - -template <> -bool GroupSpatialSoftmaxOp::RunOnDevice() { - auto& X = Input(0); // Logits - - int N = X.dim32(0); - int D = X.dim32(1); - int H = X.dim32(2); - int W = X.dim32(3); - int A = D / num_classes_; - - auto* P = Output(0, X.sizes(), at::dtype()); // Probabilities from softmax - TORCH_DCHECK_EQ(X.ndim(), 4); - - const float* Xdata = X.data(); - float* Pdata = P->mutable_data(); - - // Softmax for each x,y location - GroupSpatialSoftmaxKernel<<>>( - N, A, W, H, Xdata, Pdata, num_classes_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - return true; -} - - -template<> -bool GroupSpatialSoftmaxGradientOp::RunOnDevice() { - auto& Y = Input(0); // Probabilities from softmax - auto& dY = Input(1); - - - TORCH_DCHECK_EQ(Y.ndim(), 4); - - int N = Y.dim32(0); - int D = Y.dim32(1); - int H = Y.dim32(2); - int W = Y.dim32(3); - int A = D / num_classes_; - - auto* dX = Output(0, Y.sizes(), at::dtype()); - - if (sum_probs_.size() != N * A * H * W) { - ReinitializeTensor(&sum_probs_, {N * A * H * W}, at::dtype().device(CUDA)); - } - - const float* Ydata = Y.data(); - const float* dYdata = dY.data(); - float* dXdata = dX->mutable_data(); - - float* sum_probs_data = sum_probs_.mutable_data(); - math::Set( - sum_probs_.size(), 0.0f, sum_probs_data, &context_); - - // Complete math: - // J_ij = h_i (delta_ij - 
h_j) - // d x_i = sum_j d h_ij = sum_j J_ij * dy_j - // = sum_j h_i (delta_ij - h_j) * dy_j - // = h_i dy_i - (sum_j h_i h_j dy_j) - // = h_i dy_i - h_i sum_j h_j dy_j - - // Step 0: dx = dy - context_.Copy(Y.size(), dYdata, dXdata); - - // Step 1: s = Sum(dY[j] * Y[j]) - SumProbsKernel<<>>( - N, A, W, H, Ydata, dYdata, sum_probs_data, num_classes_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - // Step 2: dX[i] = dX[i] - s - SubSumKernel<<>>( - N, A, W, H, sum_probs_.data(), dXdata, num_classes_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - // Step 3: dX[i] = Y[i] * dX[i] - math::Mul(Y.size(), dXdata, Ydata, dXdata, &context_); - - return true; -} - - -REGISTER_CUDA_OPERATOR(GroupSpatialSoftmax, - GroupSpatialSoftmaxOp); -REGISTER_CUDA_OPERATOR(GroupSpatialSoftmaxGradient, - GroupSpatialSoftmaxGradientOp); -} // namespace caffe2 diff --git a/modules/detectron/group_spatial_softmax_op.h b/modules/detectron/group_spatial_softmax_op.h deleted file mode 100644 index b235a47146b58..0000000000000 --- a/modules/detectron/group_spatial_softmax_op.h +++ /dev/null @@ -1,76 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef GROUP_SPATIAL_SOFTMAX_OP_H_ -#define GROUP_SPATIAL_SOFTMAX_OP_H_ - -#include "caffe2/core/context.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/operator.h" -#include "caffe2/utils/math.h" - -namespace caffe2 { - -template -class GroupSpatialSoftmaxOp final : public Operator { - public: - GroupSpatialSoftmaxOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws), - num_classes_(this->template GetSingleArgument("num_classes", 81)), - order_(StringToStorageOrder( - this->template GetSingleArgument("order", "NCHW"))) { - CAFFE_ENFORCE_EQ( - order_, StorageOrder::NCHW, "Only NCHW order is supported right now."); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - int num_classes_; - StorageOrder order_; -}; - -template -class GroupSpatialSoftmaxGradientOp final : public Operator { - public: - GroupSpatialSoftmaxGradientOp(const OperatorDef& def, Workspace* ws) - : Operator(def, ws), - num_classes_(this->template GetSingleArgument("num_classes", 81)), - order_(StringToStorageOrder( - this->template GetSingleArgument("order", "NCHW"))) { - CAFFE_ENFORCE_EQ( - order_, StorageOrder::NCHW, "Only NCHW order is supported right now."); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - int num_classes_; - StorageOrder order_; - Tensor sum_probs_; -}; - -} // namespace caffe2 - -#endif // GROUP_SPATIAL_SOFTMAX_OP_H_ diff --git a/modules/detectron/ps_roi_pool_op.cc b/modules/detectron/ps_roi_pool_op.cc deleted file mode 100644 index c57b0fc23678b..0000000000000 --- a/modules/detectron/ps_roi_pool_op.cc +++ /dev/null @@ -1,106 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "ps_roi_pool_op.h" - -namespace caffe2 { - -REGISTER_CPU_OPERATOR(PSRoIPool, PSRoIPoolOp); -REGISTER_CPU_OPERATOR( - PSRoIPoolGradient, - PSRoIPoolGradientOp); - -OPERATOR_SCHEMA(PSRoIPool) - .NumInputs(2) - .NumOutputs(2) - .SetDoc(R"DOC( -Position Sensitive Region of Interest Pooling as used in R-FCN. -)DOC") - .Arg( - "spatial_scale", - "(float) default 1.0; Spatial scale of the input feature map X " - "relative to the input image. E.g., 0.0625 if X has a stride of 16 " - "w.r.t. the input image.") - .Arg( - "group_size", - "(int) default 1; pooled_h = pooled_w = group_size where pooled_{h,w} " - "is the pooled output Y's height and width, respectively.") - .Arg( - "output_dim", - "(int) default 1; number of channels in the pooled output, which might " - "be the number of classes is used for classification or 4 if used for " - "class agnostic bounding box regression.") - .Input( - 0, - "X", - "4D position sensitive feature map input of shape (N, C, H, W), where " - "C = group_size**2 * output_dim.") - .Input( - 1, - "RoIs", - "2D input of shape (R, 5) specifying R RoIs with five columns " - "representing: batch index in [0, N - 1], x1, y1, x2, y2. The RoI " - "coordinates are in the coordinate system of the input image.") - .Output( - 0, - "Y", - "4D output of shape (R, output_dim, pooled_h, pooled_w). The r-th " - "batch element is a pooled feature map cooresponding to the r-th RoI.") - .Output( - 1, - "argmaxes", - "4D output of shape (R, output_dim, pooled_h, pooled_w). Same as Y, " - "except it records the argmax indices rather than the max pooled " - "values."); - -OPERATOR_SCHEMA(PSRoIPoolGradient) - .NumInputs(4) - .NumOutputs(1) - .Input( - 0, - "X", - "See PSRoIPool.") - .Input( - 1, - "RoIs", - "See PSRoIPool.") - .Input( - 2, - "argmaxes", - "See PSRoIPool.") - .Input( - 3, - "dY", - "Gradient of forward output 0 (Y)") - .Output( - 0, - "dX", - "Gradient of forward input 0 (X)"); - -class GetPSRoIPoolGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - return SingleGradientDef( - "PSRoIPoolGradient", - "", - vector{I(0), I(1), O(1), GO(0)}, - vector{GI(0)}); - } -}; - -REGISTER_GRADIENT(PSRoIPool, GetPSRoIPoolGradient); - -} // namespace caffe2 diff --git a/modules/detectron/ps_roi_pool_op.cu b/modules/detectron/ps_roi_pool_op.cu deleted file mode 100644 index 68e4ec377d622..0000000000000 --- a/modules/detectron/ps_roi_pool_op.cu +++ /dev/null @@ -1,289 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Based on https://github.com/daijifeng001/caffe-rfcn/blob/r-fcn/src/caffe/layers/psroi_pooling_layer.cu -// -// ------------------------------------------------------------------ -// R-FCN -// Copyright (c) 2016 Microsoft -// Licensed under The MIT License [see r-fcn/LICENSE for details] -// Written by Yi Li -// ------------------------------------------------------------------ -// -// COPYRIGHT -// -// All contributions by the University of California: -// Copyright (c) 2014, 2015, The Regents of the University of California -// (Regents) -// All rights reserved. -// -// All other contributions: -// Copyright (c) 2014, 2015, the respective contributors -// All rights reserved. -// -// Caffe uses a shared copyright model: each contributor holds copyright over -// their contributions to Caffe. The project versioning records all such -// contribution and copyright details. If a contributor wants to further mark -// their specific copyright on a particular contribution, they should indicate -// their copyright solely in the commit message of the change when it is -// committed. -// -// LICENSE -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. -// -// CONTRIBUTION AGREEMENT -// -// By contributing to the BVLC/caffe repository through pull-request, comment, -// or otherwise, the contributor releases their content to the -// license and copyright terms herein. 
- -#include - -#include "caffe2/core/context_gpu.h" -#include "modules/detectron/ps_roi_pool_op.h" - -namespace caffe2 { - -namespace { - -template -inline __device__ T gpu_atomic_add(const T val, T* address); - -template <> -inline __device__ -float gpu_atomic_add(const float val, float* address) { - return atomicAdd(address, val); -} - -template -__global__ void PSRoIPoolForward( - const int nthreads, - const T* bottom_data, - const T spatial_scale, - const int channels, - const int height, - const int width, - const int pooled_height, - const int pooled_width, - const T* bottom_rois, - const int output_dim, - const int group_size, - T* top_data, - int* mapping_channel) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { - // The output is in order (n, ctop, ph, pw) - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int ctop = (index / pooled_width / pooled_height) % output_dim; - int n = index / pooled_width / pooled_height / output_dim; - - // [start, end) interval for spatial sampling - const T* offset_bottom_rois = bottom_rois + n * 5; - int roi_batch_ind = offset_bottom_rois[0]; - T roi_start_w = static_cast( - roundf(offset_bottom_rois[1])) * spatial_scale; - T roi_start_h = static_cast( - roundf(offset_bottom_rois[2])) * spatial_scale; - T roi_end_w = static_cast( - roundf(offset_bottom_rois[3]) + 1.) * spatial_scale; - T roi_end_h = static_cast( - roundf(offset_bottom_rois[4]) + 1.) * spatial_scale; - - // Force too small ROIs to be 1x1 - T roi_width = c10::cuda::compat::max(roi_end_w - roi_start_w, static_cast(0.1)); // avoid 0 - T roi_height = c10::cuda::compat::max(roi_end_h - roi_start_h, static_cast(0.1)); - - // Compute w and h at bottom - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - // Add roi offsets and clip to input boundaries - int hstart = floor( - static_cast(ph) * bin_size_h + roi_start_h); - int wstart = floor( - static_cast(pw)* bin_size_w + roi_start_w); - int hend = ceil( - static_cast(ph + 1) * bin_size_h + roi_start_h); - int wend = ceil( - static_cast(pw + 1) * bin_size_w + roi_start_w); - - hstart = min(max(hstart, 0), height); - hend = min(max(hend, 0), height); - wstart = min(max(wstart, 0),width); - wend = min(max(wend, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - int gw = pw; - int gh = ph; - int c = (ctop * group_size + gh) * group_size + gw; - - const T* offset_bottom_data = - bottom_data + (roi_batch_ind * channels + c) * height * width; - T out_sum = 0; - for (int h = hstart; h < hend; ++h){ - for (int w = wstart; w < wend; ++w){ - int bottom_index = h*width + w; - out_sum += offset_bottom_data[bottom_index]; - } - } - - T bin_area = (hend - hstart) * (wend - wstart); - top_data[index] = is_empty ? 0. 
: out_sum / bin_area; - mapping_channel[index] = c; - } -} - -template -__global__ void PSRoIPoolBackward( - const int nthreads, - const T* top_diff, - const int* mapping_channel, - const int num_rois, - const T spatial_scale, - const int channels, - const int height, - const int width, - const int pooled_height, - const int pooled_width, - const int output_dim, - T* bottom_diff, - const T* bottom_rois) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { - // The output is in order (n, ctop, ph, pw) - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int n = index / pooled_width / pooled_height / output_dim; - - // [start, end) interval for spatial sampling - const T* offset_bottom_rois = bottom_rois + n * 5; - int roi_batch_ind = offset_bottom_rois[0]; - T roi_start_w = static_cast( - roundf(offset_bottom_rois[1])) * spatial_scale; - T roi_start_h = static_cast( - roundf(offset_bottom_rois[2])) * spatial_scale; - T roi_end_w = static_cast( - roundf(offset_bottom_rois[3]) + 1.) * spatial_scale; - T roi_end_h = static_cast( - roundf(offset_bottom_rois[4]) + 1.) * spatial_scale; - - // Force too small ROIs to be 1x1 - T roi_width = c10::cuda::compat::max(roi_end_w - roi_start_w, static_cast(0.1)); //avoid 0 - T roi_height = c10::cuda::compat::max(roi_end_h - roi_start_h, static_cast(0.1)); - - // Compute w and h at bottom - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - int hstart = floor( - static_cast(ph)* bin_size_h + roi_start_h); - int wstart = floor( - static_cast(pw)* bin_size_w + roi_start_w); - int hend = ceil( - static_cast(ph + 1) * bin_size_h + roi_start_h); - int wend = ceil( - static_cast(pw + 1) * bin_size_w + roi_start_w); - // Add roi offsets and clip to input boundaries - hstart = min(max(hstart, 0), height); - hend = min(max(hend, 0), height); - wstart = min(max(wstart, 0), width); - wend = min(max(wend, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - // Compute c at bottom - int c = mapping_channel[index]; - T* offset_bottom_diff = - bottom_diff + (roi_batch_ind * channels + c) * height * width; - T bin_area = (hend - hstart) * (wend - wstart); - T diff_val = is_empty ? 0. : top_diff[index] / bin_area; - for (int h = hstart; h < hend; ++h){ - for (int w = wstart; w < wend; ++w){ - int bottom_index = h * width + w; - gpu_atomic_add(diff_val, offset_bottom_diff + bottom_index); - } - } - } -} - -} // namespace - -template<> -bool PSRoIPoolOp::RunOnDevice() { - auto& X = Input(0); // Input data to pool - auto& R = Input(1); // RoIs - - auto* Y = Output(0, {R.dim32(0), output_dim_, pooled_height_, pooled_width_}, at::dtype()); // PSRoI pooled data - auto* A = Output(1, Y->sizes(), at::dtype()); // mapping_channel - int output_size = Y->numel(); - PSRoIPoolForward<<>>( - output_size, X.data(), spatial_scale_, X.dim32(1), X.dim32(2), - X.dim32(3), pooled_height_, pooled_width_, R.data(), output_dim_, - group_size_, Y->mutable_data(), A->mutable_data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - return true; -} - - -template<> -bool PSRoIPoolGradientOp::RunOnDevice() { - auto& X = Input(0); // Input data to pool - auto& R = Input(1); // RoIs - auto& A = Input(2); // mapping channels - auto& dY = Input(3); // Gradient of net w.r.t. output of "forward" op - // (aka "gradOutput") - - auto* dX = Output(0, X.sizes(), at::dtype()); // Gradient of net w.r.t. 
input to "forward" op - // (aka "gradInput") - // Must zero-out dX before accumulating gradients - math::Set( - dX->size(), 0.f, dX->mutable_data(), &context_); - PSRoIPoolBackward<<>>( - dY.size(), dY.data(), A.data(), R.dim32(0), spatial_scale_, - X.dim32(1), X.dim32(2), X.dim32(3), pooled_height_, pooled_width_, - output_dim_, dX->mutable_data(), R.data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - return true; -} - - -REGISTER_CUDA_OPERATOR(PSRoIPool, - PSRoIPoolOp); -REGISTER_CUDA_OPERATOR(PSRoIPoolGradient, - PSRoIPoolGradientOp); -} // namespace caffe2 diff --git a/modules/detectron/ps_roi_pool_op.h b/modules/detectron/ps_roi_pool_op.h deleted file mode 100644 index ecee1dd7041c4..0000000000000 --- a/modules/detectron/ps_roi_pool_op.h +++ /dev/null @@ -1,93 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef PS_ROI_POOL_OP_H_ -#define PS_ROI_POOL_OP_H_ - -#include "caffe2/core/context.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/operator.h" -#include "caffe2/utils/math.h" - -namespace caffe2 { - -template -class PSRoIPoolOp final : public Operator { - public: - PSRoIPoolOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws), - spatial_scale_(this->template GetSingleArgument( - "spatial_scale", 1.)), - group_size_(this->template GetSingleArgument("group_size", 1)), - output_dim_(this->template GetSingleArgument("output_dim", 1)) { - TORCH_DCHECK_GT(spatial_scale_, 0); - TORCH_DCHECK_GT(group_size_, 0); - pooled_height_ = group_size_; - pooled_width_ = group_size_; - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float spatial_scale_; - int group_size_; - int output_dim_; - int pooled_height_; - int pooled_width_; - int channels_; - int height_; - int width_; - }; - -template -class PSRoIPoolGradientOp final : public Operator { - public: - PSRoIPoolGradientOp(const OperatorDef& def, Workspace* ws) - : Operator(def, ws), - spatial_scale_(this->template GetSingleArgument( - "spatial_scale", 1.)), - group_size_(this->template GetSingleArgument("group_size", 1)), - output_dim_(this->template GetSingleArgument("output_dim", 1)) { - TORCH_DCHECK_GT(spatial_scale_, 0); - TORCH_DCHECK_GT(group_size_, 0); - pooled_height_ = group_size_; - pooled_width_ = group_size_; - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float spatial_scale_; - int group_size_; - int output_dim_; - int pooled_height_; - int pooled_width_; - int channels_; - int height_; - int width_; -}; - -} // namespace caffe2 - -#endif // PS_ROI_POOL_OP_H_ diff --git a/modules/detectron/roi_pool_f_op.cc b/modules/detectron/roi_pool_f_op.cc deleted file mode 100644 index 81bf8bb62ed0a..0000000000000 --- a/modules/detectron/roi_pool_f_op.cc +++ /dev/null @@ -1,99 +0,0 @@ -/** - * Copyright (c) 2016-present, 
Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "roi_pool_f_op.h" - -namespace caffe2 { - -REGISTER_CPU_OPERATOR(RoIPoolF, RoIPoolFOp); -REGISTER_CPU_OPERATOR(RoIPoolFGradient, RoIPoolFGradientOp); - -OPERATOR_SCHEMA(RoIPoolF) - .NumInputs(2) - .NumOutputs(2) - .SetDoc(R"DOC( -Region of Interest (RoI) pooling operation as used in Fast R-CNN. -)DOC") - .Arg( - "spatial_scale", - "(float) default 1.0; Spatial scale of the input feature map X " - "relative to the input image. E.g., 0.0625 if X has a stride of 16 " - "w.r.t. the input image.") - .Arg( - "pooled_h", - "(int) default 1; Pooled output Y's height.") - .Arg( - "pooled_w", - "(int) default 1; Pooled output Y's width.") - .Input( - 0, - "X", - "4D feature map input of shape (N, C, H, W).") - .Input( - 1, - "RoIs", - "2D input of shape (R, 5) specifying R RoIs with five columns " - "representing: batch index in [0, N - 1], x1, y1, x2, y2. The RoI " - "coordinates are in the coordinate system of the input image.") - .Output( - 0, - "Y", - "4D output of shape (R, C, pooled_h, pooled_w). The r-th batch element " - "is a pooled feature map cooresponding to the r-th RoI.") - .Output( - 1, - "argmaxes", - "4D output of shape (R, C, pooled_h, pooled_w). Same as Y, except it " - "records the argmax indices rather than the max pooled values."); - -OPERATOR_SCHEMA(RoIPoolFGradient) - .NumInputs(4) - .NumOutputs(1) - .Input( - 0, - "X", - "See RoIPoolF.") - .Input( - 1, - "RoIs", - "See RoIPoolF.") - .Input( - 2, - "argmaxes", - "See RoIPoolF.") - .Input( - 3, - "dY", - "Gradient of forward output 0 (Y)") - .Output( - 0, - "dX", - "Gradient of forward input 0 (X)"); - -class GetRoIPoolFGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - return SingleGradientDef( - "RoIPoolFGradient", - "", - vector{I(0), I(1), O(1), GO(0)}, - vector{GI(0)}); - } -}; - -REGISTER_GRADIENT(RoIPoolF, GetRoIPoolFGradient); - -} // namespace caffe2 diff --git a/modules/detectron/roi_pool_f_op.cu b/modules/detectron/roi_pool_f_op.cu deleted file mode 100644 index b261911b95a16..0000000000000 --- a/modules/detectron/roi_pool_f_op.cu +++ /dev/null @@ -1,187 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include "caffe2/core/context_gpu.h" -#include "modules/detectron/roi_pool_f_op.h" - -namespace caffe2 { - -namespace { - -template -inline __device__ T gpu_atomic_add(const T val, T* address); - -template <> -inline __device__ -float gpu_atomic_add(const float val, float* address) { - return atomicAdd(address, val); -} - -template -__global__ void RoIPoolFForward(const int nthreads, const T* bottom_data, - const T spatial_scale, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const T* bottom_rois, T* top_data, int* argmax_data) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { - // (n, c, ph, pw) is an element in the pooled output - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int c = (index / pooled_width / pooled_height) % channels; - int n = index / pooled_width / pooled_height / channels; - - const T* offset_bottom_rois = bottom_rois + n * 5; - int roi_batch_ind = offset_bottom_rois[0]; - int roi_start_w = roundf(offset_bottom_rois[1] * spatial_scale); - int roi_start_h = roundf(offset_bottom_rois[2] * spatial_scale); - int roi_end_w = roundf(offset_bottom_rois[3] * spatial_scale); - int roi_end_h = roundf(offset_bottom_rois[4] * spatial_scale); - - // Force malformed ROIs to be 1x1 - int roi_width = max(roi_end_w - roi_start_w + 1, 1); - int roi_height = max(roi_end_h - roi_start_h + 1, 1); - T bin_size_h = static_cast(roi_height) - / static_cast(pooled_height); - T bin_size_w = static_cast(roi_width) - / static_cast(pooled_width); - - int hstart = static_cast(floor(static_cast(ph) - * bin_size_h)); - int wstart = static_cast(floor(static_cast(pw) - * bin_size_w)); - int hend = static_cast(ceil(static_cast(ph + 1) - * bin_size_h)); - int wend = static_cast(ceil(static_cast(pw + 1) - * bin_size_w)); - - // Add roi offsets and clip to input boundaries - hstart = min(max(hstart + roi_start_h, 0), height); - hend = min(max(hend + roi_start_h, 0), height); - wstart = min(max(wstart + roi_start_w, 0), width); - wend = min(max(wend + roi_start_w, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - // Define an empty pooling region to be zero - T maxval = is_empty ? 
0 : -FLT_MAX; - // If nothing is pooled, argmax = -1 causes nothing to be backprop'd - int maxidx = -1; - const T* offset_bottom_data = - bottom_data + (roi_batch_ind * channels + c) * height * width; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - int bottom_index = h * width + w; - if (offset_bottom_data[bottom_index] > maxval) { - maxval = offset_bottom_data[bottom_index]; - maxidx = bottom_index; - } - } - } - top_data[index] = maxval; - argmax_data[index] = maxidx; - } -} - -template -__global__ void RoIPoolFBackward(const int nthreads, const T* top_diff, - const int* argmax_data, const int num_rois, const T spatial_scale, - const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, T* bottom_diff, - const T* bottom_rois) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { - // (n, c, ph, pw) is an element in the pooled output - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int c = (index / pooled_width / pooled_height) % channels; - int n = index / pooled_width / pooled_height / channels; - - const T* offset_bottom_rois = bottom_rois + n * 5; - int roi_batch_ind = offset_bottom_rois[0]; - int bottom_offset = (roi_batch_ind * channels + c) * height * width; - int top_offset = (n * channels + c) * pooled_height * pooled_width; - const T* offset_top_diff = top_diff + top_offset; - T* offset_bottom_diff = bottom_diff + bottom_offset; - const int* offset_argmax_data = argmax_data + top_offset; - - int argmax = offset_argmax_data[ph * pooled_width + pw]; - if (argmax != -1) { - gpu_atomic_add( - static_cast(offset_top_diff[ph * pooled_width + pw]), - offset_bottom_diff + argmax); - } - } -} - -} // namespace - -template<> -bool RoIPoolFOp::RunOnDevice() { - auto& X = Input(0); // Input data to pool - auto& R = Input(1); // RoIs - - if (R.size() == 0) { - // Handle empty rois - std::vector sizes = {0, X.dim32(1), pooled_height_, pooled_width_}; - /* auto* Y = */ Output(0, sizes, at::dtype()); - /* auto* A = */ Output(1, sizes, at::dtype()); - return true; - } - - auto* Y = Output(0, {R.dim32(0), X.dim32(1), pooled_height_, pooled_width_}, at::dtype()); // RoI pooled data - auto* A = Output(1, Y->sizes(), at::dtype()); // argmaxes - int output_size = Y->size(); - RoIPoolFForward<<>>( - output_size, X.data(), spatial_scale_, X.dim32(1), X.dim32(2), - X.dim32(3), pooled_height_, pooled_width_, R.data(), - Y->mutable_data(), A->mutable_data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - return true; -} - - -template<> -bool RoIPoolFGradientOp::RunOnDevice() { - auto& X = Input(0); // Input data to pool - auto& R = Input(1); // RoIs - auto& A = Input(2); // argmaxes - auto& dY = Input(3); // Gradient of net w.r.t. output of "forward" op - // (aka "gradOutput") - - auto* dX = Output(0, X.sizes(), at::dtype()); // Gradient of net w.r.t. 
input to "forward" op - // (aka "gradInput") - // Must zero-out dX before accumulating gradients - math::Set( - dX->size(), 0.f, dX->mutable_data(), &context_); - if (dY.size() > 0) { // Handle possibly empty gradient if there were no rois - RoIPoolFBackward<<>>( - dY.size(), dY.data(), A.data(), R.dim32(0), spatial_scale_, - X.dim32(1), X.dim32(2), X.dim32(3), pooled_height_, pooled_width_, - dX->mutable_data(), R.data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - } - return true; -} - - -REGISTER_CUDA_OPERATOR(RoIPoolF, - RoIPoolFOp); -REGISTER_CUDA_OPERATOR(RoIPoolFGradient, - RoIPoolFGradientOp); -} // namespace caffe2 diff --git a/modules/detectron/roi_pool_f_op.h b/modules/detectron/roi_pool_f_op.h deleted file mode 100644 index 604c5606a203e..0000000000000 --- a/modules/detectron/roi_pool_f_op.h +++ /dev/null @@ -1,81 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ROI_POOL_F_OP_H_ -#define ROI_POOL_F_OP_H_ - -#include "caffe2/core/context.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/operator.h" -#include "caffe2/utils/math.h" - -namespace caffe2 { - -template -class RoIPoolFOp final : public Operator { - public: - RoIPoolFOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws), - spatial_scale_(this->template GetSingleArgument( - "spatial_scale", 1.)), - pooled_height_(this->template GetSingleArgument("pooled_h", 1)), - pooled_width_(this->template GetSingleArgument("pooled_w", 1)) { - TORCH_DCHECK_GT(spatial_scale_, 0); - TORCH_DCHECK_GT(pooled_height_, 0); - TORCH_DCHECK_GT(pooled_width_, 0); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float spatial_scale_; - int pooled_height_; - int pooled_width_; -}; - -template -class RoIPoolFGradientOp final : public Operator { - public: - RoIPoolFGradientOp(const OperatorDef& def, Workspace* ws) - : Operator(def, ws), - spatial_scale_(this->template GetSingleArgument( - "spatial_scale", 1.)), - pooled_height_(this->template GetSingleArgument("pooled_h", 1)), - pooled_width_(this->template GetSingleArgument("pooled_w", 1)) { - TORCH_DCHECK_GT(spatial_scale_, 0); - TORCH_DCHECK_GT(pooled_height_, 0); - TORCH_DCHECK_GT(pooled_width_, 0); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float spatial_scale_; - int pooled_height_; - int pooled_width_; -}; - -} // namespace caffe2 - -#endif // ROI_POOL_F_OP_H_ diff --git a/modules/detectron/sample_as_op.cc b/modules/detectron/sample_as_op.cc deleted file mode 100644 index d22cfb8194e60..0000000000000 --- a/modules/detectron/sample_as_op.cc +++ /dev/null @@ -1,81 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "sample_as_op.h" - -namespace caffe2 { - -REGISTER_CPU_OPERATOR(SampleAs, SampleAsOp); -REGISTER_CPU_OPERATOR(SampleAsGradient, SampleAsGradientOp); - -OPERATOR_SCHEMA(SampleAs) - .NumInputs(2) - .NumOutputs(1) - .SetDoc(R"DOC( -Select the batch elements from input tensor X where the corresponding input -label value is > 0. -)DOC") - .Input( - 0, - "X", - "Tensor of at least 1D shape (N, ...).") - .Input( - 1, - "labels", - "Tensor of type int with 1D shape (N, ).") - .Output( - 0, - "Y", - "Tensor with number of dims matching X, but with the length of dim 0 " - "equal to the number of non-zero elements in labels. The batch items " - "from X corresponding to the non-zero elements in labels are copied " - "into Y."); - -OPERATOR_SCHEMA(SampleAsGradient) - .NumInputs(3) - .NumOutputs(1) - .Input( - 0, - "X", - "See SampleAs.") - .Input( - 1, - "labels", - "See SampleAs." - ) - .Input( - 2, - "dY", - "Gradient of forward output 0 (Y).") - .Output( - 0, - "dX", - "Gradient of forward input 0 (X)."); - -class GetSampleAsGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - return SingleGradientDef( - "SampleAsGradient", - "", - vector{I(0), I(1), GO(0)}, - vector{GI(0)}); - } -}; - -REGISTER_GRADIENT(SampleAs, GetSampleAsGradient); - -} // namespace caffe2 diff --git a/modules/detectron/sample_as_op.cu b/modules/detectron/sample_as_op.cu deleted file mode 100644 index a58604de2b0d0..0000000000000 --- a/modules/detectron/sample_as_op.cu +++ /dev/null @@ -1,116 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* SampleAs by Kaiming He for Mask R-CNN -X.dim32(0) = L.dim32(0) -Y's output samples are the samples of X for which L > 0. -*/ -#include - -#include "caffe2/core/context_gpu.h" -#include "modules/detectron/sample_as_op.h" - -#include - -namespace caffe2 { - -template <> -bool SampleAsOp::RunOnDevice() { - auto& X = Input(0); // Input data to be sliced - auto& L = Input(1); // Target data that provide the identity - - CAFFE_ENFORCE( - X.dim32(0) == L.dim32(0), - "X.dim32(0) must be equal to L.dim32(0)", - "(", - X.dim32(0), - " vs. 
", - L.dim32(0), - ")"); - - // copy L to CPU: - std::vector labels(L.dim32(0)); - context_.CopyBytes( - L.dim32(0) * sizeof(int), L.data(), &labels[0]); - // Make sure that the copy is finished - context_.FinishDeviceComputation(); - - int count = 0; - for (int i = 0; i < L.dim32(0); i++) { - if (labels[i] > 0) { - count++; - } - } - assert(count > 0); - - // resize Y - vector out_shape(X.sizes().vec()); - out_shape[0] = count; - auto* Y = Output(0, out_shape, at::dtype()); // Sliced data (Y.dim32(0) = num of (L > 0)) - - const int len = X.size() / X.dim32(0); - - float* output = Y->mutable_data(); - for (int i = 0; i < L.dim32(0); i++) { - if (labels[i] > 0) { - context_.CopyBytes( - len * sizeof(float), X.data() + i * len, output); - output += len; - } // if - } // i - - return true; -} - -template <> -bool SampleAsGradientOp::RunOnDevice() { - auto& X = Input(0); - auto& L = Input(1); - auto& dY = Input(2); - - - auto* dX = Output(0, X.sizes(), at::dtype()); - - // copy L to CPU: - std::vector labels(L.dim32(0)); - context_.CopyBytes( - L.dim32(0) * sizeof(int), L.data(), &labels[0]); - // Make sure that the copy is finished - context_.FinishDeviceComputation(); - - // zero-out dX - math::Set( - dX->size(), 0.f, dX->mutable_data(), &context_); - - const int len = X.size() / X.dim32(0); - - const float* input = dY.data(); - for (int i = 0; i < L.dim32(0); i++) { - if (labels[i] > 0) { - context_.CopyBytes( - len * sizeof(float), input, dX->mutable_data() + i * len); - input += len; - } // if - } // i - - return true; -} - -REGISTER_CUDA_OPERATOR(SampleAs, SampleAsOp); -REGISTER_CUDA_OPERATOR( - SampleAsGradient, - SampleAsGradientOp); -} // namespace caffe2 diff --git a/modules/detectron/sample_as_op.h b/modules/detectron/sample_as_op.h deleted file mode 100644 index 70d2214e1c8cf..0000000000000 --- a/modules/detectron/sample_as_op.h +++ /dev/null @@ -1,55 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef SAMPLE_AS_OP_H_ -#define SAMPLE_AS_OP_H_ - -#include "caffe2/core/context.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/operator.h" -#include "caffe2/utils/math.h" - -namespace caffe2 { - -template -class SampleAsOp final : public Operator { - public: - SampleAsOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws) {} - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } -}; - -template -class SampleAsGradientOp final : public Operator { - public: - SampleAsGradientOp(const OperatorDef& def, Workspace* ws) - : Operator(def, ws) {} - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } -}; - -} // namespace caffe2 - -#endif // SAMPLE_AS_OP_H_ diff --git a/modules/detectron/select_smooth_l1_loss_op.cc b/modules/detectron/select_smooth_l1_loss_op.cc deleted file mode 100644 index 7f1441032acf6..0000000000000 --- a/modules/detectron/select_smooth_l1_loss_op.cc +++ /dev/null @@ -1,107 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "select_smooth_l1_loss_op.h" - -namespace caffe2 { - -REGISTER_CPU_OPERATOR( - SelectSmoothL1Loss, - SelectSmoothL1LossOp); -REGISTER_CPU_OPERATOR( - SelectSmoothL1LossGradient, - SelectSmoothL1LossGradientOp); - -OPERATOR_SCHEMA(SelectSmoothL1Loss) - .NumInputs(4) - .NumOutputs(1) - .SetDoc(R"DOC( -RetinaNet specific op for computing Smooth L1 Loss at select locations in a 4D -tensor that encodes bounding box regression predictions. -)DOC") - .Arg( - "beta", - "(float) default 1.0; L2 to L1 transition point.") - .Arg( - "scale", - "(float) default 1.0; multiply the loss by this scale factor.") - .Input( - 0, - "Y_hat", - "4D tensor of bounding box regression predictions with shape " - "(N, 4 * num_bbox_classes * num_anchors, H, W).") - .Input( - 1, - "Y", - "2D tensor of labels shape (M, 4) for 4 contiguous channels starting " - "at each of the M locations selected by the locations input.") - .Input( - 2, - "locations", - "2D tensor of shape (M, 4) that identifies M 'select' locations " - "encoded by the four columns: (n, c, y, x). 
The loss is computed on the " - "four contiguous channel locations [c, c + 3] (inclusive).") - .Input( - 3, - "normalizer", - "Scalar; the loss is divided by max(1, normalizer).") - .Output( - 0, - "loss", - "Scalar loss."); - -OPERATOR_SCHEMA(SelectSmoothL1LossGradient) - .NumInputs(5) - .NumOutputs(1) - .Input( - 0, - "Y_hat", - "See SelectSmoothL1Loss.") - .Input( - 1, - "Y", - "See SelectSmoothL1Loss.") - .Input( - 2, - "locations", - "See SelectSmoothL1Loss.") - .Input( - 3, - "normalizer", - "See SelectSmoothL1Loss.") - .Input( - 4, - "d_loss", - "Gradient of forward output 0 (loss).") - .Output( - 0, - "d_Y_hat", - "Gradient of forward input 0 (Y_hat)."); - -class GetSelectSmoothL1LossGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - return SingleGradientDef( - "SelectSmoothL1LossGradient", - "", - vector{I(0), I(1), I(2), I(3), GO(0)}, - vector{GI(0)}); - } -}; - -REGISTER_GRADIENT(SelectSmoothL1Loss, GetSelectSmoothL1LossGradient); - -} // namespace caffe2 diff --git a/modules/detectron/select_smooth_l1_loss_op.cu b/modules/detectron/select_smooth_l1_loss_op.cu deleted file mode 100644 index 72f1d563b4c92..0000000000000 --- a/modules/detectron/select_smooth_l1_loss_op.cu +++ /dev/null @@ -1,189 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
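The SelectSmoothL1Loss schema above evaluates Smooth L1 only at M selected (n, c, y, x) locations, over the four contiguous channels [c, c + 3], divides by max(1, normalizer), and multiplies by scale; the CUDA kernel further down implements exactly this element-wise rule. A host-side sketch of the forward loss (illustrative; the name and flattened-array interface are assumptions, not part of the deleted operator):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// loss = scale / max(1, normalizer) * sum over selected locations of
//        SmoothL1(Y_hat - Y), where SmoothL1(d) = 0.5*d^2/beta if |d| < beta,
//        and |d| - 0.5*beta otherwise.
float select_smooth_l1_loss_sketch(const std::vector<float>& Y_hat,   // N x D x H x W, flattened NCHW
                                   const std::vector<float>& Y,       // M x 4 targets
                                   const std::vector<int>& locations, // M rows of (n, c, y, x)
                                   int D, int H, int W,
                                   float normalizer, float beta = 1.f, float scale = 1.f) {
  const std::size_t M = Y.size() / 4;
  float loss = 0.f;
  for (std::size_t i = 0; i < M; ++i) {
    const int n = locations[i * 4 + 0], c = locations[i * 4 + 1];
    const int y = locations[i * 4 + 2], x = locations[i * 4 + 3];
    for (int j = 0; j < 4; ++j) {
      const std::size_t idx =
          (static_cast<std::size_t>(n) * D + (c + j)) * H * W +
          static_cast<std::size_t>(y) * W + x;
      const float d = Y_hat[idx] - Y[i * 4 + j];
      const float a = std::fabs(d);
      loss += (a < beta) ? 0.5f * d * d / beta : a - 0.5f * beta;
    }
  }
  return scale * loss / std::max(normalizer, 1.f);
}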
- */ - -#include "caffe2/core/context_gpu.h" -#include "modules/detectron/select_smooth_l1_loss_op.h" - -namespace caffe2 { - -namespace { -__global__ void SelectSmoothL1Kernel( - const int D, const int H, const int W, - const int M, const float* Y_hat, const float* Y, const float* L, float* out, - const float* S, const float beta) { - // f(x) = 0.5 * x^2 / beta if |x| < beta - // |x| - 0.5 * beta otherwise - CUDA_1D_KERNEL_LOOP(i, M) { - int n = L[i * 4]; - int c = L[i * 4 + 1]; - int y = L[i * 4 + 2]; - int x = L[i * 4 + 3]; - - for (int j = 0; j < 4; j++){ - // Y_hat: N x (A * CLS * 4) x H x W - int ind = n * (D * H * W) + (c + j) * (H * W) + y * W + x; - float y_hat = Y_hat[ind]; - float y = Y[i * 4 + j]; - float val = y_hat - y; - float abs_val = c10::cuda::compat::abs(val); - if (abs_val < beta) { - out[ind] = (0.5 * val * val / beta) / c10::cuda::compat::max(S[0], static_cast(1.0)); - } else { - out[ind] = (abs_val - 0.5 * beta) / c10::cuda::compat::max(S[0], static_cast(1.0)); - } - } - } -} - - -__global__ void SelectSmoothL1GradientKernel( - const int D, const int H, const int W, - const int M, - const float* Y_hat, - const float* Y, - const float* L, - float* out, - const float* d_loss_data, - float norm, - const float* S, - float beta) { - // f'(x) = x / beta if |x| < beta - // = sign(x) otherwise - // We also scale by norm * d_loss in this kernel for convenience - CUDA_1D_KERNEL_LOOP(i, M) { - int n = L[i * 4]; - int c = L[i * 4 + 1]; - int y = L[i * 4 + 2]; - int x = L[i * 4 + 3]; - float d_loss = *d_loss_data; - - for (int j = 0; j < 4; j++) { - int ind = n * (D * H * W) + (c + j) * (H * W) + y * W + x; - float y_hat = Y_hat[ind]; - float y = Y[i * 4 + j]; - float val = y_hat - y; - float abs_val = c10::cuda::compat::abs(val); - if (abs_val < beta) { - out[ind] = norm * d_loss * val / beta / c10::cuda::compat::max(S[0], static_cast(1.0)); - } else { - out[ind] = norm * d_loss * ((float(0) < val) - (val < float(0))) / c10::cuda::compat::max(S[0], static_cast(1.0)); - } - } - } -} -} // namespace - - -template<> -bool SelectSmoothL1LossOp::RunOnDevice() { - // bbox targets predictions, for example: N x (A * 4) H x W in cls-agnostic case - auto& Y_hat = Input(0); - // true targets: for example: M x 4 where M is the #fg boxes per fpn level - auto& Y = Input(1); - // locations of fg boxes: M x 4 - auto& L = Input(2); - // total number of fg boxes across all FPN levels: scalar - auto& S = Input(3); - - - auto* avg_loss = Output(0, vector(), at::dtype()); - if (Y.size() == 0){ - math::Set( - 1, static_cast(0), avg_loss->mutable_data(), &context_); - return true; - } - - int N = Y_hat.dim32(0); - int D = Y_hat.dim32(1); - int H = Y_hat.dim32(2); - int W = Y_hat.dim32(3); - - int M = Y.dim32(0); - - // initialization - buff_.ResizeLike(Y_hat); - math::Set( - 1, static_cast(0), avg_loss->mutable_data(), &context_); - math::Set( - buff_.size(), 0.0, buff_.mutable_data(), &context_); - - // Element-wise smooth l1 loss - // l := SelectSmoothL1((y_hat - y)) - SelectSmoothL1Kernel<<>>( - D, H, W, - M, Y_hat.data(), Y.data(), - L.data(), buff_.mutable_data(), - S.data(), beta_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - // Sum of all losses - // al := sum_i l_i - float* avg_loss_data = avg_loss->mutable_data(); - math::Sum( - buff_.size(), buff_.data(), avg_loss_data, &context_); - - // Average of input batch size - math::Scale( - 1, scale_, avg_loss_data, avg_loss_data, &context_); - return true; -} - -template<> -bool SelectSmoothL1LossGradientOp::RunOnDevice() { - auto& Y_hat = Input(0); - 
auto& Y = Input(1); - auto& L = Input(2); - auto& S = Input(3); - // Below is gradient of net w.r.t. avg_loss ("gradOutput"), should be all 1's - auto& d_avg_loss = Input(4); - - auto* d_Y_hat = Output(0, Y_hat.sizes(), at::dtype()); // gradient of net w.r.t. Y_hat ("gradInput") - math::Set( - d_Y_hat->size(), 0.0, d_Y_hat->mutable_data(), &context_); - if (Y.size() == 0){ - return true; - } - - int N = Y_hat.dim32(0); - int D = Y_hat.dim32(1); - int H = Y_hat.dim32(2); - int W = Y_hat.dim32(3); - - int M = Y.dim32(0); - // Element-wise weighted difference (can be used to ignore or reweight - // specific components) - // d := (y_hat - y) - // d_Y_hat := d_avg_loss * SelectSmoothL1'((y_hat - y)) - - SelectSmoothL1GradientKernel<<size()), - CAFFE_CUDA_NUM_THREADS, - 0, context_.cuda_stream()>>>( - D, H, W, M, Y_hat.data(), Y.data(), - L.data(), d_Y_hat->mutable_data(), - d_avg_loss.data(), scale_, S.data(), beta_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - return true; -} - - -REGISTER_CUDA_OPERATOR(SelectSmoothL1Loss, - SelectSmoothL1LossOp); -REGISTER_CUDA_OPERATOR(SelectSmoothL1LossGradient, - SelectSmoothL1LossGradientOp); -} // namespace caffe2 diff --git a/modules/detectron/select_smooth_l1_loss_op.h b/modules/detectron/select_smooth_l1_loss_op.h deleted file mode 100644 index b5a3badfde716..0000000000000 --- a/modules/detectron/select_smooth_l1_loss_op.h +++ /dev/null @@ -1,77 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef SELECT_SMOOTH_L1_LOSS_OP_H_ -#define SELECT_SMOOTH_L1_LOSS_OP_H_ - -#include "caffe2/core/context.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/operator.h" -#include "caffe2/utils/math.h" - -namespace caffe2 { - -template -class SelectSmoothL1LossOp final : public Operator { - public: - SelectSmoothL1LossOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws), - beta_(this->template GetSingleArgument("beta", 1.)), - scale_(this->template GetSingleArgument("scale", 1.)) { - CAFFE_ENFORCE(beta_ > 0); - CAFFE_ENFORCE(scale_ >= 0); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float beta_; // Transition point from L1 to L2 loss - float scale_; // Scale the loss by scale_ - int dim_; // dimension for 1 anchor prediction - Tensor buff_{Context::GetDeviceType()}; // Buffer for element-wise differences -}; - -template -class SelectSmoothL1LossGradientOp final : public Operator { - public: - SelectSmoothL1LossGradientOp(const OperatorDef& def, Workspace* ws) - : Operator(def, ws), - beta_(this->template GetSingleArgument("beta", 1.)), - scale_(this->template GetSingleArgument("scale", 1.)) { - CAFFE_ENFORCE(beta_ > 0); - CAFFE_ENFORCE(scale_ >= 0); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float beta_; // Transition point from L1 to L2 loss - float scale_; // Scale the loss by scale_ - int dim_; // dimension for 1 anchor prediction - Tensor buff_{Context::GetDeviceType()}; // Buffer for element-wise differences -}; - -} // namespace caffe2 - -#endif // SELECT_SMOOTH_L1_LOSS_OP_H_ diff --git a/modules/detectron/sigmoid_cross_entropy_loss_op.cc b/modules/detectron/sigmoid_cross_entropy_loss_op.cc deleted file mode 100644 index f45ff40174bbc..0000000000000 --- a/modules/detectron/sigmoid_cross_entropy_loss_op.cc +++ /dev/null @@ -1,96 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "sigmoid_cross_entropy_loss_op.h" - -namespace caffe2 { - -REGISTER_CPU_OPERATOR( - SigmoidCrossEntropyLoss, - SigmoidCrossEntropyLossOp); -REGISTER_CPU_OPERATOR( - SigmoidCrossEntropyLossGradient, - SigmoidCrossEntropyLossGradientOp); - -OPERATOR_SCHEMA(SigmoidCrossEntropyLoss) - .NumInputs(2) - .NumOutputs(1) - .SetDoc(R"DOC( -Compute sigmoid activations followed by averaged binary cross entropy loss. The -target values may be in {-1, 0, 1}, where -1 indicates that the corresponding -sample should be ignored and {0, 1} correspond to the binary classes 0 and 1. By -default the loss is divided by the number of targets > -1 and then multiplied by -the `scale` op argument. The divisive normalization may be disable by setting -the op argument `normalize` to 0 (the multiplication by `scale` still takes -effect). 
- -This op fuses sigmoid and cross entropy for numerical stability in both forward -and gradient computation. -)DOC") - .Arg( - "scale", - "(float) default 1.0; multiply the loss by this scale factor.") - .Arg( - "normalize", - "(int) default 1; if true, divide the loss by the number of targets > " - "-1.") - .Input( - 0, - "X", - "Tensor of predicted logits (shape must be at least 1D).") - .Input( - 1, - "targets", - "Tensor of targets of type int and same shape as logits X.") - .Output( - 0, - "loss", - "Scalar loss."); - -OPERATOR_SCHEMA(SigmoidCrossEntropyLossGradient) - .NumInputs(3) - .NumOutputs(1) - .Input( - 0, - "X", - "See SigmoidCrossEntropyLoss.") - .Input( - 1, - "targets", - "See SigmoidCrossEntropyLoss.") - .Input( - 2, - "d_loss", - "Gradient of forward output 0 (loss).") - .Output( - 0, - "dX", - "Gradient of forward input 0 (X)."); - -class GetSigmoidCrossEntropyLossGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - return SingleGradientDef( - "SigmoidCrossEntropyLossGradient", - "", - vector{I(0), I(1), GO(0)}, - vector{GI(0)}); - } -}; - -REGISTER_GRADIENT(SigmoidCrossEntropyLoss, GetSigmoidCrossEntropyLossGradient); - -} // namespace caffe2 diff --git a/modules/detectron/sigmoid_cross_entropy_loss_op.cu b/modules/detectron/sigmoid_cross_entropy_loss_op.cu deleted file mode 100644 index bb86560fcb01f..0000000000000 --- a/modules/detectron/sigmoid_cross_entropy_loss_op.cu +++ /dev/null @@ -1,190 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "caffe2/core/context_gpu.h" -#include "modules/detectron/sigmoid_cross_entropy_loss_op.h" - -namespace caffe2 { - -namespace { -__global__ void ElementwiseMaxKernel(const int n, float* data, const float a) { - CUDA_1D_KERNEL_LOOP(index, n) { - data[index] = (data[index] > a) ? data[index] : a; - } -} - -__global__ void SigmoidCrossEntropyLossKernel( - const int n, - const float* logits, - const int* targets, - float* losses, - float* counts) { - CUDA_1D_KERNEL_LOOP(index, n) { - if (targets[index] == -1) { - losses[index] = 0.; - counts[index] = 0.; - } else { - losses[index] = - -1. * logits[index] * (targets[index] - (logits[index] >= 0)) + - logf( - 1 + - expf(logits[index] - 2 * logits[index] * (logits[index] >= 0))); - counts[index] = 1.; - } - } -} - -__global__ void SigmoidCrossEntropyLossGradientKernel( - const int n, - const float* logits, - const int* targets, - float* d_logits, - float* counts) { - CUDA_1D_KERNEL_LOOP(index, n) { - if (targets[index] == -1) { - d_logits[index] = 0.; - counts[index] = 0.; - } else { - d_logits[index] = 1. / (1. + expf(-logits[index])) - targets[index]; - counts[index] = 1.; - } - } -} -} // namespace - -template <> -bool SigmoidCrossEntropyLossOp::RunOnDevice() { - auto& X = Input(0); - auto& T = Input(1); - - - CAFFE_ENFORCE( - X.size() == T.size(), - "Logit and target must have the same size", - "(", - X.size(), - " vs. 
", - T.size(), - ")"); - auto* avg_loss = Output(0, vector(), at::dtype()); - counts_.ResizeLike(X); - losses_.ResizeLike(X); - ReinitializeTensor(&normalizer_, vector(), at::dtype().device(CUDA)); - SigmoidCrossEntropyLossKernel<<< - CAFFE_GET_BLOCKS(X.size()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>( - X.size(), - X.data(), - T.data(), - losses_.mutable_data(), - counts_.mutable_data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - float* avg_loss_data = avg_loss->mutable_data(); - math::Sum( - losses_.size(), losses_.data(), avg_loss_data, &context_); - if (normalize_) { - float* normalizer_data = normalizer_.mutable_data(); - math::Sum( - counts_.size(), counts_.data(), normalizer_data, &context_); - // Prevent division by zero is all counts are zero - ElementwiseMaxKernel<<< - CAFFE_GET_BLOCKS(normalizer_.size()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>(normalizer_.size(), normalizer_data, 1e-5); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - math::Div( - 1, avg_loss_data, normalizer_data, avg_loss_data, &context_); - } - math::Scale( - 1, scale_, avg_loss_data, avg_loss_data, &context_); - - return true; -} - -template <> -bool SigmoidCrossEntropyLossGradientOp::RunOnDevice() { - auto& X = Input(0); - auto& T = Input(1); - auto& d_avg_loss = Input(2); - - - auto* dX = Output(0, X.sizes(), at::dtype()); - counts_.ResizeLike(X); - ReinitializeTensor(&normalizer_, vector(), at::dtype().device(CUDA)); - SigmoidCrossEntropyLossGradientKernel<<< - CAFFE_GET_BLOCKS(X.size()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>( - X.size(), - X.data(), - T.data(), - dX->mutable_data(), - counts_.mutable_data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - if (normalize_) { - float* normalizer_data = normalizer_.mutable_data(); - math::Sum( - counts_.size(), counts_.data(), normalizer_data, &context_); - // Prevent division by zero is all counts are zero - ElementwiseMaxKernel<<< - CAFFE_GET_BLOCKS(normalizer_.size()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>(normalizer_.size(), normalizer_data, 1e-5); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - math::Div( - 1, - d_avg_loss.data(), - normalizer_data, - normalizer_data, - &context_); - math::Scale( - 1, scale_, normalizer_data, normalizer_data, &context_); - math::Scale( - dX->size(), - normalizer_data, - dX->data(), - dX->mutable_data(), - &context_); - } else { - math::Scale( - dX->size(), - scale_, - dX->data(), - dX->mutable_data(), - &context_); - math::Scale( - dX->size(), - d_avg_loss.data(), - dX->data(), - dX->mutable_data(), - &context_); - } - return true; -} - -REGISTER_CUDA_OPERATOR( - SigmoidCrossEntropyLoss, - SigmoidCrossEntropyLossOp); -REGISTER_CUDA_OPERATOR( - SigmoidCrossEntropyLossGradient, - SigmoidCrossEntropyLossGradientOp); -} // namespace caffe2 diff --git a/modules/detectron/sigmoid_cross_entropy_loss_op.h b/modules/detectron/sigmoid_cross_entropy_loss_op.h deleted file mode 100644 index 680519e9bdea9..0000000000000 --- a/modules/detectron/sigmoid_cross_entropy_loss_op.h +++ /dev/null @@ -1,78 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef SIGMOID_CROSS_ENTROPY_LOSS_OP_H_ -#define SIGMOID_CROSS_ENTROPY_LOSS_OP_H_ - -#include "caffe2/core/context.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/operator.h" -#include "caffe2/utils/math.h" - -namespace caffe2 { - -template -class SigmoidCrossEntropyLossOp final : public Operator { - public: - SigmoidCrossEntropyLossOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws), - scale_(this->template GetSingleArgument("scale", 1.)), - normalize_(this->template GetSingleArgument("normalize", 1)) { - CAFFE_ENFORCE(scale_ >= 0); - CAFFE_ENFORCE(normalize_ == 0 || normalize_ == 1); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float scale_; - int normalize_; - Tensor losses_{Context::GetDeviceType()}; - Tensor counts_{Context::GetDeviceType()}; - Tensor normalizer_; -}; - -template -class SigmoidCrossEntropyLossGradientOp final : public Operator { - public: - SigmoidCrossEntropyLossGradientOp(const OperatorDef& def, Workspace* ws) - : Operator(def, ws), - scale_(this->template GetSingleArgument("scale", 1.)), - normalize_(this->template GetSingleArgument("normalize", 1)) { - CAFFE_ENFORCE(scale_ >= 0); - CAFFE_ENFORCE(normalize_ == 0 || normalize_ == 1); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float scale_; - int normalize_; - Tensor counts_{Context::GetDeviceType()}; - Tensor normalizer_; -}; - -} // namespace caffe2 - -#endif // SIGMOID_CROSS_ENTROPY_LOSS_OP_H_ diff --git a/modules/detectron/sigmoid_focal_loss_op.cc b/modules/detectron/sigmoid_focal_loss_op.cc deleted file mode 100644 index 583e9a0de3283..0000000000000 --- a/modules/detectron/sigmoid_focal_loss_op.cc +++ /dev/null @@ -1,119 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "sigmoid_focal_loss_op.h" - -namespace caffe2 { - -REGISTER_CPU_OPERATOR(SigmoidFocalLoss, SigmoidFocalLossOp); -REGISTER_CPU_OPERATOR( - SigmoidFocalLossGradient, - SigmoidFocalLossGradientOp); - -OPERATOR_SCHEMA(SigmoidFocalLoss) - .NumInputs(3) - .NumOutputs(1) - .SetDoc(R"DOC( -The binary form of Focal Loss designed for use in RetinaNet-like models. 
-The input is assumed to be unnormalized scores (sometimes called 'logits') -arranged in a 4D tensor with shape (N, C, H, W), where N is the number of -elements in the batch, H and W are the height and width, and C = num_anchors * -num_classes defines num_anchors 'groups' of logits, each of length -num_classes. For the binary form of Focal Loss, num_classes does not include -the background category. (So, for COCO, num_classes = 80, not 81.) - -The binary form of focal loss is: - - FL(p_t) = -alpha * (1 - p_t)**gamma * log(p_t), - -where p = sigmoid(x), p_t = p or 1 - p depending on if the label is 1 or 0, -respectively. - -See: https://arxiv.org/abs/1708.02002 for details. -)DOC") - .Arg( - "scale", - "(float) default 1.0; multiply the loss by this scale factor.") - .Arg( - "alpha", - "(float) default 0.25; Focal Loss's alpha hyper-parameter.") - .Arg( - "gamma", - "(float) default 1.0; Focal Loss's gamma hyper-parameter.") - .Arg( - "num_classes", - "(int) default 80; number of classes (excluding background).") - .Input( - 0, - "logits", - "4D tensor of sigmoid inputs (called 'scores' or 'logits') with shape " - "(N, C, H, W), where C = num_anchors * num_classes.") - .Input( - 1, - "labels", - "4D tensor of labels with shape (N, num_anchors, H, W). Each entry is " - "a class label in [0, num_classes - 1] (inclusive). The label " - "identifies the one class that should have a sigmoid target of 1.") - .Input( - 2, - "normalizer", - "Scalar; the loss is normalized by 1 / max(1, normalizer)." - ) - .Output( - 0, - "loss", - "Scalar loss."); - -OPERATOR_SCHEMA(SigmoidFocalLossGradient) - .NumInputs(4) - .NumOutputs(1) - .Input( - 0, - "logits", - "See SigmoidFocalLoss.") - .Input( - 1, - "labels", - "See SigmoidFocalLoss.") - .Input( - 2, - "normalizer", - "See SigmoidFocalLoss.") - .Input( - 3, - "d_loss", - "Gradient of forward output 0 (loss)") - .Output( - 0, - "d_logits", - "Gradient of forward input 0 (logits)"); - -class GetSigmoidFocalLossGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - - vector GetGradientDefs() override { - vector blob_names{ - {I(0), I(1), I(2), GO(0)}, - }; - - return SingleGradientDef( - "SigmoidFocalLossGradient", "", blob_names, vector{GI(0)}); - } -}; - -REGISTER_GRADIENT(SigmoidFocalLoss, GetSigmoidFocalLossGradient); - -} // namespace caffe2 diff --git a/modules/detectron/sigmoid_focal_loss_op.cu b/modules/detectron/sigmoid_focal_loss_op.cu deleted file mode 100644 index e6f2dea21b5df..0000000000000 --- a/modules/detectron/sigmoid_focal_loss_op.cu +++ /dev/null @@ -1,185 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
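The SigmoidFocalLoss doc above defines the binary focal loss FL(p_t) = -alpha * (1 - p_t)**gamma * log(p_t) with p = sigmoid(x). A per-logit sketch of that formula (illustrative; it keeps the alpha / (1 - alpha) weighting of positives versus negatives used by the deleted kernel but omits the anchor/class indexing and the division by the positive count):

#include <algorithm>
#include <cmath>

// Binary focal loss for one logit x and binary target t in {0, 1}:
//   p   = sigmoid(x)
//   p_t = p if t == 1, else 1 - p
//   FL  = -w * (1 - p_t)^gamma * log(p_t), with w = alpha for positives
//         and 1 - alpha for negatives.
float sigmoid_focal_loss_sketch(float x, int t, float gamma = 1.f, float alpha = 0.25f) {
  const float p = 1.f / (1.f + std::exp(-x));
  const float p_t = (t == 1) ? p : 1.f - p;
  const float w = (t == 1) ? alpha : 1.f - alpha;
  return -w * std::pow(1.f - p_t, gamma) * std::log(std::max(p_t, 1e-12f));
}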
- */ - -#include - -#include "caffe2/core/context_gpu.h" -#include "modules/detectron/sigmoid_focal_loss_op.h" - -namespace caffe2 { - -namespace { - -__global__ void SigmoidFocalLossKernel( - const int N, const int D, const int H, const int W, const float* logits, - const int* targets, const float* weight_pos, - const float gamma, const float alpha, - const int num_classes, float* losses) { - CUDA_1D_KERNEL_LOOP(i, N * D * H * W) { - int x = i % W; - int y = (i / W) % H; - int c = (i / (W * H)) % D; // channel, here D is channel dim in input NxDxHxW - int n = i / (W * H * D); // n in NxDxHxW - - int A = D / num_classes; // num_anchors = A - int a = c / num_classes; // current anchor out of A anchors in D = A * num_cls - int d = c % num_classes; // current class - int t = targets[n * (H * W * A) + a * (H * W) + y * W + x]; // target - - // check whether the class is true class or not. - // The target classes are in range 1 - 81 and the d is in range 0-80 - // because we predict A*80 dim, so for comparison purpose, compare t and (d+1) - float c1 = (t == (d + 1)); - float c2 = (t != -1 & t != (d + 1)); - - float Np = c10::cuda::compat::max(weight_pos[0], static_cast(1.0)); - float zn = (1.0 - alpha) / Np; - float zp = alpha / Np; - - // p = 1. / 1. + expf(-x) - float p = 1. / (1. + expf(-logits[i])); - - // (1 - p)**gamma * log(p) where - float term1 = powf((1. - p), gamma) * logf(c10::cuda::compat::max(p, FLT_MIN)); - // p**gamma * log(1 - p) - float term2 = - powf(p, gamma) * - (-1. * logits[i] * (logits[i] >= 0) - - logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))); - - losses[i] = 0.0; - losses[i] += -c1 * term1 * zp; - losses[i] += -c2 * term2 * zn; - } -} - -__global__ void SigmoidFocalLossGradientKernel( - const int N, const int D, const int H, const int W, const float* logits, - const int* targets, float* dX_data, const float* weight_pos, - const float gamma, const float alpha, const int num_classes, - const float* avg_loss) { - CUDA_1D_KERNEL_LOOP(i, N * D * H * W) { - float a_loss = avg_loss[0]; - int x = i % W; - int y = (i / W) % H; - int c = (i / (W * H)) % D; - int n = i / (W * H * D); - - int A = D / num_classes; // num_anchors - int a = c / num_classes; // current anchor - int d = c % num_classes; // current class - - float Np = c10::cuda::compat::max(weight_pos[0], static_cast(1.0)); - float zn = (1.0 - alpha) / Np; - float zp = alpha / Np; - int t = targets[n * (H * W * A) + a * (H * W) + y * W + x]; - - float c1 = (t == (d + 1)); - float c2 = (t != -1 & t != (d + 1)); - float p = 1. / (1. + expf(-logits[i])); - - // (1-p)**g * (1 - p - g*p*log(p)) - float term1 = - powf((1. - p), gamma) * - (1. - p - (p * gamma * logf(c10::cuda::compat::max(p, FLT_MIN)))); - // (p**g) * (g*(1-p)*log(1-p) - p) - float term2 = - powf(p, gamma) * - ((-1. * logits[i] * (logits[i] >= 0) - - logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))) * - (1. 
- p) * gamma - p); - dX_data[i] = 0.0; - dX_data[i] += -c1 * zp * term1; - dX_data[i] += -c2 * zn * term2; - dX_data[i] = dX_data[i] * a_loss; - } -} -} // namespace - -template<> -bool SigmoidFocalLossOp::RunOnDevice() { - // Input logits, for example: N x (A * 80) x H x W in cls-agnostic - auto& X = Input(0); - // Target, for example: N x A x H x W - auto& T = Input(1); - // Number of positive examples: scalar - auto& wp = Input(2); - // output avg Sigmoid focal loss as mentioned in RetinaNet paper - - - int N = X.dim32(0); - int D = X.dim32(1); - int H = X.dim32(2); - int W = X.dim32(3); - - auto* avg_loss = Output(0, vector(), at::dtype()); - losses_.ResizeLike(X); - float* avg_loss_data = avg_loss->mutable_data(); - - SigmoidFocalLossKernel<<>>( - N, D, H, W, X.data(), T.data(), - wp.data(), gamma_, alpha_, num_classes_, - losses_.mutable_data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - math::Sum( - losses_.size(), losses_.data(), avg_loss_data, &context_); - math::Scale( - 1, scale_, avg_loss_data, avg_loss_data, &context_); - - return true; -} - - -template<> -bool SigmoidFocalLossGradientOp::RunOnDevice() { - auto& X = Input(0); - auto& T = Input(1); - auto& wp = Input(2); - auto& d_avg_loss = Input(InputSize() - 1); - - - // get input shape - int N = X.dim32(0); - int D = X.dim32(1); - int H = X.dim32(2); - int W = X.dim32(3); - - auto* dX = Output(0, X.sizes(), at::dtype()); - - SigmoidFocalLossGradientKernel<<>>( - N, D, H, W, X.data(), T.data(), dX->mutable_data(), - wp.data(), gamma_, alpha_, num_classes_, - d_avg_loss.data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - math::Scale( - dX->size(), - scale_, - dX->data(), - dX->mutable_data(), - &context_); - - return true; -} - - -REGISTER_CUDA_OPERATOR(SigmoidFocalLoss, - SigmoidFocalLossOp); -REGISTER_CUDA_OPERATOR(SigmoidFocalLossGradient, - SigmoidFocalLossGradientOp); -} // namespace caffe2 diff --git a/modules/detectron/sigmoid_focal_loss_op.h b/modules/detectron/sigmoid_focal_loss_op.h deleted file mode 100644 index 7640e0bc8a430..0000000000000 --- a/modules/detectron/sigmoid_focal_loss_op.h +++ /dev/null @@ -1,83 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
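For reference, the two factors used by SigmoidFocalLossGradientKernel above follow directly from differentiating the focal loss with respect to the logit, with p = sigmoid(x) and dp/dx = p(1 - p) (a derivation sketch):

\[
\frac{\partial}{\partial x}\Bigl[-(1-p)^{\gamma}\log p\Bigr]
  = -(1-p)^{\gamma}\bigl(1 - p - \gamma\,p\log p\bigr),
\qquad
\frac{\partial}{\partial x}\Bigl[-p^{\gamma}\log(1-p)\Bigr]
  = -p^{\gamma}\bigl(\gamma\,(1-p)\log(1-p) - p\bigr).
\]

The kernel's term1 and term2 are these expressions with the leading minus sign factored out; the sign is restored, together with the zp / zn weights and the incoming d_loss, when the result is accumulated into dX.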
- */ - -#ifndef SIGMOID_FOCAL_LOSS_OP_H_ -#define SIGMOID_FOCAL_LOSS_OP_H_ - -#include "caffe2/core/context.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/operator.h" -#include "caffe2/utils/math.h" - -namespace caffe2 { - -template -class SigmoidFocalLossOp final : public Operator { - public: - SigmoidFocalLossOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws), - scale_(this->template GetSingleArgument("scale", 1.)), - num_classes_(this->template GetSingleArgument("num_classes", 80)), - gamma_(this->template GetSingleArgument("gamma", 1.)), - alpha_(this->template GetSingleArgument("alpha", 0.25)) { - CAFFE_ENFORCE(scale_ >= 0); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float scale_; - int num_classes_; - float gamma_; - float alpha_; - Tensor losses_{Context::GetDeviceType()}; - Tensor counts_{Context::GetDeviceType()}; -}; - -template -class SigmoidFocalLossGradientOp final : public Operator { - public: - SigmoidFocalLossGradientOp(const OperatorDef& def, Workspace* ws) - : Operator(def, ws), - scale_(this->template GetSingleArgument("scale", 1.)), - num_classes_(this->template GetSingleArgument("num_classes", 80)), - gamma_(this->template GetSingleArgument("gamma", 1.)), - alpha_(this->template GetSingleArgument("alpha", 0.25)) { - CAFFE_ENFORCE(scale_ >= 0); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float scale_; - int num_classes_; - float gamma_; - float alpha_; - Tensor counts_{Context::GetDeviceType()}; - Tensor weights_{Context::GetDeviceType()}; // unignored weights -}; - -} // namespace caffe2 - -#endif // SIGMOID_FOCAL_LOSS_OP_H_ diff --git a/modules/detectron/smooth_l1_loss_op.cc b/modules/detectron/smooth_l1_loss_op.cc deleted file mode 100644 index 9ea570ac9c1b0..0000000000000 --- a/modules/detectron/smooth_l1_loss_op.cc +++ /dev/null @@ -1,117 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "smooth_l1_loss_op.h" - -namespace caffe2 { - -REGISTER_CPU_OPERATOR(SmoothL1Loss, SmoothL1LossOp); -REGISTER_CPU_OPERATOR( - SmoothL1LossGradient, - SmoothL1LossGradientOp); - -OPERATOR_SCHEMA(SmoothL1Loss) - .NumInputs(4) - .NumOutputs(1) - .SetDoc(R"DOC( -Smooth L1 Loss is a minor variation of Huber loss in which the point of -transition between L2 loss and L1 loss is adjustable by a hyper-parameter beta: - - SmoothL1(x) = 0.5 * x^2 / beta if |x| < beta - |x| - 0.5 * beta otherwise. - -SmoothL1 is used in Fast R-CNN and descendants as the loss function for bounding -box regression. - -The loss computed by this op has a flexible form: - - scale / N * sum_i alpha_out[i] * SmoothL1(alpha_in[i] * (y_hat[i] - y[i])). - -The weights alpha_in and alpha_out are called the "inside" and "outside" -weights, respectively. 
The inside weights are typically set to either 0 or 1 to -implement ignoring (when 0) certain samples. The outside weights can be used -to implement a per-sample loss weight. The overall loss is scaled by scale / N, -where N is the number of batch elements in the input predictions. -)DOC") - .Arg( - "beta", - "(float) default 1.0; L2 to L1 transition point.") - .Arg( - "scale", - "(float) default 1.0; multiply the loss by this scale factor.") - .Input( - 0, - "Y_hat", - "Tensor of predictions (at least 1D).") - .Input( - 1, - "Y", - "Tensor of labels with the same shape as Y_hat.") - .Input( - 2, - "alpha_in", - "Tensor of inside weights with the same shape as Y.") - .Input( - 3, - "alpha_out", - "Tensor of outside weights with the same shape as Y.") - .Output( - 0, - "loss", - "Scalar loss."); - -OPERATOR_SCHEMA(SmoothL1LossGradient) - .NumInputs(5) - .NumOutputs(1) - .Input( - 0, - "Y_hat", - "See SmoothL1Loss.") - .Input( - 1, - "Y", - "See SmoothL1Loss.") - .Input( - 2, - "alpha_in", - "See SmoothL1Loss.") - .Input( - 3, - "alpha_out", - "See SmoothL1Loss.") - .Input( - 4, - "d_loss", - "Gradient of forward output 0 (loss).") - .Output( - 0, - "d_Y_hat", - "Gradient of forward input 0 (Y_hat)."); - -class GetSmoothL1LossGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - return SingleGradientDef( - "SmoothL1LossGradient", - "", - vector{I(0), I(1), I(2), I(3), GO(0)}, - vector{GI(0)}); - } -}; - -REGISTER_GRADIENT(SmoothL1Loss, GetSmoothL1LossGradient); - -} // namespace caffe2 diff --git a/modules/detectron/smooth_l1_loss_op.cu b/modules/detectron/smooth_l1_loss_op.cu deleted file mode 100644 index ad2d9148c72f0..0000000000000 --- a/modules/detectron/smooth_l1_loss_op.cu +++ /dev/null @@ -1,185 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
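The SmoothL1Loss doc above defines the loss as scale / N * sum_i alpha_out[i] * SmoothL1(alpha_in[i] * (y_hat[i] - y[i])), with the inside weights used to ignore samples and the outside weights acting as per-sample loss weights. A host-side sketch of the forward computation (illustrative; names and the flat-array interface are assumptions):

#include <cmath>
#include <cstddef>
#include <vector>

// Weighted Smooth L1 loss; N is the batch size along axis 0.
float smooth_l1_loss_sketch(const std::vector<float>& y_hat,
                            const std::vector<float>& y,
                            const std::vector<float>& alpha_in,
                            const std::vector<float>& alpha_out,
                            int N, float beta = 1.f, float scale = 1.f) {
  float loss = 0.f;
  for (std::size_t i = 0; i < y.size(); ++i) {
    const float d = alpha_in[i] * (y_hat[i] - y[i]);  // inside-weighted difference
    const float a = std::fabs(d);
    const float l = (a < beta) ? 0.5f * d * d / beta : a - 0.5f * beta;
    loss += alpha_out[i] * l;                         // outside-weighted per-element loss
  }
  return scale * loss / static_cast<float>(N);
}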
- */ - -#include "caffe2/core/context_gpu.h" -#include "modules/detectron/smooth_l1_loss_op.h" - -namespace caffe2 { - -namespace { -template -__global__ void SmoothL1Kernel( - const int n, const T* in, T* out, T beta) { - // f(x) = 0.5 * x^2 / beta if |x| < beta - // |x| - 0.5 * beta otherwise - CUDA_1D_KERNEL_LOOP(index, n) { - T val = in[index]; - T abs_val = c10::cuda::compat::abs(val); - if (abs_val < beta) { - out[index] = 0.5 * val * val / beta; - } else { - out[index] = abs_val - 0.5 * beta; - } - } -} - -template -__global__ void SmoothL1GradientKernel( - const int n, - const T* in, - T* out, - const T* d_loss_data, - T norm, - T beta) { - // f'(x) = x / beta if |x| < beta - // = sign(x) otherwise - // We also scale by norm * d_loss in this kernel for convenience - CUDA_1D_KERNEL_LOOP(index, n) { - T val = in[index]; - T abs_val = c10::cuda::compat::abs(val); - T d_loss = *d_loss_data; - if (abs_val < beta) { - out[index] = norm * d_loss * val / beta; - } else { - out[index] = norm * d_loss * ((T(0) < val) - (val < T(0))); - } - } -} -} // namespace - -template<> -bool SmoothL1LossOp::RunOnDevice() { - auto& Y_hat = Input(0); - auto& Y = Input(1); - auto& alpha_in = Input(2); - auto& alpha_out = Input(3); - - - int N = Y.dim32(0); - // Require the same number of elements along axis 0 (batch size), but - // otherwise don't care about the shape (just the number of elements) - CAFFE_ENFORCE_EQ(Y_hat.dim32(0), Y.dim32(0), - "Y_hat and Y must have the same number of elements along axis 0"); - CAFFE_ENFORCE_EQ(Y_hat.size(), Y.size(), - "Y_hat and Y must have the same number of elements"); - CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_in.size()); - CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_out.size()); - - auto* avg_loss = Output(0, vector(), at::dtype()); - buff_.ResizeLike(Y); - - // Difference - // d := y_hat - y - math::Sub( - Y.size(), Y_hat.data(), Y.data(), - buff_.mutable_data(), &context_); - // Element-wise weighted difference (can be used to ignore or reweight - // specific components) - // d := alpha_in * (y_hat - y) - math::Mul( - buff_.size(), buff_.data(), alpha_in.data(), - buff_.mutable_data(), &context_); - - // Element-wise smooth l1 loss - // l := SmoothL1(alpha_in * (y_hat - y)) - SmoothL1Kernel - <<>>( - buff_.size(), buff_.data(), buff_.mutable_data(), - beta_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - // Element-wise weighted smooth l1 loss (can be used to specify a per-element - // loss weight) - // l := alpha_out * SmoothL1(alpha_in * (y_hat - y)) - math::Mul( - buff_.size(), buff_.data(), alpha_out.data(), - buff_.mutable_data(), &context_); - // Sum of all losses - // al := sum_i l_i - float* avg_loss_data = avg_loss->mutable_data(); - math::Sum( - buff_.size(), buff_.data(), avg_loss_data, &context_); - // Average of input batch size - // al := 1/N * al - math::Scale( - 1, scale_ / N, avg_loss_data, avg_loss_data, &context_); - return true; -} - -template<> -bool SmoothL1LossGradientOp::RunOnDevice() { - auto& Y_hat = Input(0); - auto& Y = Input(1); - auto& alpha_in = Input(2); - auto& alpha_out = Input(3); - auto& d_avg_loss = Input(4); // gradient of net w.r.t. 
avg_loss ("gradOutput") - // We intentially don't compute gradients for Y, alpha_{in,out} since they - // are not needed (can change in the future if desired) - - int N = Y.dim32(0); - // Require the same number of elements along axis 0 (batch size), but - // otherwise don't care about the shape (just the number of elements) - CAFFE_ENFORCE_EQ(Y_hat.dim32(0), Y.dim32(0), - "Y_hat and Y must have the same number of elements along axis 0"); - CAFFE_ENFORCE_EQ(Y_hat.size(), Y.size(), - "Y_hat and Y must have the same number of elements"); - CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_in.size()); - CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_out.size()); - CAFFE_ENFORCE_EQ(d_avg_loss.size(), 1); - - auto* d_Y_hat = Output(0, Y_hat.sizes(), at::dtype()); // gradient of net w.r.t. Y_hat ("gradInput") - buff_.ResizeLike(Y); - - // Difference - // d := y_hat - y - math::Sub( - Y.size(), Y_hat.data(), Y.data(), - buff_.mutable_data(), &context_); - // Element-wise weighted difference (can be used to ignore or reweight - // specific components) - // d := alpha_in * (y_hat - y) - math::Mul( - buff_.size(), buff_.data(), alpha_in.data(), - buff_.mutable_data(), &context_); - // d_Y_hat := d_avg_loss / N * SmoothL1'(alpha_in * (y_hat - y)) - SmoothL1GradientKernel - <<>>( - buff_.size(), buff_.data(), d_Y_hat->mutable_data(), - d_avg_loss.data(), scale_ / N, beta_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - // Element-wise scale by alpha_in and alpha_out - math::Mul( - d_Y_hat->size(), d_Y_hat->data(), alpha_in.data(), - d_Y_hat->mutable_data(), &context_); - math::Mul( - d_Y_hat->size(), d_Y_hat->data(), alpha_out.data(), - d_Y_hat->mutable_data(), &context_); - return true; -} - - -REGISTER_CUDA_OPERATOR(SmoothL1Loss, - SmoothL1LossOp); -REGISTER_CUDA_OPERATOR(SmoothL1LossGradient, - SmoothL1LossGradientOp); -} // namespace caffe2 diff --git a/modules/detectron/smooth_l1_loss_op.h b/modules/detectron/smooth_l1_loss_op.h deleted file mode 100644 index 5e5cfd882930e..0000000000000 --- a/modules/detectron/smooth_l1_loss_op.h +++ /dev/null @@ -1,75 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef SMOOTH_L1_LOSS_OP_H_ -#define SMOOTH_L1_LOSS_OP_H_ - -#include "caffe2/core/context.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/operator.h" -#include "caffe2/utils/math.h" - -namespace caffe2 { - -template -class SmoothL1LossOp final : public Operator { - public: - SmoothL1LossOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws), - beta_(this->template GetSingleArgument("beta", 1.)), - scale_(this->template GetSingleArgument("scale", 1.)) { - CAFFE_ENFORCE(beta_ > 0); - CAFFE_ENFORCE(scale_ >= 0); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float beta_; // Transition point from L1 to L2 loss - float scale_; // Scale the loss by scale_ - Tensor buff_{Context::GetDeviceType()}; // Buffer for element-wise differences -}; - -template -class SmoothL1LossGradientOp final : public Operator { - public: - SmoothL1LossGradientOp(const OperatorDef& def, Workspace* ws) - : Operator(def, ws), - beta_(this->template GetSingleArgument("beta", 1.)), - scale_(this->template GetSingleArgument("scale", 1.)) { - CAFFE_ENFORCE(beta_ > 0); - CAFFE_ENFORCE(scale_ >= 0); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float beta_; // Transition point from L1 to L2 loss - float scale_; // Scale the loss by scale_ - Tensor buff_{Context::GetDeviceType()}; // Buffer for element-wise differences -}; - -} // namespace caffe2 - -#endif // SMOOTH_L1_LOSS_OP_H_ diff --git a/modules/detectron/softmax_focal_loss_op.cc b/modules/detectron/softmax_focal_loss_op.cc deleted file mode 100644 index 7bc44571f7a5e..0000000000000 --- a/modules/detectron/softmax_focal_loss_op.cc +++ /dev/null @@ -1,104 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "modules/detectron/softmax_focal_loss_op.h" - -#include "caffe2/operators/softmax_utils.h" - -namespace caffe2 { - -REGISTER_CPU_OPERATOR(SoftmaxFocalLoss, SoftmaxFocalLossOp); -REGISTER_CPU_OPERATOR( - SoftmaxFocalLossGradient, - SoftmaxFocalLossGradientOp); - -OPERATOR_SCHEMA(SoftmaxFocalLoss) - .NumInputs(3) - .NumOutputs(2) - .SetDoc(R"DOC( -A multiclass form of Focal Loss designed for use in RetinaNet-like models. -The input is assumed to be unnormalized scores (sometimes called 'logits') -arranged in a 4D tensor with shape (N, C, H, W), where N is the number of -elements in the batch, H and W are the height and width, and C = num_anchors * -num_classes. The softmax is applied num_anchors times along the C axis. - -The softmax version of focal loss is: - - FL(p_t) = -alpha * (1 - p_t)**gamma * log(p_t), - -where p_i = exp(s_i) / sum_j exp(s_j), t is the target (ground truth) class, and -s_j is the unnormalized score for class j. - -See: https://arxiv.org/abs/1708.02002 for details. 
-)DOC") - .Arg( - "scale", - "(float) default 1.0; multiply the loss by this scale factor.") - .Arg("alpha", "(float) default 0.25; Focal Loss's alpha hyper-parameter.") - .Arg("gamma", "(float) default 1.0; Focal Loss's gamma hyper-parameter.") - .Arg( - "num_classes", - "(int) default 81; number of classes in each softmax group.") - .Input( - 0, - "scores", - "4D tensor of softmax inputs (called 'scores' or 'logits') with shape " - "(N, C, H, W), where C = num_anchors * num_classes defines num_anchors " - "groups of contiguous num_classes softmax inputs.") - .Input( - 1, - "labels", - "4D tensor of labels with shape (N, num_anchors, H, W). Each entry is " - "a class label in [0, num_classes - 1] (inclusive).") - .Input( - 2, - "normalizer", - "Scalar; the loss is normalized by 1 / max(1, normalizer).") - .Output(0, "loss", "Scalar loss.") - .Output( - 1, - "probabilities", - "4D tensor of softmax probabilities with shape (N, C, H, W), where " - "C = num_anchors * num_classes, and softmax was applied to each of the " - "num_anchors groups; within a group the num_classes values sum to 1."); - -OPERATOR_SCHEMA(SoftmaxFocalLossGradient) - .NumInputs(5) - .NumOutputs(1) - .Input(0, "scores", "See SoftmaxFocalLoss.") - .Input(1, "labels", "See SoftmaxFocalLoss.") - .Input(2, "normalizer", "See SoftmaxFocalLoss.") - .Input( - 3, - "probabilities", - "Output 1 from SoftmaxFocalLoss; See SoftmaxFocalLoss.") - .Input(4, "d_loss", "Gradient of forward output 0 (loss)") - .Output(0, "d_scores", "Gradient of forward input 0 (scores)"); - -class GetSoftmaxFocalLossGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - return SingleGradientDef( - "SoftmaxFocalLossGradient", - "", - vector{I(0), I(1), I(2), O(1), GO(0)}, - vector{GI(0)}); - } -}; - -REGISTER_GRADIENT(SoftmaxFocalLoss, GetSoftmaxFocalLossGradient); - -} // namespace caffe2 diff --git a/modules/detectron/softmax_focal_loss_op.cu b/modules/detectron/softmax_focal_loss_op.cu deleted file mode 100644 index 0612ef7edcc8c..0000000000000 --- a/modules/detectron/softmax_focal_loss_op.cu +++ /dev/null @@ -1,256 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include "caffe2/core/context_gpu.h" -#include "modules/detectron/softmax_focal_loss_op.h" - -namespace caffe2 { - -namespace { - -__global__ void SpatialSoftmaxKernel(const int N, const int A, - const int H, const int W, const float* Xdata, float* Pdata, - const int num_classes) { - CUDA_1D_KERNEL_LOOP(index, N * A * H * W) { - int D = num_classes * A; - int x = index % W; - int y = (index / W) % H; - int a = (index / (W * H)) % A; - int i = index / W / H / A; - - // Subtract max on each cell for numerical reasons - float max_val = -FLT_MAX; - for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) { - int idx = i * (H * W * D) + c * (H * W) + y * W + x; - max_val = max(max_val, Xdata[idx]); - } - // Exponentiate - float expsum = 0.0f; - for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) { - int idx = i * (H * W * D) + c * (H * W) + y * W + x; - float expx = exp(Xdata[idx] - max_val); - Pdata[idx] = expx; - expsum += expx; - } - // Normalize - for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) { - int idx = i * (H * W * D) + c * (H * W) + y * W + x; - Pdata[idx] /= expsum; - } - } -} - - -__global__ void SoftmaxFocalLossKernel( - const int N, const int A, const int H, const int W, - const float* Pdata, const int* targets, float* losses, - const float* weight_pos, const float gamma, const float alpha, - const int num_classes) { - CUDA_1D_KERNEL_LOOP(i, N * A * H * W) { - int D = A * num_classes; - int x = i % W; - int y = (i / W) % H; - int a = (i / (W * H)) % A; - int n = i / (W * H * A); - const int label = static_cast(targets[i]); - - float Np = c10::cuda::compat::max(weight_pos[0], static_cast(1.0)); - float z = (label == 0) * (1 - alpha) / Np + - (label >= 1) * alpha / Np; - - losses[i] = 0.0; - if (label >= 0) { - int offset = a * num_classes; - int idx = n * (H * W * D) + (offset + label) * (H * W) + y * W + x; - losses[i] = - -(pow(1.0f - Pdata[idx], gamma) * - log(c10::cuda::compat::max(Pdata[idx], FLT_MIN))) * z; - } - } -} - - -__global__ void SoftmaxFocalLossGradientWeightKernel( - const int N, const int A, const int H, const int W, - const float* Pdata, const int* targets, float* buff, - const float* weight_pos, const float gamma, const float alpha, - const int num_classes) { - CUDA_1D_KERNEL_LOOP(i, N * A * H * W) { - int D = A * num_classes; - int x = i % W; - int y = (i / W) % H; - int a = (i / (W * H)) % A; - int n = i / (W * H * A); - const int label = static_cast(targets[i]); - float Np = c10::cuda::compat::max(weight_pos[0], static_cast(1.0)); - float z = (label == 0) * (1 - alpha) / Np + - (label >= 1) * alpha / Np; - - buff[i] = 0.0; - if (label >= 0) { - int offset = a * num_classes; - int idx = n * (H * W * D) + (offset + label) * (H * W) + y * W + x; - float onemp = 1. 
- Pdata[idx]; - float p = Pdata[idx]; - buff[i] = - (-pow(onemp, gamma) + - gamma * pow(onemp, gamma - 1) * p * log(c10::cuda::compat::max(p, FLT_MIN))) * z; - } - } -} - - -__global__ void SoftmaxFocalLossGradientKernel( - const int N, const int D, const int H, const int W, - const float* Pdata, const int* targets, const float* buff, - const float* d_loss_data, float* dX, const int num_classes) { - CUDA_1D_KERNEL_LOOP(i, N * D * H * W) { - int A = D / num_classes; - int x = i % W; - int y = (i / W) % H; - int d = (i / (W * H)) % D; - int a = d / num_classes; - int c = d % num_classes; - int n = i / (W * H * D); - float d_loss = *d_loss_data; - - int ind = n * (H * W * A) + a * (H * W) + y * W + x; - const int label = static_cast(targets[ind]); - - float c1 = (label >= 0) * 1.0; - float c2 = (label == c) * 1.0; - dX[i] = 0.0; - dX[i] = c1 * d_loss * buff[ind] * (c2 - Pdata[i]); - } -} - -} // namespace - - -template <> -bool SoftmaxFocalLossOp::RunOnDevice() { - auto& X = Input(0); // Logits - auto& T = Input(1); // Labels - auto& wp = Input(2); // num of foreground - // average loss as output - // softmax probability, going to be re-used in gradient - - int N = X.dim32(0); - int D = X.dim32(1); - int H = X.dim32(2); - int W = X.dim32(3); - int A = D / num_classes_; - - ReinitializeTensor(&losses_, {N * A * H * W}, at::dtype().device(CUDA)); - auto* P = Output(1, {N * D * H * W}, at::dtype()); - auto* avg_loss = Output(0, vector(), at::dtype()); - math::Set( - avg_loss->size(), 0.f, avg_loss->mutable_data(), &context_); - math::Set( - P->size(), 0.f, P->mutable_data(), &context_); - math::Set( - losses_.size(), 0.f, losses_.mutable_data(), &context_); - TORCH_DCHECK_EQ(X.ndim(), 4); - - const float* Xdata = X.data(); - const float* Wdata = wp.data(); - - - // Spatial Softmax Kernel - SpatialSoftmaxKernel - <<>>( - N, A, H, W, Xdata, P->mutable_data(), num_classes_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - // Compute loss for each x,y location - const int* Tdata = T.data(); - SoftmaxFocalLossKernel - <<>>( - N, A, H, W, P->data(), Tdata, losses_.mutable_data(), - Wdata, gamma_, alpha_, num_classes_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - // sum the losses - float* avg_loss_data = avg_loss->mutable_data(); - math::Sum( - losses_.size(), losses_.data(), avg_loss_data, &context_); - math::Scale( - 1, scale_, avg_loss_data, avg_loss_data, &context_); - - return true; -} - - -template<> -bool SoftmaxFocalLossGradientOp::RunOnDevice() { - auto& X = Input(0); // Logits - auto& T = Input(1); // Label - auto& wp = Input(2); // num of foreground example - auto& P = Input(3); // Softmax Probability - auto& d_avg_loss = Input(4); - - - int N = X.dim32(0); - int D = X.dim32(1); - int H = X.dim32(2); - int W = X.dim32(3); - int A = D / num_classes_; - - ReinitializeTensor(&buff_, {N * A * H * W}, at::dtype().device(CUDA)); - - auto* dX = Output(0, X.sizes(), at::dtype()); // gradient wrt logits - - const float* Xdata = X.data(); - const int* Tdata = T.data(); - const float* Pdata = P.data(); - const float* Wdata = wp.data(); - - - // Compute the weight for gradients - SoftmaxFocalLossGradientWeightKernel - <<>>( - N, A, H, W, Pdata, Tdata, buff_.mutable_data(), - Wdata, gamma_, alpha_, num_classes_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - // Compute the gradient with the weights - const float* Bdata = buff_.data(); - SoftmaxFocalLossGradientKernel - <<>>( - N, D, H, W, Pdata, Tdata, Bdata, d_avg_loss.data(), - dX->mutable_data(), num_classes_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - math::Scale( - dX->size(), 
- scale_, - dX->data(), - dX->mutable_data(), - &context_); - return true; -} - - -REGISTER_CUDA_OPERATOR(SoftmaxFocalLoss, - SoftmaxFocalLossOp); -REGISTER_CUDA_OPERATOR(SoftmaxFocalLossGradient, - SoftmaxFocalLossGradientOp); -} // namespace caffe2 diff --git a/modules/detectron/softmax_focal_loss_op.h b/modules/detectron/softmax_focal_loss_op.h deleted file mode 100644 index 413c5bd6d7054..0000000000000 --- a/modules/detectron/softmax_focal_loss_op.h +++ /dev/null @@ -1,91 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef SOFTMAX_FOCAL_LOSS_OP_H_ -#define SOFTMAX_FOCAL_LOSS_OP_H_ - -#include "caffe2/core/context.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/operator.h" -#include "caffe2/utils/math.h" - -namespace caffe2 { - -template -class SoftmaxFocalLossOp final : public Operator { - public: - SoftmaxFocalLossOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws), - scale_(this->template GetSingleArgument("scale", 1.)), - gamma_(this->template GetSingleArgument("gamma", 1.)), - alpha_(this->template GetSingleArgument("alpha", 0.25)), - num_classes_(this->template GetSingleArgument("num_classes", 81)), - order_(StringToStorageOrder( - this->template GetSingleArgument("order", "NCHW"))) { - CAFFE_ENFORCE(scale_ >= 0); - CAFFE_ENFORCE_EQ( - order_, StorageOrder::NCHW, "Only NCHW order is supported right now."); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float scale_; - float gamma_; - float alpha_; - int num_classes_; - StorageOrder order_; - Tensor losses_; -}; - -template -class SoftmaxFocalLossGradientOp final : public Operator { - public: - SoftmaxFocalLossGradientOp(const OperatorDef& def, Workspace* ws) - : Operator(def, ws), - scale_(this->template GetSingleArgument("scale", 1.)), - gamma_(this->template GetSingleArgument("gamma", 1.)), - alpha_(this->template GetSingleArgument("alpha", 0.25)), - num_classes_(this->template GetSingleArgument("num_classes", 81)), - order_(StringToStorageOrder( - this->template GetSingleArgument("order", "NCHW"))) { - CAFFE_ENFORCE(scale_ >= 0); - CAFFE_ENFORCE_EQ( - order_, StorageOrder::NCHW, "Only NCHW order is supported right now."); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float scale_; - float gamma_; - float alpha_; - int num_classes_; - StorageOrder order_; - Tensor buff_; -}; - -} // namespace caffe2 - -#endif // SOFTMAX_FOCAL_LOSS_OP_H_ diff --git a/modules/detectron/spatial_narrow_as_op.cc b/modules/detectron/spatial_narrow_as_op.cc deleted file mode 100644 index 363aa63a8f122..0000000000000 --- a/modules/detectron/spatial_narrow_as_op.cc +++ /dev/null @@ -1,79 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "spatial_narrow_as_op.h" - -namespace caffe2 { - -REGISTER_CPU_OPERATOR(SpatialNarrowAs, SpatialNarrowAsOp); -REGISTER_CPU_OPERATOR( - SpatialNarrowAsGradient, - SpatialNarrowAsGradientOp); - -OPERATOR_SCHEMA(SpatialNarrowAs) - .NumInputs(2) - .NumOutputs(1) - .SetDoc(R"DOC( -Reduces ("narrows") the spatial extent of A to that of B by removing rows and -columns from the bottom and right. -)DOC") - .Input( - 0, - "A", - "3D or 4D input of shape (N, H0, W0) or (N, C, H0, W0).") - .Input( - 1, - "B", - "3D or 4D input of shape (N, H1, W1) or (N, C, H1, W1), where H1 <= H0 " - "and W1 <= W0.") - .Output( - 0, - "C", - "Sub window of A containing rows [0, H1 - 1] (inclusive) and columns " - "[0, W1 - 1] (inclusive)."); - -OPERATOR_SCHEMA(SpatialNarrowAsGradient) - .NumInputs(3) - .NumOutputs(1) - .Input( - 0, - "A", - "See SpatialNarrowAs.") - .Input( - 1, - "B", - "See SpatialNarrowAs.") - .Input( - 2, - "dC", - "Gradient of forward output 0 (C).") - .Output( - 0, - "dA", - "Gradient of forward input 0 (A)"); - -class SpatialNarrowAsGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - return SingleGradientDef( - "SpatialNarrowAsGradient", "", - vector{I(0), I(1), GO(0)}, - vector{GI(0)}); - } -}; -REGISTER_GRADIENT(SpatialNarrowAs, SpatialNarrowAsGradient); - -} // namespace caffe2 diff --git a/modules/detectron/spatial_narrow_as_op.cu b/modules/detectron/spatial_narrow_as_op.cu deleted file mode 100644 index ff8b5632e80a8..0000000000000 --- a/modules/detectron/spatial_narrow_as_op.cu +++ /dev/null @@ -1,165 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
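The SoftmaxFocalLoss CUDA operator and its header are removed above with no successor in this patch. For anyone still depending on them, here is a simplified libtorch sketch of the same loss; it is illustrative only, assumes int64 targets with -1 as the ignore label, applies a single alpha weight to every class, and omits the per-anchor reshaping and the foreground-count normalizer (the wp input) of the original operator.

    // Simplified stand-in for the removed Caffe2 SoftmaxFocalLoss (forward only).
    // logits: (N, C, H, W) float; targets: (N, H, W) int64, -1 = ignore.
    #include <torch/torch.h>

    torch::Tensor softmax_focal_loss(
        const torch::Tensor& logits,
        const torch::Tensor& targets,
        double gamma = 2.0,
        double alpha = 0.25,
        double scale = 1.0) {
      auto logp = torch::log_softmax(logits, /*dim=*/1);         // (N, C, H, W)
      auto valid = targets >= 0;                                 // mask out ignore labels
      auto idx = targets.clamp_min(0).unsqueeze(1).to(torch::kLong);
      auto logpt = logp.gather(1, idx).squeeze(1);               // log p_t per pixel
      auto pt = logpt.exp();
      auto loss = -alpha * torch::pow(1.0 - pt, gamma) * logpt;  // focal modulation
      return scale * (loss * valid).sum();
    }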
- */ - -#include "caffe2/core/context_gpu.h" -#include "caffe2/core/operator.h" -#include "modules/detectron/spatial_narrow_as_op.h" - -namespace caffe2 { - -namespace { -template -__global__ void CopyKernel( - const int N, - const int C, - const int in_H, - const int in_W, - const int out_H, - const int out_W, - const T* in_data, - T* out_data) { - CUDA_1D_KERNEL_LOOP(index, N * C * out_H * out_W) { - int w = index % out_W; - int h = (index / out_W) % out_H; - int c = (index / out_W / out_H) % C; - int n = (index / out_W / out_H / C); - int in_index = n * C * in_H * in_W + c * in_H * in_W + h * in_W + w; - int out_index = n * C * out_H * out_W + c * out_H * out_W + h * out_W + w; - out_data[out_index] = in_data[in_index]; - } -} - -template -__global__ void CopyGradientKernel( - const int N, - const int C, - const int in_H, - const int in_W, - const int out_H, - const int out_W, - const T* in_data, - T* out_data) { - CUDA_1D_KERNEL_LOOP(index, N * C * in_H * in_W) { - int w = index % in_W; - int h = (index / in_W) % in_H; - int c = (index / in_W / in_H) % C; - int n = (index / in_W / in_H / C); - int in_index = n * C * in_H * in_W + c * in_H * in_W + h * in_W + w; - int out_index = n * C * out_H * out_W + c * out_H * out_W + h * out_W + w; - out_data[out_index] = in_data[in_index]; - } -} -} // namespace - - -template <> -bool SpatialNarrowAsOp::RunOnDevice() { - return DispatchHelper>::call(this, Input(0)); -} - -template <> -template -bool SpatialNarrowAsOp::DoRunWithType() { - // Narrows input 0 (A) spatially to match input 1 (B) - auto& A = Input(0); - auto& B = Input(1); - - - CAFFE_ENFORCE_EQ(A.dim32(0), B.dim32(0), "Input dim 0 must be equal."); - std::vector sizes; - if (A.ndim() == B.ndim()) { - CAFFE_ENFORCE_EQ(A.dim32(1), B.dim32(1), "Input dim 1 must be equal."); - CAFFE_ENFORCE_GE( - A.dim32(2), B.dim32(2), "Input 0 height must be >= input 1 height."); - CAFFE_ENFORCE_GE( - A.dim32(3), B.dim32(3), "Input 0 width must be >= input 1 width."); - sizes = B.sizes().vec(); - } else { - // For (N, H, W) case - CAFFE_ENFORCE_EQ(A.ndim() - 1, B.ndim(), "Dimension mismatch."); - CAFFE_ENFORCE_GE( - A.dim32(2), B.dim32(1), "Input 0 height must be >= input 1 height."); - CAFFE_ENFORCE_GE( - A.dim32(3), B.dim32(2), "Input 0 width must be >= input 1 width."); - sizes = {A.dim32(0), A.dim32(1), B.dim32(1), B.dim32(2)}; - } - auto* C = Output(0, sizes, at::dtype()); - int out_width = C->dim32(3); - int out_height = C->dim32(2); - int in_width = A.dim32(3); - int in_height = A.dim32(2); - - CopyKernel<<< - CAFFE_GET_BLOCKS(C->size()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>( - C->dim32(0), - C->dim32(1), - in_height, - in_width, - out_height, - out_width, - A.template data(), - C->template mutable_data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - return true; -} - -template <> -bool SpatialNarrowAsGradientOp::RunOnDevice() { - return DispatchHelper>::call(this, Input(0)); -} - -template <> -template -bool SpatialNarrowAsGradientOp::DoRunWithType() { - auto& A = Input(0); - auto& B = Input(1); - auto& dC = Input(2); // Gradient of net w.r.t. output of forward op - auto* dA = Output(0, A.sizes(), at::dtype()); // Gradient of net w.r.t. 
input to forward op - - math::Set( - dA->size(), 0.f, dA->template mutable_data(), &context_); - int out_width = dA->dim32(3); - int out_height = dA->dim32(2); - int in_width = dC.dim32(3); - int in_height = dC.dim32(2); - - CopyGradientKernel<<< - CAFFE_GET_BLOCKS(dC.size()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>( - dA->dim32(0), - dA->dim32(1), - in_height, - in_width, - out_height, - out_width, - dC.template data(), - dA->template mutable_data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - return true; -} - -REGISTER_CUDA_OPERATOR(SpatialNarrowAs, SpatialNarrowAsOp); -REGISTER_CUDA_OPERATOR( - SpatialNarrowAsGradient, - SpatialNarrowAsGradientOp); -} // namespace caffe2 diff --git a/modules/detectron/spatial_narrow_as_op.h b/modules/detectron/spatial_narrow_as_op.h deleted file mode 100644 index a1fca861f1c26..0000000000000 --- a/modules/detectron/spatial_narrow_as_op.h +++ /dev/null @@ -1,63 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef SPATIAL_NARROW_AS_OP_H_ -#define SPATIAL_NARROW_AS_OP_H_ - -#include "caffe2/core/context.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/operator.h" -#include "caffe2/utils/math.h" - -namespace caffe2 { - -template -class SpatialNarrowAsOp final : public Operator { - public: - SpatialNarrowAsOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws) {} - USE_OPERATOR_CONTEXT_FUNCTIONS; - USE_DISPATCH_HELPER; - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - template - bool DoRunWithType(); -}; - -template -class SpatialNarrowAsGradientOp final : public Operator { - public: - SpatialNarrowAsGradientOp(const OperatorDef& def, Workspace* ws) - : Operator(def, ws) {} - USE_OPERATOR_CONTEXT_FUNCTIONS; - USE_DISPATCH_HELPER; - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - template - bool DoRunWithType(); -}; - -} // namespace caffe2 - -#endif // SPATIAL_NARROW_AS_OP_H_ diff --git a/modules/detectron/upsample_nearest_op.cc b/modules/detectron/upsample_nearest_op.cc deleted file mode 100644 index 631e17b231f91..0000000000000 --- a/modules/detectron/upsample_nearest_op.cc +++ /dev/null @@ -1,83 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
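SpatialNarrowAs is likewise removed without a replacement, but the same cropping is easy to express with tensor views. The sketch below is a minimal equivalent; it reads only B's last two dimensions, so it covers both the 4-D and the mixed 4-D/3-D signatures the old op accepted, and it returns a view rather than a copy (call clone() if the copy semantics of the deleted CopyKernel are needed).

    // Minimal stand-in for the removed SpatialNarrowAs: crop A to the spatial
    // extent of B by dropping rows and columns at the bottom and right.
    #include <torch/torch.h>

    torch::Tensor spatial_narrow_as(const torch::Tensor& A, const torch::Tensor& B) {
      const int64_t H1 = B.size(-2);
      const int64_t W1 = B.size(-1);
      TORCH_CHECK(A.size(-2) >= H1 && A.size(-1) >= W1,
                  "Input 0 must be >= input 1 in both spatial dimensions.");
      return A.narrow(-2, 0, H1).narrow(-1, 0, W1);  // rows [0, H1), cols [0, W1)
    }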
- */ - -#include "upsample_nearest_op.h" -#ifdef USE_MKLDNN -#include "caffe2/ideep/operators/operator_fallback_ideep.h" -#include "caffe2/ideep/utils/ideep_operator.h" -#endif - -namespace caffe2 { -#ifdef USE_MKLDNN -REGISTER_IDEEP_OPERATOR( - UpsampleNearest, - IDEEPFallbackOp>); -#endif - -REGISTER_CPU_OPERATOR(UpsampleNearest, UpsampleNearestOp); -REGISTER_CPU_OPERATOR( - UpsampleNearestGradient, - UpsampleNearestGradientOp); - -OPERATOR_SCHEMA(UpsampleNearest) - .NumInputs(1) - .NumOutputs(1) - .SetDoc(R"DOC( -Nearest neighbor upsampling operation. Implementation taken from THCUNN. -)DOC") - .Arg( - "scale", - "(int) default 2; integer upsampling factor.") - .Input( - 0, - "X", - "4D feature map input of shape (N, C, H, W).") - .Output( - 0, - "Y", - "4D feature map of shape (N, C, scale * H, scale * W); Values are " - "neareast neighbor samples from X."); - -OPERATOR_SCHEMA(UpsampleNearestGradient) - .NumInputs(2) - .NumOutputs(1) - .Input( - 0, - "X", - "See UpsampleNearest.") - .Input( - 1, - "dY", - "Gradient of forward output 0 (Y).") - .Output( - 0, - "dX", - "Gradient of forward input 0 (X)."); - -class GetUpsampleNearestGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - return SingleGradientDef( - "UpsampleNearestGradient", - "", - vector{I(0), GO(0)}, - vector{GI(0)}); - } -}; - -REGISTER_GRADIENT(UpsampleNearest, GetUpsampleNearestGradient); - -} // namespace caffe2 diff --git a/modules/detectron/upsample_nearest_op.cu b/modules/detectron/upsample_nearest_op.cu deleted file mode 100644 index 0ea32e348c0b3..0000000000000 --- a/modules/detectron/upsample_nearest_op.cu +++ /dev/null @@ -1,223 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Adapted from https://github.com/torch/cunn/blob/master/lib/THCUNN/SpatialUpSamplingNearest.cu - * - * Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) - * Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) - * Copyright (c) 2011-2013 NYU (Clement Farabet) - * Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, - * Leon Bottou, Iain Melvin, Jason Weston) - * Copyright (c) 2006 Idiap Research Institute (Samy Bengio) - * Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, - * Samy Bengio, Johnny Mariethoz) - * - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. 
Neither the names of NEC Laboratories American and IDIAP Research - * Institute nor the names of its contributors may be used to endorse or - * promote products derived from this software without specific prior - * written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - - -#include "caffe2/core/context_gpu.h" -#include "modules/detectron/upsample_nearest_op.h" - -namespace caffe2 { - -namespace { -__device__ int translate_idx(int ii, int d1, int d2, int d3, int scale_factor) { - int x, y, z, w; - w = ii % d3; - ii = ii/d3; - z = ii % d2; - ii = ii/d2; - y = ii % d1; - ii = ii/d1; - x = ii; - w = w/scale_factor; - z = z/scale_factor; - d2 /= scale_factor; - d3 /= scale_factor; - return (((x*d1+y)*d2)+z)*d3+w; -} - -__device__ int translate_idx_inv( - int ii, int d1, int d2, int d3, int scale_factor, int off_x, int off_y) { - int x, y, z, w; - w = ii % d3; - ii = ii/d3; - z = ii % d2; - ii = ii/d2; - y = ii % d1; - ii = ii/d1; - x = ii; - w = w*scale_factor+off_x; - z = z*scale_factor+off_y; - d2 *= scale_factor; - d3 *= scale_factor; - return (((x*d1+y)*d2)+z)*d3+w; -} - -__global__ void upscale(const float *input, float *output, long no_elements, - int scale_factor, int d1, int d2, int d3) { - long ii = threadIdx.x + blockDim.x * blockIdx.x; - ii += threadIdx.y + blockDim.y * (blockDim.x * gridDim.x) * blockIdx.y; - if (ii >= no_elements) return; - int ipidx = translate_idx(ii, d1, d2, d3, scale_factor); - output[ii]=input[ipidx]; -} - -__global__ void downscale(float *gradInput_data, const float *gradOutput_data, - long no_elements, int scale_factor, int d1, int d2, - int d3) { - long ii = threadIdx.x + blockDim.x * blockIdx.x; - ii += threadIdx.y + blockDim.y * (blockDim.x * gridDim.x) * blockIdx.y; - if (ii >= no_elements) return; - for (int i=0; i < scale_factor; i++){ - for(int j=0; j < scale_factor; j++){ - int ipidx = translate_idx_inv(ii, d1, d2, d3, scale_factor, i, j); - gradInput_data[ii] += gradOutput_data[ipidx]; - } - } -} -} // namespace - -template<> -bool UpsampleNearestOp::RunOnDevice() { - auto& X = Input(0); - auto* Y = Output(0); - - vector out_shape; - for (int i = 0; i < X.ndim(); ++i) { - out_shape.push_back(X.dim32(i)); - } - out_shape[X.ndim() - 1] *= scale_; - out_shape[X.ndim() - 2] *= scale_; - Y->Resize(out_shape); - - int d1; - int d2; - int d3; - if (X.ndim() == 3) { - d1 = Y->dim32(0); - d2 = Y->dim32(1); - d3 = Y->dim32(2); - } else { - d1 = Y->dim32(1); - d2 = Y->dim32(2); - d3 = Y->dim32(3); - } - long no_elements = Y->size(); - - const float *input_data = X.data(); - float *output_data = Y->mutable_data(); - - // cuda blocks & threads: - long nthreads = 256; - // Max number of blocks: http://en.wikipedia.org/wiki/CUDA - // 65535 for SM 2.x, 2^32 -1 for >= 3.0 - // TODO: When we move to SM 3.5 
we should update this - long n_xblocks = min(max((int)ceil((float)no_elements / nthreads), 1), 65535); - long n_yblocks = (long)ceil( - (float)no_elements / (float)(n_xblocks * nthreads)); - CAFFE_ENFORCE(n_yblocks <= 65535); - dim3 blocks(n_xblocks, n_yblocks); - dim3 threads(nthreads); - - upscale<<>>( - input_data, output_data, no_elements, scale_, d1, d2, d3); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - return true; -} - - -template<> -bool UpsampleNearestGradientOp::RunOnDevice() { - auto& X = Input(0); // Original input to "forward" op - auto& dY = Input(1); // Gradient of net w.r.t. output of "forward" op - // (aka "gradOutput") - auto* dX = Output(0); // Gradient of net w.r.t. input to "forward" op - // (aka "gradInput") - - dX->ResizeLike(X); - float *gradInput_data = dX->mutable_data(); - const float *gradOutput_data = dY.data(); - - int d1; - int d2; - int d3; - if (dX->ndim() == 3) { - d1 = dX->dim32(0); - d2 = dX->dim32(1); - d3 = dX->dim32(2); - } else { - d1 = dX->dim32(1); - d2 = dX->dim32(2); - d3 = dX->dim32(3); - } - long no_elements = dX->size(); - - // cuda blocks & threads: - long nthreads = 256; - // Max number of blocks: http://en.wikipedia.org/wiki/CUDA - // 65535 for SM 2.x, 2^32 -1 for >= 3.0 - // TODO: When we move to SM 3.5 we should update this - long n_xblocks = min(max((int)ceil((float)no_elements / nthreads), 1), 65535); - long n_yblocks = (long)ceil( - (float)no_elements / (float)(n_xblocks * nthreads)); - CAFFE_ENFORCE(n_yblocks <= 65535); - dim3 blocks(n_xblocks, n_yblocks); - dim3 threads(nthreads); - - math::Set(no_elements, 0.f, gradInput_data, &context_); - downscale<<>>( - gradInput_data, gradOutput_data, no_elements, scale_, d1, d2, d3); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - return true; -} - -REGISTER_CUDA_OPERATOR(UpsampleNearest, - UpsampleNearestOp); -REGISTER_CUDA_OPERATOR(UpsampleNearestGradient, - UpsampleNearestGradientOp); -} // namespace caffe2 diff --git a/modules/detectron/upsample_nearest_op.h b/modules/detectron/upsample_nearest_op.h deleted file mode 100644 index f850f0381a1e8..0000000000000 --- a/modules/detectron/upsample_nearest_op.h +++ /dev/null @@ -1,106 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
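The removed UpsampleNearest kernels map output pixel (i, j) to input pixel (i / scale, j / scale) for an integer scale factor. repeat_interleave along the last two dimensions reproduces that exactly, and torch::nn::functional::interpolate with torch::kNearest is the more general replacement. A minimal sketch, assuming the NCHW (or N, H, W) layouts the old operator accepted:

    // Minimal stand-in for the removed UpsampleNearest (integer scale factor).
    #include <torch/torch.h>

    torch::Tensor upsample_nearest(const torch::Tensor& X, int64_t scale = 2) {
      TORCH_CHECK(scale >= 1, "scale must be >= 1");
      // Repeat every row and column `scale` times; output[i] comes from input[i / scale].
      return X.repeat_interleave(scale, /*dim=*/-2).repeat_interleave(scale, /*dim=*/-1);
    }

Autograd of repeat_interleave sums the incoming gradient over each scale-by-scale block, which is the same reduction the deleted downscale kernel computed by hand.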
- */ - -#ifndef UPSAMPLE_NEAREST_OP_H_ -#define UPSAMPLE_NEAREST_OP_H_ - -#include "caffe2/core/context.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/operator.h" -#include "caffe2/utils/math.h" - -namespace caffe2 { - -template -class UpsampleNearestOp final : public Operator { - public: - UpsampleNearestOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws), - scale_(this->template GetSingleArgument("scale", 2)) { - TORCH_DCHECK_GE(scale_, 1); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - auto& X = Input(0); - - auto out_shape = X.sizes().vec(); - out_shape[X.dim() - 1] *= scale_; - out_shape[X.dim() - 2] *= scale_; - auto* Y = Output(0, out_shape, at::dtype()); - - int d1; - int d2; - int d3; - if (X.dim() == 3) { - d1 = Y->dim32(0); - d2 = Y->dim32(1); - d3 = Y->dim32(2); - } else { - d1 = Y->dim32(0) * Y->dim32(1); - d2 = Y->dim32(2); - d3 = Y->dim32(3); - } - - const T *input_data = X.template data(); - T *output_data = Y->template mutable_data(); - int scaled_d2 = d2 / scale_; - int scaled_d3 = d3 / scale_; - -#ifdef _OPENMP -#pragma omp parallel for -#endif - for (int i = 0; i < d1; ++i) { - for (int j = 0; j < d2; ++j) { - for (int u = 0; u < d3; ++u) { - int ii = (i * d2 + j) * d3 + u; - int scaled_u = u / scale_; - int scaled_j = j / scale_; - int ipidx = ((i * scaled_d2) + scaled_j) * scaled_d3 + scaled_u; - output_data[ii] = input_data[ipidx]; - } - } - } - - return true; - } - - protected: - int scale_; -}; - -template -class UpsampleNearestGradientOp final : public Operator { - public: - UpsampleNearestGradientOp(const OperatorDef& def, Workspace* ws) - : Operator(def, ws), - scale_(this->template GetSingleArgument("scale", 2)) { - TORCH_DCHECK_GE(scale_, 1); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - int scale_; -}; - -} // namespace caffe2 - -#endif // UPSAMPLE_NEAREST_OP_H_ diff --git a/modules/detectron/upsample_nearest_op_test.py b/modules/detectron/upsample_nearest_op_test.py deleted file mode 100644 index 276d50474d1fe..0000000000000 --- a/modules/detectron/upsample_nearest_op_test.py +++ /dev/null @@ -1,42 +0,0 @@ -import unittest - -import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st -import numpy as np -from caffe2.python import core, dyndep -from hypothesis import given, settings - - -dyndep.InitOpsLibrary("@/caffe2/modules/detectron:detectron_ops") - - -class TestUpsampleNearestOp(hu.HypothesisTestCase): - @given( - N=st.integers(1, 3), - H=st.integers(10, 300), - W=st.integers(10, 300), - scale=st.integers(1, 3), - **hu.gcs, - ) - @settings(deadline=None, max_examples=20) - def test_upsample_nearest_op(self, N, H, W, scale, gc, dc): - C = 32 - X = np.random.randn(N, C, H, W).astype(np.float32) - op = core.CreateOperator("UpsampleNearest", ["X"], ["Y"], scale=scale) - - def ref(X): - outH = H * scale - outW = W * scale - outH_idxs, outW_idxs = np.meshgrid( - np.arange(outH), np.arange(outW), indexing="ij" - ) - inH_idxs = (outH_idxs / scale).astype(np.int32) - inW_idxs = (outW_idxs / scale).astype(np.int32) - Y = X[:, :, inH_idxs, inW_idxs] - return [Y] - - self.assertReferenceChecks(device_option=gc, op=op, inputs=[X], reference=ref) - - -if __name__ == "__main__": - unittest.main() diff --git a/modules/module_test/CMakeLists.txt b/modules/module_test/CMakeLists.txt deleted file mode 100644 index f72120d535f30..0000000000000 --- 
a/modules/module_test/CMakeLists.txt +++ /dev/null @@ -1,23 +0,0 @@ -if(NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - # If we are building the standalone module, we set the proper cmake variables. - cmake_minimum_required(VERSION 3.0 FATAL_ERROR) - find_package(Caffe2 REQUIRED) - set(BUILD_TEST ON) - option(BUILD_SHARED_LIBS "Build shared libs." ON) -endif() - -if(BUILD_TEST AND NOT BUILD_LITE_INTERPRETER) - add_library( - caffe2_module_test_dynamic - ${CMAKE_CURRENT_SOURCE_DIR}/module_test_dynamic.cc) - - if(HAVE_SOVERSION) - set_target_properties(caffe2_module_test_dynamic PROPERTIES - VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION}) - endif() - target_link_libraries(caffe2_module_test_dynamic torch_library) - install(TARGETS caffe2_module_test_dynamic DESTINATION lib) - if(MSVC AND BUILD_SHARED_LIBS) - install(FILES $ DESTINATION lib OPTIONAL) - endif() -endif() diff --git a/modules/module_test/module_test_dynamic.cc b/modules/module_test/module_test_dynamic.cc deleted file mode 100644 index 32596167a3761..0000000000000 --- a/modules/module_test/module_test_dynamic.cc +++ /dev/null @@ -1,41 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "caffe2/core/module.h" -#include "caffe2/core/operator.h" - -// An explicitly defined module, testing correctness when we dynamically link a -// module -CAFFE2_MODULE(caffe2_module_test_dynamic, "Dynamic module only used for testing."); - -namespace caffe2 { - -class Caffe2ModuleTestDynamicDummyOp : public OperatorBase { - public: - using OperatorBase::OperatorBase; - bool Run(int /* unused */ /*stream_id*/) override { - return true; - } - virtual string type() { - return "base"; - } -}; - -REGISTER_CPU_OPERATOR( - Caffe2ModuleTestDynamicDummy, Caffe2ModuleTestDynamicDummyOp); -OPERATOR_SCHEMA(Caffe2ModuleTestDynamicDummy); - -} // namespace caffe2 diff --git a/modules/observers/CMakeLists.txt b/modules/observers/CMakeLists.txt deleted file mode 100644 index 050b8a1461e32..0000000000000 --- a/modules/observers/CMakeLists.txt +++ /dev/null @@ -1,32 +0,0 @@ -if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - if(NOT USE_OBSERVERS) - return() - endif() -else() - cmake_minimum_required(VERSION 3.0 FATAL_ERROR) - project(caffe2_observers CXX) - find_package(Caffe2 REQUIRED) - option(BUILD_SHARED_LIBS "Build shared libs." ON) -endif() - -add_library(caffe2_observers - "${CMAKE_CURRENT_SOURCE_DIR}/net_observer_reporter_print.cc" - "${CMAKE_CURRENT_SOURCE_DIR}/observer_config.cc" - "${CMAKE_CURRENT_SOURCE_DIR}/perf_observer.cc" - ) -if(HAVE_SOVERSION) - set_target_properties(caffe2_observers PROPERTIES - VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION}) -endif() -target_link_libraries(caffe2_observers PUBLIC torch_library) -target_include_directories(caffe2_observers PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/..) 
-target_compile_options(caffe2_observers PRIVATE "-DCAFFE2_BUILD_OBSERVER_LIB") -install(TARGETS caffe2_observers DESTINATION lib) -caffe2_interface_library(caffe2_observers caffe2_observers_library) -if(MSVC AND BUILD_SHARED_LIBS) - install(FILES $ DESTINATION lib OPTIONAL) -endif() - -if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - set(Caffe2_MODULES ${Caffe2_MODULES} caffe2_observers_library PARENT_SCOPE) -endif() diff --git a/modules/observers/macros.h b/modules/observers/macros.h deleted file mode 100644 index e69b055d2a1d5..0000000000000 --- a/modules/observers/macros.h +++ /dev/null @@ -1,7 +0,0 @@ -#include "c10/macros/Macros.h" - -#ifdef CAFFE2_BUILD_OBSERVER_LIB -#define CAFFE2_OBSERVER_API C10_EXPORT -#else -#define CAFFE2_OBSERVER_API C10_IMPORT -#endif diff --git a/modules/observers/net_observer_reporter.h b/modules/observers/net_observer_reporter.h deleted file mode 100644 index bfccef64cee2b..0000000000000 --- a/modules/observers/net_observer_reporter.h +++ /dev/null @@ -1,38 +0,0 @@ -#pragma once - -#include - -#include "caffe2/core/common.h" -#include "caffe2/core/net.h" -#include "observers/macros.h" - -namespace caffe2 { - -struct PerformanceInformation { - // Analytic - int64_t flops = 0; - int64_t bytes_written = 0; - int64_t bytes_read = 0; - std::vector tensor_shapes = {}; - std::vector args = {}; - std::string engine = ""; // the engine used - std::string type = ""; // the type of the operator - // Measured - double latency = 0; - double cpuMilliseconds = 0; -}; - -class CAFFE2_OBSERVER_API NetObserverReporter { - public: - virtual ~NetObserverReporter() = default; - - /* - Report the delay metric collected by the observer. - The delays are saved in a map. The key is an identifier associated - with the reported delay. The value is the delay value in float - */ - virtual void report( - NetBase* net, - std::map&) = 0; -}; -} diff --git a/modules/observers/net_observer_reporter_print.cc b/modules/observers/net_observer_reporter_print.cc deleted file mode 100644 index dca9cbba44bf1..0000000000000 --- a/modules/observers/net_observer_reporter_print.cc +++ /dev/null @@ -1,158 +0,0 @@ -#include "observers/net_observer_reporter_print.h" - -#include -#include -#include "caffe2/core/init.h" -#include "observers/observer_config.h" - -#include - -namespace caffe2 { - -const std::string NetObserverReporterPrint::IDENTIFIER = "Caffe2Observer "; -static std::string get_op_args(PerformanceInformation p); -static std::string get_tensor_shapes(PerformanceInformation p); -static std::string sanatize(std::string json_s); - -void NetObserverReporterPrint::report( - NetBase* net, - std::map& info) { - // Not allowed to use json library - std::vector> caffe2_perf; - - for (auto& p : info) { - if ((p.first == "NET_DELAY") && (info.size() == 1)) { - // for Net_delay perf - caffe2_perf.push_back({{"type", "NET"}, - {"value", c10::to_string(p.second.latency * 1000)}, - {"unit", "us"}, - {"metric", "latency"}}); - caffe2_perf.push_back({{"type", "NET_"}, - { - "value", - c10::to_string( - p.second.cpuMilliseconds / - p.second.latency * - 100), - }, - {"unit", "percent"}, - {"metric", "cpu_percent"}}); - } else if (p.first != "NET_DELAY") { - // for operator perf - std::string shape_str = get_tensor_shapes(p.second); - std::string args_str = get_op_args(p.second); - std::string type = p.first; - caffe2_perf.push_back({{"type", type}, - {"value", c10::to_string(p.second.latency * 1000)}, - {"unit", "us"}, - {"metric", "latency"}}); - caffe2_perf.push_back({{"type", type}, - { - "value", - 
c10::to_string( - p.second.cpuMilliseconds / - p.second.latency * - 100), - }, - {"unit", "percent"}, - {"metric", "cpu_percent"}}); - if (p.second.flops > 0) { - caffe2_perf.push_back({{"type", type}, - {"value", c10::to_string(p.second.flops)}, - {"unit", "flop"}, - {"metric", "flops"}}); - } - if (shape_str != "") { - caffe2_perf.push_back({{"type", type}, - {"info_string", shape_str}, - {"unit", ""}, - {"metric", "tensor_shapes"}}); - } - if (args_str != "") { - caffe2_perf.push_back({{"type", type}, - {"info_string", args_str}, - {"unit", ""}, - {"metric", "op_args"}}); - } - } - } - - // NOLINTNEXTLINE(modernize-loop-convert) - for (auto it = caffe2_perf.begin(); it != caffe2_perf.end(); it++) { - std::stringstream buffer; - auto entry = *it; - buffer << IDENTIFIER << "{"; - // NOLINTNEXTLINE(modernize-raw-string-literal) - buffer << "\"type\": \"" << sanatize(entry["type"]) << "\"," - // NOLINTNEXTLINE(modernize-raw-string-literal) - << "\"unit\": \"" << sanatize(entry["unit"]) << "\"," - // NOLINTNEXTLINE(modernize-raw-string-literal) - << "\"metric\": \"" << sanatize(entry["metric"]) << "\","; - if (entry.find("value") != entry.end()) { - // NOLINTNEXTLINE(modernize-raw-string-literal) - buffer << "\"value\": \"" << sanatize(entry["value"]) << "\""; - } else if (entry.find("info_string") != entry.end()) { - // NOLINTNEXTLINE(modernize-raw-string-literal) - buffer << "\"info_string\": \"" << sanatize(entry["info_string"]) << "\""; - } - buffer << "}"; - LOG(INFO) << buffer.str(); - } -} - -static std::string get_tensor_shapes(PerformanceInformation p) { - std::string shape_str; - std::stringstream shape_stream; - if (!p.tensor_shapes.empty()) { - shape_stream << "["; - for (const auto i : c10::irange(p.tensor_shapes.size())) { - shape_stream << "["; - for (int j = 0; j < p.tensor_shapes[i].dims_size(); j++) { - shape_stream << p.tensor_shapes[i].dims(j) << ", "; - } - shape_stream << "], "; - } - shape_stream << "]"; - shape_str = shape_stream.str(); - } else { - shape_str = ""; - } - return shape_str; -} - -static std::string get_op_args(PerformanceInformation p) { - std::string args_str; - if (!p.args.empty()) { - std::stringstream args; - args << "["; - for (const auto i : c10::irange(p.args.size())) { - args << "{" << p.args[i].name() << ": "; - if (p.args[i].has_i()) { - args << p.args[i].i(); - } else if (p.args[i].has_s()) { - args << p.args[i].s(); - } else if (p.args[i].has_n()) { - args << &p.args[i].n(); - } else if (p.args[i].has_f()) { - args << p.args[i].f(); - } else { - args << "None"; - } - args << "}, "; - } - args << "]"; - args_str = args.str(); - } else { - args_str = ""; - } - return args_str; -} - -static std::string sanatize(std::string json_s) { - // Remove illegal characters from the name that would cause json string to - // become invalid - json_s.erase(std::remove(json_s.begin(), json_s.end(), '"'), json_s.end()); - json_s.erase(std::remove(json_s.begin(), json_s.end(), '\\'), json_s.end()); - return json_s; -} -} diff --git a/modules/observers/net_observer_reporter_print.h b/modules/observers/net_observer_reporter_print.h deleted file mode 100644 index 5d4640c24c994..0000000000000 --- a/modules/observers/net_observer_reporter_print.h +++ /dev/null @@ -1,16 +0,0 @@ -#pragma once - -#include "observers/macros.h" -#include "observers/net_observer_reporter.h" - -#include "caffe2/core/common.h" - -namespace caffe2 { - -class CAFFE2_OBSERVER_API NetObserverReporterPrint : public NetObserverReporter { - public: - static const std::string IDENTIFIER; - void 
report(NetBase* net, std::map&) override; -}; - -} // namespace caffe2 diff --git a/modules/observers/observer_config.cc b/modules/observers/observer_config.cc deleted file mode 100644 index c6ba6a2d370c0..0000000000000 --- a/modules/observers/observer_config.cc +++ /dev/null @@ -1,12 +0,0 @@ -#include "observers/observer_config.h" - -namespace caffe2 { - -int ObserverConfig::netInitSampleRate_ = 0; -int ObserverConfig::netFollowupSampleRate_ = 0; -int ObserverConfig::netFollowupSampleCount_ = 0; -int ObserverConfig::operatorNetSampleRatio_ = 0; -int ObserverConfig::skipIters_ = 0; -unique_ptr ObserverConfig::reporter_ = nullptr; -int ObserverConfig::marker_ = -1; -} diff --git a/modules/observers/observer_config.h b/modules/observers/observer_config.h deleted file mode 100644 index cc967263a66b9..0000000000000 --- a/modules/observers/observer_config.h +++ /dev/null @@ -1,99 +0,0 @@ -#pragma once - -#include "observers/macros.h" -#include "observers/net_observer_reporter.h" - -#include "caffe2/core/common.h" - -namespace caffe2 { - -/* - netInitSampleRate_ == 1 && operatorNetSampleRatio_ == 1 : - Log operator metrics in every iteration - netInitSampleRate_ == 1 && operatorNetSampleRatio_ == 0 : - Log net metrics in every iterationn - netInitSampleRate_ == n && netFollowupSampleRate_ == m && - netFollowupSampleCount == c && operatorNetSampleRatio_ == 1 : - Log operator metrics first at odds of 1 / n. Once first logged, - the following c logs are at odds of 1 / min(n, m). Then repeat - netInitSampleRate_ == n && netFollowupSampleRate_ == m && - netFollowupSampleCount == c && operatorNetSampleRatio_ == 0 : - Log net metrics first at odds of 1 / n. Once first logged, - the following c logs are at odds of 1 / min(n, m). Then repeat - netInitSampleRate_ == n && netFollowupSampleRate_ == m && - netFollowupSampleCount == c && operatorNetSampleRatio_ == o : - Log net metrics first at odds of 1 / n. Once first logged, - the following c logs are at odds of 1 / min(n, m), if the random number - is multiples of o, log operator metrics instead. Then repeat - skipIters_ == n: skip the first n iterations of the net. 
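To make the sampling rules above concrete, a call such as the one below (values are illustrative only) asks for a first net-delay sample at odds of 1/1000, ten follow-up samples at odds of 1/100, operator-level metrics on roughly one in ten of the sampled runs, and no sampling during the first 50 iterations; the aiBench_* flags defined in perf_observer.cc further down feed exactly these parameters.

    caffe2::ObserverConfig::initSampleRate(
        /*netInitSampleRate=*/1000,
        /*netFollowupSampleRate=*/100,  // must be <= netInitSampleRate
        /*netFollowupSampleCount=*/10,
        /*operatorNetSampleRatio=*/10,
        /*skipIters=*/50);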
-*/ -class CAFFE2_OBSERVER_API ObserverConfig { - public: - static void initSampleRate( - int netInitSampleRate, - int netFollowupSampleRate, - int netFollowupSampleCount, - int operatorNetSampleRatio, - int skipIters) { - CAFFE_ENFORCE(netFollowupSampleRate <= netInitSampleRate); - CAFFE_ENFORCE(netFollowupSampleRate >= 1 || netInitSampleRate == 0); - netInitSampleRate_ = netInitSampleRate; - netFollowupSampleRate_ = netFollowupSampleRate; - netFollowupSampleCount_ = netFollowupSampleCount; - operatorNetSampleRatio_ = operatorNetSampleRatio; - skipIters_ = skipIters; - } - static int getNetInitSampleRate() { - return netInitSampleRate_; - } - static int getNetFollowupSampleRate() { - return netFollowupSampleRate_; - } - static int getNetFollowupSampleCount() { - return netFollowupSampleCount_; - } - static int getOpoeratorNetSampleRatio() { - return operatorNetSampleRatio_; - } - static int getSkipIters() { - return skipIters_; - } - static void setReporter(unique_ptr reporter) { - reporter_ = std::move(reporter); - } - static NetObserverReporter* getReporter() { - CAFFE_ENFORCE(reporter_); - return reporter_.get(); - } - static void setMarker(int marker) { - marker_ = marker; - } - static int getMarker() { - return marker_; - } - - private: - /* The odds of log net metric initially or immediately after reset */ - static int netInitSampleRate_; - - /* The odds of log net metric after log once after start of reset */ - static int netFollowupSampleRate_; - - /* The number of follow up logs to be collected for odds of - netFollowupSampleRate_ */ - static int netFollowupSampleCount_; - - /* The odds to log the operator metric instead of the net metric. - When the operator is logged the net is not logged. */ - static int operatorNetSampleRatio_; - - /* skip the first few iterations */ - static int skipIters_; - - static unique_ptr reporter_; - - /* marker used in identifying the metrics in certain reporters */ - static int marker_; -}; - -} diff --git a/modules/observers/perf_observer.cc b/modules/observers/perf_observer.cc deleted file mode 100644 index cfd6130f7255e..0000000000000 --- a/modules/observers/perf_observer.cc +++ /dev/null @@ -1,330 +0,0 @@ -#include "observers/perf_observer.h" -#include "observers/observer_config.h" -#ifndef C10_MOBILE -#include "caffe2/core/flags.h" -#include "observers/net_observer_reporter_print.h" -#endif - -#include -// NOLINTNEXTLINE(modernize-deprecated-headers) -#include -#include "caffe2/core/common.h" -#include "caffe2/core/init.h" -#include "caffe2/core/operator.h" - -#if defined(TARGET_OS_MAC) || \ -defined(TARGET_OS_IPHONE) || \ -defined(TARGET_IPHONE_SIMULATOR) -#define _APPLE 1 -#endif - -#ifdef _WIN32 -#ifndef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN -#endif -#include -#endif - -#ifdef _APPLE -#include -#include -#include -#endif - -#ifndef C10_MOBILE -C10_DEFINE_int64( - aiBench_netInitSampleRate, - 0, - "One in N sampling rate for net delay"); - -C10_DEFINE_int64( - aiBench_netFollowupSampleRate, - 0, - "One in N sampling rate for net delay"); - -C10_DEFINE_int64( - aiBench_netFollowupSampleCount, - 0, - "control the following c logs"); - -C10_DEFINE_int64( - aiBench_operatorNetSampleRatio, - 0, - "One in N sampling rate for operator delay"); - -C10_DEFINE_int64( - aiBench_skipIters, - 0, - "skip the first N iterations of the net run"); -#endif - -namespace caffe2 { -namespace { - -bool registerGlobalPerfNetObserverCreator(int* /*pargc*/, char*** /*pargv*/) { - AddGlobalNetObserverCreator([](NetBase* subject) { - return 
std::make_unique(subject); - }); - -#if !defined(C10_MOBILE) - // for aibench usage - caffe2::ObserverConfig::setReporter( - std::make_unique()); - - caffe2::ObserverConfig::initSampleRate( - FLAGS_aiBench_netInitSampleRate, - FLAGS_aiBench_netFollowupSampleRate, - FLAGS_aiBench_netFollowupSampleCount, - FLAGS_aiBench_operatorNetSampleRatio, - FLAGS_aiBench_skipIters); -#endif - - return true; -} -} // namespace - -#ifdef _WIN32 -double getTicksPerMillisecond() { - static LARGE_INTEGER ticks_per_sec; - if (!ticks_per_sec.QuadPart) { - QueryPerformanceFrequency(&ticks_per_sec); - if (!ticks_per_sec.QuadPart) { - return 0.0; - } - } - - return static_cast(ticks_per_sec.QuadPart) / 1000.0; -} -#elif !defined _APPLE -double getClockTimeMilliseconds(clockid_t clk_id) { - int result; - struct timespec tp; - result = clock_gettime(clk_id, &tp); - if (result == -1) { - return 0.0; - } else { - return tp.tv_sec * 1000.0 + tp.tv_nsec / 1000000.0; - } -} -#endif - -double getWallClockTimeMilliseconds() { -#ifdef _WIN32 - double ticks_per_ms = getTicksPerMillisecond(); - if (ticks_per_ms) { - LARGE_INTEGER ticks; - if (QueryPerformanceCounter(&ticks)) { - return static_cast(ticks.QuadPart) / ticks_per_ms; - } - } - - return 0.0; -#elif defined _APPLE - static mach_timebase_info_data_t info; - if (info.denom == 0) { - mach_timebase_info(&info); - } - - uint64_t now = mach_absolute_time(); - now = now * info.numer / info.denom; // convert to nanoseconds - return now / 1000000.0; -#else - return getClockTimeMilliseconds(CLOCK_MONOTONIC); -#endif -} - -double getCpuTimeMilliseconds() { -#ifdef _WIN32 - FILETIME creation_time; - FILETIME exit_time; - FILETIME kernel_time; - FILETIME user_time; - if (GetProcessTimes( - GetCurrentProcess(), - &creation_time, - &exit_time, - &kernel_time, - &user_time)) { - ULARGE_INTEGER kernel; - ULARGE_INTEGER user; - kernel.HighPart = kernel_time.dwHighDateTime; - kernel.LowPart = kernel_time.dwLowDateTime; - user.HighPart = user_time.dwHighDateTime; - user.LowPart = user_time.dwLowDateTime; - return (static_cast(kernel.QuadPart) + - static_cast(user.QuadPart)) / 10000.0; - } - - return 0.0; -#elif defined _APPLE - // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) - struct rusage ru; - if (getrusage(RUSAGE_SELF, &ru)) { - return 0.0; - } - - return ru.ru_utime.tv_sec * 1000.0 - + ru.ru_utime.tv_usec / 1000.0 - + ru.ru_stime.tv_sec * 1000.0 - + ru.ru_stime.tv_usec / 1000.0; -#else - return getClockTimeMilliseconds(CLOCK_PROCESS_CPUTIME_ID); -#endif -} - -REGISTER_CAFFE2_EARLY_INIT_FUNCTION( - registerGlobalPerfNetObserverCreator, - ®isterGlobalPerfNetObserverCreator, - "Caffe2 net global observer creator"); - -// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) -PerfNetObserver::PerfNetObserver(NetBase* subject_) - : NetObserver(subject_), numRuns_(0) {} - -// NOLINTNEXTLINE(modernize-use-equals-default) -PerfNetObserver::~PerfNetObserver() {} - -void PerfNetObserver::Start() { - static int visitCount = 0; - // Select whether to log the operator or the net. - // We have one sample rate for the entire app. - int netInitSampleRate = ObserverConfig::getNetInitSampleRate(); - int netFollowupSampleRate = ObserverConfig::getNetFollowupSampleRate(); - int netFollowupSampleCount = ObserverConfig::getNetFollowupSampleCount(); - int operatorNetSampleRatio = ObserverConfig::getOpoeratorNetSampleRatio(); - int skipIters = ObserverConfig::getSkipIters(); - int sampleRate = visitCount > 0 ? 
netFollowupSampleRate : netInitSampleRate; - // NOLINTNEXTLINE(clang-analyzer-security.insecureAPI.rand) - if (skipIters <= static_cast(numRuns_) && sampleRate > 0 && rand() % sampleRate == 0) { - visitCount++; - if (visitCount == netFollowupSampleCount) { - visitCount = 0; - } - // NOLINTNEXTLINE(clang-analyzer-security.insecureAPI.rand) - if (operatorNetSampleRatio > 0 && rand() % operatorNetSampleRatio == 0) { - logType_ = PerfNetObserver::OPERATOR_DELAY; - } else { - logType_ = PerfNetObserver::NET_DELAY; - } - } else { - logType_ = PerfNetObserver::NONE; - } - numRuns_++; - - if (logType_ == PerfNetObserver::OPERATOR_DELAY) { - /* Always recreate new operator observers - whenever we measure operator delay */ - const auto& operators = subject_->GetOperators(); - for (auto* op : operators) { - observerMap_[op] = op->AttachObserver( - std::make_unique(op, this)); - } - } - - wallMilliseconds_ = getWallClockTimeMilliseconds(); - cpuMilliseconds_ = getCpuTimeMilliseconds(); -} - -void PerfNetObserver::Stop() { - if (logType_ == PerfNetObserver::NONE) { - return; - } - std::map info; - PerformanceInformation net_perf; - net_perf.cpuMilliseconds = - getCpuTimeMilliseconds() - cpuMilliseconds_; - net_perf.latency = - getWallClockTimeMilliseconds() - wallMilliseconds_; - - if (logType_ == PerfNetObserver::OPERATOR_DELAY) { - const auto& operators = subject_->GetOperators(); - for (unsigned idx = 0; idx < operators.size(); ++idx) { - const auto* op = operators[idx]; - auto name = getObserverName(op, static_cast(idx)); - PerformanceInformation p; - const PerfOperatorObserver* opObserver = - static_cast(observerMap_[op]); - p.latency = opObserver->getWallMilliseconds(); - p.cpuMilliseconds = opObserver->getCpuMilliseconds(); - p.engine = op->engine(); - p.type = op->type(); - p.tensor_shapes = - static_cast(observerMap_[op]) - ->getTensorShapes(); - - if (op->has_debug_def()) { - // NOLINTNEXTLINE(performance-for-range-copy) - for (auto arg : op->debug_def().arg()) { - p.args.emplace_back(arg); - } - } - - info.insert({name, p}); - } - - /* clear all operator delay after use so that we don't spent time - collecting the operator delay info in later runs */ - for (auto* op : operators) { - op->DetachObserver(observerMap_[op]); - } - observerMap_.clear(); - } - info.insert({"NET_DELAY", net_perf}); - ObserverConfig::getReporter()->report(subject_, info); -} - -caffe2::string PerfNetObserver::getObserverName(const OperatorBase* op, int idx) - const { - string opType = op->has_debug_def() ? op->debug_def().type() : "NO_TYPE"; - string displayName = - (op->has_debug_def() ? op->debug_def().name().size() - ? op->debug_def().name() - : (op->debug_def().output_size() ? 
op->debug_def().output(0) - : "NO_OUTPUT") - : "NO_DEF"); - caffe2::string name = - "ID_" + c10::to_string(idx) + "_" + opType + "_" + displayName; - return name; -} - -PerfOperatorObserver::PerfOperatorObserver( - OperatorBase* op, - PerfNetObserver* netObserver) - : ObserverBase(op), - netObserver_(netObserver), - wallMilliseconds_(0), - cpuMilliseconds_(0) { - CAFFE_ENFORCE(netObserver_, "Observers can't operate outside of the net"); -} - -// NOLINTNEXTLINE(modernize-use-equals-default) -PerfOperatorObserver::~PerfOperatorObserver() {} - -void PerfOperatorObserver::Start() { - wallMilliseconds_ = getWallClockTimeMilliseconds(); - cpuMilliseconds_ = getCpuTimeMilliseconds(); -} - -void PerfOperatorObserver::Stop() { - /* Time from the start of the net minus the time spent on all other - operators is the time spent on this operator */ - cpuMilliseconds_ = - getCpuTimeMilliseconds() - cpuMilliseconds_; - wallMilliseconds_ = - getWallClockTimeMilliseconds() - wallMilliseconds_; - tensor_shapes_ = subject_->InputTensorShapes(); -} - -double PerfOperatorObserver::getWallMilliseconds() const { - return wallMilliseconds_; -} - -double PerfOperatorObserver::getCpuMilliseconds() const { - return cpuMilliseconds_; -} - -std::vector PerfOperatorObserver::getTensorShapes() const { - return tensor_shapes_; -} - -} // namespace caffe2 diff --git a/modules/observers/perf_observer.h b/modules/observers/perf_observer.h deleted file mode 100644 index 71e1190e840ba..0000000000000 --- a/modules/observers/perf_observer.h +++ /dev/null @@ -1,66 +0,0 @@ -#pragma once - -#include "caffe2/core/common.h" -#include "caffe2/core/net.h" -#include "caffe2/core/observer.h" -#include "caffe2/core/timer.h" -#include "observers/macros.h" - -#include - -namespace caffe2 { - -double getClockTimeMilliseconds(); - -class CAFFE2_OBSERVER_API PerfNetObserver : public NetObserver { - public: - explicit PerfNetObserver(NetBase* subject_); - virtual ~PerfNetObserver(); - - private: - void Start() override; - void Stop() override; - - caffe2::string getObserverName(const OperatorBase* op, int idx) const; - - private: - enum LogType { - NONE, - OPERATOR_DELAY, - NET_DELAY, - }; - LogType logType_; - unsigned int numRuns_; - std::unordered_map*> - observerMap_; - - double wallMilliseconds_; - double cpuMilliseconds_; -}; - -class PerfOperatorObserver : public ObserverBase { - public: - PerfOperatorObserver(OperatorBase* op, PerfNetObserver* netObserver); - virtual ~PerfOperatorObserver(); - - double getWallMilliseconds() const; - double getCpuMilliseconds() const; - std::vector getTensorShapes() const; - - private: - void Start() override; - void Stop() override; - - private: - // Observer of a net that owns corresponding op. We make sure net is never - // destructed while operator observer is still alive. First operator observer - // gets destructed, then the op, then the net and its observer. - // We do this trick in order to get access to net's name and other fields - // without storing inside the operator observer. 
Each field is memory - // costly here and a raw pointer is a cheapest sholution - PerfNetObserver* netObserver_; - double wallMilliseconds_; - double cpuMilliseconds_; - std::vector tensor_shapes_; -}; -} // namespace caffe2 diff --git a/setup.py b/setup.py index d774446780b48..84f3d48c958e8 100644 --- a/setup.py +++ b/setup.py @@ -88,12 +88,6 @@ # disables use of system-wide nccl (we will use our submoduled # copy in third_party/nccl) # -# BUILD_CAFFE2_OPS=0 -# disable Caffe2 operators build -# -# BUILD_CAFFE2=0 -# disable Caffe2 build -# # USE_IBVERBS # toggle features related to distributed support # @@ -1317,6 +1311,7 @@ def main(): "include/torch/csrc/onnx/*.h", "include/torch/csrc/profiler/*.h", "include/torch/csrc/profiler/orchestration/*.h", + "include/torch/csrc/profiler/standalone/*.h", "include/torch/csrc/profiler/stubs/*.h", "include/torch/csrc/profiler/unwind/*.h", "include/torch/csrc/profiler/python/*.h", diff --git a/test/cpp/api/autograd.cpp b/test/cpp/api/autograd.cpp index 3d1604752dbc4..4d6bb485be518 100644 --- a/test/cpp/api/autograd.cpp +++ b/test/cpp/api/autograd.cpp @@ -1265,7 +1265,7 @@ int64_t ret_single_non_tensor( torch::Tensor opt_op( const torch::Tensor& self, - const c10::optional& other) { + const std::optional& other) { if (other.has_value()) { return self + other.value(); } else { @@ -1461,11 +1461,11 @@ TEST(TestAutogradNotImplementedFallback, OptOp) { auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::opt_op", ""); auto op = [&](const torch::Tensor& _1, - const c10::optional& _2) { + const std::optional& _2) { return callOpUnboxed< torch::Tensor, const torch::Tensor&, - const c10::optional&>(opHandle, _1, _2); + const std::optional&>(opHandle, _1, _2); }; auto a = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true); diff --git a/test/cpp/api/memory.cpp b/test/cpp/api/memory.cpp index d9f44ea3f7a40..a3adc81406b7b 100644 --- a/test/cpp/api/memory.cpp +++ b/test/cpp/api/memory.cpp @@ -6,8 +6,8 @@ struct TestValue { explicit TestValue(const int& x) : lvalue_(x) {} explicit TestValue(int&& x) : rvalue_(x) {} - c10::optional lvalue_; - c10::optional rvalue_; + std::optional lvalue_; + std::optional rvalue_; }; TEST(MakeUniqueTest, ForwardRvaluesCorrectly) { diff --git a/test/cpp/c10d/ProcessGroupNCCLTest.cpp b/test/cpp/c10d/ProcessGroupNCCLTest.cpp index d1c2380274278..edf4f03c2d692 100644 --- a/test/cpp/c10d/ProcessGroupNCCLTest.cpp +++ b/test/cpp/c10d/ProcessGroupNCCLTest.cpp @@ -42,7 +42,7 @@ class NCCLTestBase { void initialize( int rank, int size, - c10::optional<::std::shared_ptr<::c10d::ProcessGroupNCCL>> split_from = + std::optional<::std::shared_ptr<::c10d::ProcessGroupNCCL>> split_from = c10::nullopt) { store_ = c10::make_intrusive<::c10d::FileStore>(path_, size); diff --git a/test/cpp/jit/test_argument_spec.cpp b/test/cpp/jit/test_argument_spec.cpp index 6ffe167c58768..71785d889952a 100644 --- a/test/cpp/jit/test_argument_spec.cpp +++ b/test/cpp/jit/test_argument_spec.cpp @@ -111,7 +111,7 @@ TEST(ArgumentSpecTest, CompleteArgumentSpec_CUDA) { // } // TEST(ArgumentSpecTest, VaryingShape) { -// c10::VaryingShape vs(c10::optional{}); +// c10::VaryingShape vs(std::optional{}); // auto ptt_empty1 = TensorType::create({}, {}, vs, vs, false); // auto ptt_empty2 = TensorType::create({}, {}, vs, vs, false); // ASSERT_EQ(hashCode(ptt_empty1), hashCode(ptt_empty2)); diff --git a/test/cpp/jit/test_custom_class_registrations.cpp b/test/cpp/jit/test_custom_class_registrations.cpp index 2595c64c9b170..819d5495b06c3 100644 --- 
a/test/cpp/jit/test_custom_class_registrations.cpp +++ b/test/cpp/jit/test_custom_class_registrations.cpp @@ -27,7 +27,7 @@ struct DefaultArgs : torch::CustomClassHolder { x = scale * x + add; return x; } - int64_t divide(c10::optional factor) { + int64_t divide(std::optional factor) { if (factor) { // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) x = x / *factor; @@ -334,7 +334,7 @@ struct ElementwiseInterpreter : torch::CustomClassHolder { // collection types like vector, optional, and dict. using SerializationType = std::tuple< std::vector /*input_names_*/, - c10::optional /*output_name_*/, + std::optional /*output_name_*/, c10::Dict /*constants_*/, std::vector /*instructions_*/ >; @@ -360,7 +360,7 @@ struct ElementwiseInterpreter : torch::CustomClassHolder { // Class members std::vector input_names_; - c10::optional output_name_; + std::optional output_name_; c10::Dict constants_; std::vector instructions_; }; diff --git a/test/cpp/jit/test_exception.cpp b/test/cpp/jit/test_exception.cpp index be23548e16d15..a4932e76b3e24 100644 --- a/test/cpp/jit/test_exception.cpp +++ b/test/cpp/jit/test_exception.cpp @@ -31,7 +31,7 @@ TEST(TestException, TestAssertion) { bool is_jit_exception = false; std::string message; - c10::optional exception_class; + std::optional exception_class; try { cu_ptr->run_method("foo"); } catch (JITException& e) { @@ -140,7 +140,7 @@ TEST(TestException, TestCustomException) { (torch::jit::GraphFunction*)&cu->get_function("foo"); std::cerr << "Graph is\n" << *gf->graph() << std::endl; bool is_jit_exception = false; - c10::optional exception_class; + std::optional exception_class; std::string message; try { cu->run_method("foo"); diff --git a/test/cpp/jit/test_ir.cpp b/test/cpp/jit/test_ir.cpp index e9a0edabaaf0f..19910cbf31f00 100644 --- a/test/cpp/jit/test_ir.cpp +++ b/test/cpp/jit/test_ir.cpp @@ -194,17 +194,17 @@ TEST(IRTest, OperatorMap) { ASSERT_FALSE(op_map.contains(*op6)); op_map.insert(op1, 1); ASSERT_TRUE(op_map.contains(*op1)); - c10::optional o1 = op_map.find(*op1); + std::optional o1 = op_map.find(*op1); ASSERT_TRUE(o1.has_value()); - c10::optional o2 = op_map.find(*op2); + std::optional o2 = op_map.find(*op2); ASSERT_TRUE(o2.has_value()); - c10::optional o3 = op_map.find(*op3); + std::optional o3 = op_map.find(*op3); ASSERT_FALSE(o3.has_value()); - c10::optional o4 = op_map.find(*op4); + std::optional o4 = op_map.find(*op4); ASSERT_TRUE(o4.has_value()); - c10::optional o5 = op_map.find(*op5); + std::optional o5 = op_map.find(*op5); ASSERT_TRUE(o5.has_value()); - c10::optional o6 = op_map.find(*op6); + std::optional o6 = op_map.find(*op6); ASSERT_FALSE(o6.has_value()); } diff --git a/test/cpp/jit/test_jit_type.cpp b/test/cpp/jit/test_jit_type.cpp index 606c1b0fa36e0..08f7f360731b7 100644 --- a/test/cpp/jit/test_jit_type.cpp +++ b/test/cpp/jit/test_jit_type.cpp @@ -12,7 +12,7 @@ TEST(JitTypeTest, IsComplete) { auto tt = c10::TensorType::create( at::kFloat, at::kCPU, - c10::SymbolicShape(std::vector>({1, 49})), + c10::SymbolicShape(std::vector>({1, 49})), std::vector( {c10::Stride{2, true, 1}, c10::Stride{1, true, 1}, diff --git a/test/cpp/jit/test_misc.cpp b/test/cpp/jit/test_misc.cpp index efe377aad72ce..9c74eb45e535f 100644 --- a/test/cpp/jit/test_misc.cpp +++ b/test/cpp/jit/test_misc.cpp @@ -1302,7 +1302,7 @@ TEST(RecordFunctionTest, OperatorNameOverload) { at::addGlobalCallback(at::RecordFunctionCallback( [](const at::RecordFunction& fn) -> std::unique_ptr { - c10::optional op_name = + std::optional op_name = 
fn.operator_name(); if (op_name.has_value()) { operator_names.insert(c10::toString(*op_name)); diff --git a/test/cpp/jit/test_shape_analysis.cpp b/test/cpp/jit/test_shape_analysis.cpp index 4940d277ce043..0ff3908d639a5 100644 --- a/test/cpp/jit/test_shape_analysis.cpp +++ b/test/cpp/jit/test_shape_analysis.cpp @@ -296,7 +296,7 @@ TEST(ShapeAnalysisTest, MovingConstantOutOfFusionGroups) { namespace { -c10::optional sym_dim = c10::nullopt; +std::optional sym_dim = c10::nullopt; // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) void assertShapeEqual(c10::SymbolicShape& a, c10::SymbolicShape& e) { @@ -306,8 +306,8 @@ void assertShapeEqual(c10::SymbolicShape& a, c10::SymbolicShape& e) { } void assertShapeEqual( - c10::optional>& actual, - std::vector> expected) { + std::optional>& actual, + std::vector> expected) { ASSERT_TRUE(actual.has_value()); ASSERT_EQ(actual->size(), 1); @@ -332,12 +332,12 @@ TEST(ShapeAnalysisTest, SymbolicShapeAPI) { // Check vector initializer list syntax c10::SymbolicShape ss_concrete = - std::vector>{1, 56, 56}; - c10::SymbolicShape ss1 = std::vector>{sym_dim, 56, 56}; + std::vector>{1, 56, 56}; + c10::SymbolicShape ss1 = std::vector>{sym_dim, 56, 56}; c10::SymbolicShape ss2 = - std::vector>{64, sym_dim, sym_dim}; + std::vector>{64, sym_dim, sym_dim}; c10::SymbolicShape ss3 = - std::vector>{sym_dim, sym_dim, sym_dim, sym_dim}; + std::vector>{sym_dim, sym_dim, sym_dim, sym_dim}; auto res = calculateSymbolicShapesOnOp( schema, std::vector{const_size_1, const_size_1}); @@ -484,7 +484,7 @@ TEST(ShapeAnalysisTest, TestShapeMultipleReturns) { auto res = calculateSymbolicShapesOnOp(max_dim_op, {ss1, const_int, false_ival}); c10::SymbolicShape expected_res = - c10::SymbolicShape(std::vector>{sym_dim}); + c10::SymbolicShape(std::vector>{sym_dim}); assertShapeEqual(res->at(0), expected_res); // res0 and res1 should share the same symbolic symbol EXPECT_EQ(res->at(0), res->at(1)); diff --git a/test/cpp/lazy/test_lazy_ops.cpp b/test/cpp/lazy/test_lazy_ops.cpp index aa31ffc59bb51..745f40729f02d 100644 --- a/test/cpp/lazy/test_lazy_ops.cpp +++ b/test/cpp/lazy/test_lazy_ops.cpp @@ -475,7 +475,7 @@ TEST_F(LazyOpsTest, TestDiv) { } TEST_F(LazyOpsTest, TestDivWithRoundingMode) { - c10::optional rounding_modes[] = { + std::optional rounding_modes[] = { "trunc", "floor", c10::nullopt}; for (const auto& rounding_mode : rounding_modes) { for (torch::ScalarType scalar_type1 : @@ -535,7 +535,7 @@ TEST_F(LazyOpsTest, TestDivInPlace) { } TEST_F(LazyOpsTest, TestDivInPlaceWithRoundingMode) { - c10::optional rounding_modes[] = { + std::optional rounding_modes[] = { "trunc", "floor", c10::nullopt}; for (const auto& rounding_mode : rounding_modes) { for (torch::ScalarType scalar_type1 : {torch::kFloat}) { @@ -1553,7 +1553,7 @@ TEST_F(LazyOpsTest, TestStdWithCorrection) { torch::Tensor a = torch::rand( {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); // int rank = a.dim(); - c10::optional corrections[] = {1, 2, c10::nullopt}; + std::optional corrections[] = {1, 2, c10::nullopt}; for (const auto& correction : corrections) { for (auto keepdim : {true, false}) { for (const auto& dim : @@ -1573,7 +1573,7 @@ TEST_F(LazyOpsTest, TestStdMeanWithCorrection) { torch::Tensor a = torch::rand( {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); // int rank = a.dim(); - c10::optional corrections[] = {1, 2, c10::nullopt}; + std::optional corrections[] = {1, 2, c10::nullopt}; for (const auto& correction : corrections) { for (auto keepdim : {true, false}) { for 
(const auto& dim : @@ -1710,7 +1710,7 @@ TEST_F(LazyOpsTest, TestVarWithDim) { TEST_F(LazyOpsTest, TestVarWithCorrection) { torch::Tensor a = torch::rand( {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); - c10::optional corrections[] = {1, 2, c10::nullopt}; + std::optional corrections[] = {1, 2, c10::nullopt}; for (const auto& dim : std::vector>{{0, 1}, {-3, -2}}) { for (bool keepDim : {true, false}) { for (const auto& correction : corrections) { @@ -1730,7 +1730,7 @@ TEST_F(LazyOpsTest, TestVarWithCorrection) { TEST_F(LazyOpsTest, TestVarMeanWithCorrection) { torch::Tensor a = torch::rand( {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); - c10::optional corrections[] = {1, 2, c10::nullopt}; + std::optional corrections[] = {1, 2, c10::nullopt}; for (const auto& dim : std::vector>{{0, 1}, {-3, -2}}) { for (const auto& correction : corrections) { for (auto keepdim : {true, false}) { diff --git a/test/cpp/lazy/test_misc.cpp b/test/cpp/lazy/test_misc.cpp index aa4cd1b7e798e..441e5c41eee13 100644 --- a/test/cpp/lazy/test_misc.cpp +++ b/test/cpp/lazy/test_misc.cpp @@ -63,10 +63,10 @@ TEST(HashTest, Sanity) { test_hash_repeatable_sensitive(c10::Scalar(true), c10::Scalar(false)); test_hash_repeatable_sensitive(c10::Scalar(12345), c10::Scalar(12354)); - // c10::optional + // std::optional test_hash_repeatable_sensitive( - c10::optional("I have value!"), - c10::optional(c10::nullopt)); + std::optional("I have value!"), + std::optional(c10::nullopt)); // Containers auto a = std::vector({0, 1, 1, 2, 3, 5, 8}); diff --git a/test/cpp/tensorexpr/test_external_calls.cpp b/test/cpp/tensorexpr/test_external_calls.cpp index 7a4291f0ba447..c26c800a16bf6 100644 --- a/test/cpp/tensorexpr/test_external_calls.cpp +++ b/test/cpp/tensorexpr/test_external_calls.cpp @@ -507,11 +507,11 @@ TEST(ExternalCall, Prepacked_Linear_float) { .findSchemaOrThrow("prepacked::linear_clamp_prepack", "") .typed( at::Tensor, - c10::optional, - const c10::optional&, - const c10::optional&)>(); + std::optional, + const std::optional&, + const std::optional&)>(); auto prepacked = linear_clamp_prepack_op.call( - weight, bias, c10::optional(), c10::optional()); + weight, bias, std::optional(), c10::optional()); BufHandle DummyPrepacked("DummyPrepacked", {1}, kFloat); Tensor Result = Tensor( @@ -581,13 +581,13 @@ TEST(ExternalCall, Prepacked_Conv2d_float) { .findSchemaOrThrow("prepacked::conv2d_clamp_prepack", "") .typed( at::Tensor, - c10::optional, + std::optional, std::vector, std::vector, std::vector, int64_t, - const c10::optional&, - const c10::optional&)>(); + const std::optional&, + const std::optional&)>(); auto prepacked = conv2d_clamp_prepack_op.call( weight, bias, @@ -595,8 +595,8 @@ TEST(ExternalCall, Prepacked_Conv2d_float) { {pad, pad}, {dilation, dilation}, groups, - c10::optional(), - c10::optional()); + std::optional(), + std::optional()); BufHandle DummyPrepacked("DummyPrepacked", {1}, kFloat); Tensor Result = Tensor( @@ -945,7 +945,7 @@ TEST(ExternalCall, JitCustomFusionOp) { const std::vector& inputs, const std::vector& output_shape, const std::vector& output_strides, - const c10::optional& output_type, + const std::optional& output_type, at::Device device) { auto output_dtype = Dtype(*output_type); torch::jit::tensorexpr::BufHandle result_buf( diff --git a/test/cpp/tensorexpr/test_kernel.cpp b/test/cpp/tensorexpr/test_kernel.cpp index 21b86e9b00707..22f6b64efe1a8 100644 --- a/test/cpp/tensorexpr/test_kernel.cpp +++ b/test/cpp/tensorexpr/test_kernel.cpp @@ -888,7 +888,7 
@@ TEST_F(Kernel, SumAllAxes) { parseIR(graph_string, &*graph); auto o = at::empty({}, TensorOptions(kCPU)); - c10::optional dtype; + std::optional dtype; if (scalar_type != ScalarType::Undefined) { dtype = static_cast(scalar_type); } @@ -947,7 +947,7 @@ TEST_F(Kernel, SumOneAxis) { env.d("dim", dim); env.d("keepdim", keepdim); env.s("dtype", dtypeConstant(scalar_type)); - c10::optional dtype; + std::optional dtype; if (scalar_type != ScalarType::Undefined) { dtype = static_cast(scalar_type); } @@ -1665,7 +1665,7 @@ Tensor lowerNanToNum( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { auto input_buf = std::get(inputs[0]); auto e = Compute( diff --git a/test/cpp/tensorexpr/test_quantization.cpp b/test/cpp/tensorexpr/test_quantization.cpp index a689358276f2c..af6b539ff33e9 100644 --- a/test/cpp/tensorexpr/test_quantization.cpp +++ b/test/cpp/tensorexpr/test_quantization.cpp @@ -390,8 +390,8 @@ at::Tensor quantized_cat( .typed const&, int64_t, - c10::optional, - c10::optional)>(); + std::optional, + std::optional)>(); return op.redispatch( DispatchKeySet({DispatchKey::QuantizedCPU}), xs, dim, scale, zero); } diff --git a/test/cpp_extensions/extension.cpp b/test/cpp_extensions/extension.cpp index f476a983b14c2..1de9e03971115 100644 --- a/test/cpp_extensions/extension.cpp +++ b/test/cpp_extensions/extension.cpp @@ -23,7 +23,7 @@ struct MatrixMultiplier { torch::Tensor tensor_; }; -bool function_taking_optional(c10::optional tensor) { +bool function_taking_optional(std::optional tensor) { return tensor.has_value(); } diff --git a/test/cpp_extensions/maia_extension.cpp b/test/cpp_extensions/maia_extension.cpp index 13315810f54c4..8dbc64f82076d 100644 --- a/test/cpp_extensions/maia_extension.cpp +++ b/test/cpp_extensions/maia_extension.cpp @@ -20,8 +20,8 @@ Tensor get_tensor(caffe2::TypeMeta dtype, IntArrayRef size) { return Tensor(std::move(tensor_impl)); } -Tensor empty_override(IntArrayRef size, c10::optional dtype, c10::optional layout, c10::optional device, - c10::optional pin_memory, c10::optional optional_memory_format) { +Tensor empty_override(IntArrayRef size, std::optional dtype, c10::optional layout, c10::optional device, + std::optional pin_memory, c10::optional optional_memory_format) { test_int = 0; return get_tensor(scalarTypeToTypeMeta(dtype_or_default(dtype)), size); } @@ -32,7 +32,7 @@ Tensor& add_out_override(const Tensor & a, const Tensor & b , const Scalar& c, T } Tensor fake_convolution( - const Tensor& input, const Tensor& weight, const c10::optional& bias, + const Tensor& input, const Tensor& weight, const std::optional& bias, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool transposed, IntArrayRef output_padding, int64_t groups) { test_int = 2; diff --git a/test/cpp_extensions/open_registration_extension.cpp b/test/cpp_extensions/open_registration_extension.cpp index f5b61102af7b2..df46d827339b4 100644 --- a/test/cpp_extensions/open_registration_extension.cpp +++ b/test/cpp_extensions/open_registration_extension.cpp @@ -277,11 +277,11 @@ REGISTER_ALLOCATOR(c10::DeviceType::PrivateUse1, &global_custom_alloc); // basic dummy empty function, so we can directly construct tensors on the custom device // This dummy test device will just use the CPU allocator, and ignores pinned memory. 
at::Tensor custom_empty_memory_format(at::IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional memory_format) { constexpr c10::DispatchKeySet private_use_ks(c10::DispatchKey::PrivateUse1); return at::detail::empty_generic(size, &global_custom_alloc, @@ -290,11 +290,11 @@ at::Tensor custom_empty_memory_format(at::IntArrayRef size, memory_format); } at::Tensor custom_empty_symint(c10::IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional memory_format) { constexpr c10::DispatchKeySet private_use_ks(c10::DispatchKey::PrivateUse1); return at::detail::empty_generic(size, &global_custom_alloc, private_use_ks, c10::dtype_or_default(dtype), memory_format); @@ -368,10 +368,10 @@ at::Tensor custom__copy_from_and_resize(const at::Tensor& self, const at::Tensor at::Tensor custom_empty_strided(c10::IntArrayRef size, c10::IntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt) { + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt) { constexpr c10::DispatchKeySet private_use_ks(c10::DispatchKey::PrivateUse1); auto dtype = c10::dtype_or_default(dtype_opt); return at::detail::empty_strided_generic(size, stride, &global_custom_alloc, private_use_ks, dtype); @@ -406,7 +406,7 @@ at::Tensor& custom_set_source_Storage_storage_offset(at::Tensor& result, // basic dummy functions related to pin_memory. 
std::vector custom_pinned_data_ptr; -at::Tensor custom__pin_memory(const at::Tensor& self, c10::optional device) { +at::Tensor custom__pin_memory(const at::Tensor& self, std::optional device) { TORCH_CHECK( self.device().is_cpu(), "cannot pin '", @@ -420,7 +420,7 @@ at::Tensor custom__pin_memory(const at::Tensor& self, c10::optional return dump_pinned_tensor; } -bool custom_is_pinned(const at::Tensor& self, c10::optional device) { +bool custom_is_pinned(const at::Tensor& self, std::optional device) { // Only CPU tensors can be pinned if (!self.is_cpu()) { return false; @@ -436,7 +436,7 @@ bool custom_is_pinned(const at::Tensor& self, c10::optional device) } const at::Tensor& custom_resize_(const at::Tensor& self, at::IntArrayRef size, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { at::TensorImpl* tensor_impl = self.unsafeGetTensorImpl(); tensor_impl->set_sizes_contiguous(size); const auto itemsize = tensor_impl->dtype().itemsize(); diff --git a/test/cpp_extensions/rng_extension.cpp b/test/cpp_extensions/rng_extension.cpp index 2e657d15a3979..3fc62ee69f752 100644 --- a/test/cpp_extensions/rng_extension.cpp +++ b/test/cpp_extensions/rng_extension.cpp @@ -33,15 +33,15 @@ struct TestCPUGenerator : public c10::GeneratorImpl { uint64_t value_; }; -Tensor& random_(Tensor& self, c10::optional generator) { +Tensor& random_(Tensor& self, std::optional generator) { return at::native::templates::random_impl(self, generator); } -Tensor& random_from_to(Tensor& self, int64_t from, optional to, c10::optional generator) { +Tensor& random_from_to(Tensor& self, int64_t from, optional to, std::optional generator) { return at::native::templates::random_from_to_impl(self, from, to, generator); } -Tensor& random_to(Tensor& self, int64_t to, c10::optional generator) { +Tensor& random_to(Tensor& self, int64_t to, std::optional generator) { return random_from_to(self, 0, to, generator); } diff --git a/test/custom_operator/op.cpp b/test/custom_operator/op.cpp index c9389713428bc..ab0506a822f61 100644 --- a/test/custom_operator/op.cpp +++ b/test/custom_operator/op.cpp @@ -29,7 +29,7 @@ struct CustomOpAutogradFunction : public torch::autograd::Function var3) { + std::optional var3) { ctx->saved_data["mul"] = mul; ctx->saved_data["var3_has_value"] = var3.has_value(); ctx->save_for_backward({var1, var2}); @@ -59,7 +59,7 @@ torch::Tensor custom_op_with_autograd( torch::Tensor var1, int64_t mul, torch::Tensor var2, - c10::optional var3) { + std::optional var3) { return CustomOpAutogradFunction::apply(var1, mul, var2, var3); } diff --git a/test/custom_operator/test_custom_ops.cpp b/test/custom_operator/test_custom_ops.cpp index b1e830f7b65c7..a526bebd26144 100644 --- a/test/custom_operator/test_custom_ops.cpp +++ b/test/custom_operator/test_custom_ops.cpp @@ -57,7 +57,7 @@ void get_autograd_operator_from_registry_and_execute() { torch::Tensor z = torch::randn({5,5}, torch::requires_grad()); torch::Tensor output = - helpers::get_operator_from_registry_and_execute("custom::op_with_autograd", x, 2, y, c10::optional()); + helpers::get_operator_from_registry_and_execute("custom::op_with_autograd", x, 2, y, std::optional()); TORCH_INTERNAL_ASSERT(output.allclose(x + 2*y + x*y)); auto go = torch::ones({}, torch::requires_grad()); @@ -88,7 +88,7 @@ void get_autograd_operator_from_registry_and_execute_in_nograd_mode() { torch::Tensor y = torch::randn({5,5}, torch::requires_grad()); torch::Tensor output = - helpers::get_operator_from_registry_and_execute("custom::op_with_autograd", x, 2, y, 
c10::optional()); + helpers::get_operator_from_registry_and_execute("custom::op_with_autograd", x, 2, y, std::optional()); TORCH_INTERNAL_ASSERT(output.allclose(x + 2*y + x*y)); } diff --git a/test/distributed/_composable/fsdp/test_fully_shard_training.py b/test/distributed/_composable/fsdp/test_fully_shard_training.py index e826ca7a000d9..eec060d3004cc 100644 --- a/test/distributed/_composable/fsdp/test_fully_shard_training.py +++ b/test/distributed/_composable/fsdp/test_fully_shard_training.py @@ -19,6 +19,7 @@ register_fsdp_forward_method, ) from torch.distributed._tensor import DTensor, init_device_mesh +from torch.distributed._tensor.debug.comm_mode import CommDebugMode from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( _CHECKPOINT_PREFIX, apply_activation_checkpointing, @@ -29,11 +30,6 @@ get_optimizer_state_dict, ) from torch.distributed.device_mesh import DeviceMesh -from torch.distributed.tensor.parallel import ( - ColwiseParallel, - parallelize_module, - RowwiseParallel, -) from torch.testing._internal.common_cuda import TEST_CUDA from torch.testing._internal.common_distributed import skip_if_lt_x_gpu from torch.testing._internal.common_fsdp import ( @@ -41,8 +37,8 @@ FSDPTest, FSDPTestMultiThread, MLP, + MLPStack, patch_all_gather, - patch_all_reduce, patch_reduce_scatter, test_compiled_fsdp, ) @@ -59,6 +55,8 @@ ) from torch.testing._internal.distributed.checkpoint_utils import with_temp_dir +c10d_ops = torch.ops.c10d + class TestFullyShardForwardInputs(FSDPTestMultiThread): @property @@ -696,8 +694,7 @@ def _test_gradient_accumulation( return # skip since not common torch.manual_seed(42) - local_batch_size, lin_dim, num_mlps, num_microbatches = (2, 32, 3, 3) - global_batch_size = local_batch_size * self.world_size + batch_size, lin_dim, num_mlps, num_microbatches = (2, 32, 3, 3) if mode == "some_mlps": num_mlps_to_disable_reduce_scatter = 2 modules = [nn.Linear(lin_dim, lin_dim)] @@ -716,32 +713,9 @@ def _test_gradient_accumulation( ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2) optim = torch.optim.Adam(model.parameters(), lr=1e-2) - # TODO: Migrate to `CommDebugMode` once it supports c10d collectives. 
- orig_all_gather = dist.all_gather_into_tensor - orig_reduce_scatter = dist.reduce_scatter_tensor - orig_all_reduce = dist.all_reduce - all_gather_count, reduce_scatter_count, all_reduce_count = 0, 0, 0 - - def all_gather_with_count(*args, **kwargs): - nonlocal all_gather_count - all_gather_count += 1 - return orig_all_gather(*args, **kwargs) - - def reduce_scatter_with_count(*args, **kwargs): - nonlocal reduce_scatter_count - reduce_scatter_count += 1 - return orig_reduce_scatter(*args, **kwargs) - - def all_reduce_with_count(*args, **kwargs): - nonlocal all_reduce_count - all_reduce_count += 1 - return orig_all_reduce(*args, **kwargs) - - torch.manual_seed(1) # same on all ranks + torch.manual_seed(42 + self.rank + 1) for iter_idx in range(5): - with patch_all_gather(all_gather_with_count), patch_reduce_scatter( - reduce_scatter_with_count - ), patch_all_reduce(all_reduce_with_count): + with CommDebugMode() as comm_mode: for microbatch_idx in range(num_microbatches): is_last_microbatch = microbatch_idx == num_microbatches - 1 if mode == "all": @@ -762,19 +736,18 @@ def all_reduce_with_count(*args, **kwargs): is_last_microbatch, recurse=False ) - global_inp = torch.rand((global_batch_size, lin_dim), device="cuda") - local_inp = global_inp[ - self.rank - * local_batch_size : (self.rank + 1) - * local_batch_size - ].detach() + inp = torch.randn(batch_size, lin_dim, device="cuda") losses: List[torch.Tensor] = [] - for _model, inp in ((ref_model, global_inp), (model, local_inp)): + for _model in (ref_model, model): losses.append(_model(inp).sum()) losses[-1].backward() - dist.all_reduce(losses[1]) # partial -> replicated self.assertEqual(losses[0], losses[1]) + comm_counts = comm_mode.get_comm_counts() + all_gather_count = comm_counts[c10d_ops._allgather_base_] + reduce_scatter_count = comm_counts[c10d_ops._reduce_scatter_base_] + all_reduce_count = comm_counts[c10d_ops.allreduce_] + # Expect one reduce-scatter per MLP plus one for the root's linear # on the last microbatch expected_reduce_scatter_count = num_mlps + 1 @@ -788,13 +761,10 @@ def all_reduce_with_count(*args, **kwargs): # Expect additional reduce-scatters for all MLPs expected_reduce_scatter_count += (num_mlps) * (num_microbatches - 1) self.assertEqual(reduce_scatter_count, expected_reduce_scatter_count) - # Exclude the loss all-reduce per microbatch in our training loop - all_reduce_count -= num_microbatches - if mesh.ndim == 2: - self.assertEqual(all_reduce_count, expected_reduce_scatter_count) - else: - self.assertEqual(all_reduce_count, 0) - reduce_scatter_count = all_reduce_count = 0 + expected_all_reduce_count = ( + expected_reduce_scatter_count if mesh.ndim == 2 else 0 + ) + self.assertEqual(all_reduce_count, expected_all_reduce_count) # Expect one all-gather per MLP plus one for the root's linear in # the first microbatch's forward @@ -817,13 +787,10 @@ def all_reduce_with_count(*args, **kwargs): # microbatch forward expected_all_gather_count += num_mlps * (num_microbatches - 1) self.assertEqual(all_gather_count, expected_all_gather_count) - all_gather_count = 0 - # Average the ref model's gradients over the world size to match - # data parallel semantics for param in ref_model.parameters(): if param.grad is not None: - param.grad.div_(self.world_size) + dist.all_reduce(param.grad, op=dist.ReduceOp.AVG) check_sharded_parity(self, ref_model, model) for _optim in (optim, ref_optim): _optim.step() @@ -933,38 +900,14 @@ def _test_train_parity_2d_mlp( dp_pg = dp_mesh.get_group() # used for `replicate()` torch.manual_seed(42) 
- model = nn.Sequential( - nn.LayerNorm(mlp_dim, bias=False), - # Use multiplier of 3 to exercise uneven case - MLP(mlp_dim, dim_multiplier=3), - MLP(mlp_dim), - MLP(mlp_dim, dim_multiplier=3), - ) + model = MLPStack(mlp_dim) ref_model = copy.deepcopy(model).cuda() replicate(ref_model, device_ids=[self.rank], process_group=dp_pg) - ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2) - - model = parallelize_module( - model, - device_mesh=tp_mesh, - # Leave the layer norm as implicitly replicated - parallelize_plan={ - # Pass `use_local_output=False` to keep as DTensor to preserve - # uneven activation dims - "1.in_proj": ColwiseParallel(use_local_output=False), - "1.out_proj": RowwiseParallel(use_local_output=False), - "2.in_proj": ColwiseParallel(use_local_output=False), - "2.out_proj": RowwiseParallel(use_local_output=False), - "3.in_proj": ColwiseParallel(use_local_output=False), - "3.out_proj": RowwiseParallel(), - }, + ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2, foreach=False) + model.parallelize( + tp_mesh, dp_mesh, use_activation_checkpointing, reshard_after_forward ) - for mlp in model: - if use_activation_checkpointing: - checkpoint(mlp) - fully_shard(mlp, mesh=dp_mesh, reshard_after_forward=reshard_after_forward) - fully_shard(model, mesh=dp_mesh, reshard_after_forward=reshard_after_forward) - optim = torch.optim.Adam(model.parameters(), lr=1e-2) + optim = torch.optim.Adam(model.parameters(), lr=1e-2, foreach=False) torch.manual_seed(42 + dp_pg.rank() + 1) device = torch.device("cuda") @@ -992,6 +935,8 @@ def test_train_parity_2d_transformer_checkpoint_resume(self): # else construct new ones (requiring eager optim state init) "reuse_model_optim": [False, True], "optimizer_class": [torch.optim.Adam, torch.optim.AdamW], + # TODO: need to update `parallelize` before including foreach=True for testing + "foreach": [False], }, self._test_train_parity_2d_transformer_checkpoint_resume, ) @@ -1001,6 +946,7 @@ def _test_train_parity_2d_transformer_checkpoint_resume( use_seq_parallel: bool, reuse_model_optim: bool, optimizer_class: Type[torch.optim.Optimizer], + foreach: bool, ): def train_step( _model: nn.Module, _optim: torch.optim.Optimizer, _inp: torch.Tensor @@ -1026,7 +972,9 @@ def parallelize(_model: Transformer, mesh: DeviceMesh, use_seq_parallel: bool): model_no_cp = parallelize( Transformer(model_args), global_mesh, use_seq_parallel ) - optim_no_cp = optimizer_class(model_no_cp.parameters(), lr=1e-2) + optim_no_cp = optimizer_class( + model_no_cp.parameters(), lr=1e-2, foreach=foreach + ) torch.manual_seed(42 + global_mesh["dp"].get_local_rank() + 1) inp = torch.randint(0, model_args.vocab_size, (3, 16), device="cuda") @@ -1037,7 +985,7 @@ def parallelize(_model: Transformer, mesh: DeviceMesh, use_seq_parallel: bool): # model/optimizer, load checkpoint, and run another iteration torch.manual_seed(seed) model_cp = parallelize(Transformer(model_args), global_mesh, use_seq_parallel) - optim_cp = optimizer_class(model_cp.parameters(), lr=1e-2) + optim_cp = optimizer_class(model_cp.parameters(), lr=1e-2, foreach=foreach) loss_cp1 = train_step(model_cp, optim_cp, inp) self.assertEqual(loss_no_cp1, loss_cp1) @@ -1066,7 +1014,7 @@ def parallelize(_model: Transformer, mesh: DeviceMesh, use_seq_parallel: bool): model_cp = parallelize( Transformer(model_args), global_mesh, use_seq_parallel ) - optim_cp = optimizer_class(model_cp.parameters(), lr=1e-2) + optim_cp = optimizer_class(model_cp.parameters(), lr=1e-2, foreach=foreach) self.assertNotEqual(loss_no_cp2, 
train_step(model_cp, optim_cp, inp)) sharded_sd = { @@ -1106,6 +1054,7 @@ def test_2d_mlp_with_nd_mesh(self): "reshard_after_forward": [False, True], "use_activation_checkpointing": [False, True], "mlp_dim": [3, 16, 17], + "foreach": [False], }, functools.partial(self._test_2d_mlp_with_nd_mesh, global_mesh), ) @@ -1116,6 +1065,7 @@ def _test_2d_mlp_with_nd_mesh( reshard_after_forward: bool, use_activation_checkpointing: bool, mlp_dim: int, + foreach: bool, ): global_mesh = self.init_global_mesh() pp_mesh, dp_mesh, tp_mesh = ( @@ -1126,38 +1076,14 @@ def _test_2d_mlp_with_nd_mesh( dp_pg = dp_mesh.get_group() # used for `replicate()` torch.manual_seed(42) - model = nn.Sequential( - nn.LayerNorm(mlp_dim, bias=False), - # Use multiplier of 3 to exercise uneven case - MLP(mlp_dim, dim_multiplier=3), - MLP(mlp_dim), - MLP(mlp_dim, dim_multiplier=3), - ) + model = MLPStack(mlp_dim) ref_model = copy.deepcopy(model).cuda() replicate(ref_model, device_ids=[self.rank], process_group=dp_pg) - ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2) - - model = parallelize_module( - model, - device_mesh=tp_mesh, - # Leave the layer norm as implicitly replicated - parallelize_plan={ - # Pass `use_local_output=False` to keep as DTensor to preserve - # uneven activation dims - "1.in_proj": ColwiseParallel(use_local_output=False), - "1.out_proj": RowwiseParallel(use_local_output=False), - "2.in_proj": ColwiseParallel(use_local_output=False), - "2.out_proj": RowwiseParallel(use_local_output=False), - "3.in_proj": ColwiseParallel(use_local_output=False), - "3.out_proj": RowwiseParallel(), - }, + ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2, foreach=foreach) + model.parallelize( + tp_mesh, dp_mesh, use_activation_checkpointing, reshard_after_forward ) - for mlp in model: - if use_activation_checkpointing: - checkpoint(mlp) - fully_shard(mlp, mesh=dp_mesh, reshard_after_forward=reshard_after_forward) - fully_shard(model, mesh=dp_mesh, reshard_after_forward=reshard_after_forward) - optim = torch.optim.Adam(model.parameters(), lr=1e-2) + optim = torch.optim.Adam(model.parameters(), lr=1e-2, foreach=foreach) torch.manual_seed(42 + dp_pg.rank() + 1) device = torch.device("cuda") diff --git a/test/distributed/_tensor/test_dtensor.py b/test/distributed/_tensor/test_dtensor.py index 2f7f522b9e7b0..e2a9d33241e7c 100644 --- a/test/distributed/_tensor/test_dtensor.py +++ b/test/distributed/_tensor/test_dtensor.py @@ -819,7 +819,7 @@ def test_split_tensor_1D(self) -> None: ) if size == 0: # when tensor size is 0, there is no padding needed for all the ranks. 
- expected_pad_sizes = [0] * self.world_size + expected_pad_sizes = [] assert_array_equal(expected_pad_sizes, pad_sizes) is_tensor_empty = [ diff --git a/test/distributed/_tensor/test_optimizers.py b/test/distributed/_tensor/test_optimizers.py index e7ce18eefa634..512b5c97ce6a2 100644 --- a/test/distributed/_tensor/test_optimizers.py +++ b/test/distributed/_tensor/test_optimizers.py @@ -84,23 +84,26 @@ def _assert_optimizer( # Default 'rtol' and 'atol' for attr:`~torch.float32` are ``1.3e-6`` and ``1e-5`` self.assertEqual(p1, p2, atol=atol, rtol=rtol) + def test_optimizer_foreach_supported_types_include_DTensor(self): + from torch.optim.optimizer import _foreach_supported_types + + self.assertTrue(DTensor in _foreach_supported_types) + @with_comms def test_adam_1d_sharding(self): mesh = DeviceMesh(self.device_type, list(range(self.world_size))) # TODO: add fused_adam support adam_configs = [ - {"lr": 0.1}, + {"lr": 0.1, "foreach": False}, + {"lr": 0.1, "weight_decay": 0.05, "foreach": False}, {"lr": 0.1, "weight_decay": 0.05}, - {"lr": 0.1, "foreach": True}, - {"lr": 0.1, "weight_decay": 0.05, "foreach": True}, - {"lr": 0.1, "weight_decay": 0.05, "amsgrad": True, "foreach": True}, + {"lr": 0.1, "weight_decay": 0.05, "amsgrad": True}, { "lr": 0.1, "weight_decay": 0.05, "maximize": True, "amsgrad": True, - "foreach": True, }, {"lr": 0.1, "fused": True}, {"lr": 0.1, "weight_decay": 0.05, "amsgrad": True, "fused": True}, @@ -132,16 +135,15 @@ def test_adamw_1d_sharding(self): mesh = DeviceMesh(self.device_type, list(range(self.world_size))) adamw_configs = [ - {"lr": 0.1}, + {"lr": 0.1, "foreach": False}, + {"lr": 0.1, "weight_decay": 0.05, "foreach": False}, {"lr": 0.1, "weight_decay": 0.05}, - {"lr": 0.1, "weight_decay": 0.05, "foreach": True}, { "lr": 0.1, "betas": (0.6, 0.66), "eps": 1e-6, "weight_decay": 0.05, "amsgrad": True, - "foreach": True, }, { "lr": 0.1, @@ -150,7 +152,6 @@ def test_adamw_1d_sharding(self): "weight_decay": 0.05, "maximize": True, "amsgrad": True, - "foreach": True, }, {"lr": 0.1, "weight_decay": 0.05, "fused": True}, { @@ -191,16 +192,17 @@ def test_sgd_1d_sharding(self): mesh = DeviceMesh(self.device_type, list(range(self.world_size))) sgd_configs = [ - {"lr": 0.1}, + {"lr": 0.1, "foreach": False}, + {"lr": 0.1, "momentum": 0.05, "foreach": False}, {"lr": 0.1, "momentum": 0.05}, - {"lr": 0.1, "momentum": 0.05, "foreach": True}, - {"lr": 0.1, "momentum": 0.06, "dampening": 0.07, "foreach": True}, + {"lr": 0.1, "momentum": 0.06, "dampening": 0.07}, { "lr": 0.1, "momentum": 0.08, "weight_decay": 0.05, "nesterov": True, "maximize": True, + "foreach": False, }, { "lr": 0.1, @@ -208,7 +210,6 @@ def test_sgd_1d_sharding(self): "weight_decay": 0.05, "nesterov": True, "maximize": True, - "foreach": True, }, ] @@ -231,14 +232,15 @@ def test_adagrad_1d_sharding(self): mesh = DeviceMesh(self.device_type, list(range(self.world_size))) adagrad_configs = [ - {"lr": 0.1}, - {"lr": 0.1, "lr_decay": 0.05}, - {"lr": 0.1, "lr_decay": 0.02, "weight_decay": 0.05}, + {"lr": 0.1, "foreach": False}, + {"lr": 0.1, "lr_decay": 0.05, "foreach": False}, + {"lr": 0.1, "lr_decay": 0.02, "weight_decay": 0.05, "foreach": False}, { "lr": 0.1, "lr_decay": 0.02, "weight_decay": 0.05, "initial_accumulator_value": 0.03, + "foreach": False, }, { "lr": 0.1, @@ -246,6 +248,7 @@ def test_adagrad_1d_sharding(self): "weight_decay": 0.05, "initial_accumulator_value": 0.03, "eps": 1e-6, + "foreach": False, }, { "lr": 0.1, @@ -254,6 +257,7 @@ def test_adagrad_1d_sharding(self): "initial_accumulator_value": 
0.03, "eps": 1e-6, "maximize": True, + "foreach": False, }, { "lr": 0.1, @@ -262,7 +266,6 @@ def test_adagrad_1d_sharding(self): "initial_accumulator_value": 0.03, "eps": 1e-6, "maximize": True, - "foreach": True, }, ] @@ -285,16 +288,23 @@ def test_RMSprop_1d_sharding(self): mesh = DeviceMesh(self.device_type, list(range(self.world_size))) RMSprop_configs = [ - {"lr": 0.1}, - {"lr": 0.1, "alpha": 0.85}, - {"lr": 0.1, "alpha": 0.88, "eps": 1e-6}, - {"lr": 0.1, "alpha": 0.88, "eps": 1e-6, "weight_decay": 0.05}, + {"lr": 0.1, "foreach": False}, + {"lr": 0.1, "alpha": 0.85, "foreach": False}, + {"lr": 0.1, "alpha": 0.88, "eps": 1e-6, "foreach": False}, + { + "lr": 0.1, + "alpha": 0.88, + "eps": 1e-6, + "weight_decay": 0.05, + "foreach": False, + }, { "lr": 0.1, "alpha": 0.88, "eps": 1e-6, "weight_decay": 0.05, "momentum": 0.9, + "foreach": False, }, { "lr": 0.1, @@ -303,6 +313,7 @@ def test_RMSprop_1d_sharding(self): "weight_decay": 0.05, "momentum": 0.9, "centered": True, + "foreach": False, }, { "lr": 0.1, @@ -312,6 +323,7 @@ def test_RMSprop_1d_sharding(self): "momentum": 0.9, "centered": True, "maximize": True, + "foreach": False, }, { "lr": 0.1, @@ -321,7 +333,6 @@ def test_RMSprop_1d_sharding(self): "momentum": 0.9, "centered": True, "maximize": True, - "foreach": True, }, ] @@ -344,23 +355,27 @@ def test_adadelta_1d_sharding(self): mesh = DeviceMesh(self.device_type, list(range(self.world_size))) adadelta_configs = [ - {"lr": 0.1}, - {"lr": 0.1, "rho": 0.85}, - {"lr": 0.1, "rho": 0.88, "eps": 1e-5}, - {"lr": 0.1, "rho": 0.88, "eps": 1e-6, "weight_decay": 0.05}, + {"lr": 0.1, "foreach": False}, + {"lr": 0.1, "rho": 0.85, "foreach": False}, + {"lr": 0.1, "rho": 0.88, "eps": 1e-5, "foreach": False}, + { + "lr": 0.1, + "rho": 0.88, + "eps": 1e-6, + "weight_decay": 0.05, + "foreach": False, + }, { "lr": 0.1, "rho": 0.88, "eps": 1e-6, "weight_decay": 0.05, - "foreach": True, }, { "lr": 0.1, "rho": 0.88, "eps": 1e-6, "weight_decay": 0.05, - "foreach": True, "maximize": True, }, ] @@ -384,15 +399,14 @@ def test_nadam_1d_sharding(self): mesh = DeviceMesh(self.device_type, list(range(self.world_size))) nadam_configs = [ - {"lr": 0.1}, + {"lr": 0.1, "foreach": False}, + {"lr": 0.1, "weight_decay": 0.05, "foreach": False}, {"lr": 0.1, "weight_decay": 0.05}, - {"lr": 0.1, "weight_decay": 0.05, "foreach": True}, { "lr": 0.1, "betas": (0.6, 0.66), "eps": 1e-6, "weight_decay": 0.05, - "foreach": True, }, { "lr": 0.1, @@ -400,7 +414,6 @@ def test_nadam_1d_sharding(self): "eps": 1e-6, "weight_decay": 0.05, "decoupled_weight_decay": True, - "foreach": True, }, ] @@ -423,15 +436,17 @@ def test_radam_1d_sharding(self): mesh = DeviceMesh(self.device_type, list(range(self.world_size))) radam_configs = [ - {"lr": 0.1}, - {"lr": 0.1, "weight_decay": 0.05}, - {"lr": 0.1, "weight_decay": 0.05, "foreach": True}, + {"lr": 0.1, "foreach": False}, + {"lr": 0.1, "weight_decay": 0.05, "foreach": False}, + { + "lr": 0.1, + "weight_decay": 0.05, + }, { "lr": 0.1, "betas": (0.6, 0.66), "eps": 1e-6, "weight_decay": 0.05, - "foreach": True, }, { "lr": 0.1, @@ -439,7 +454,6 @@ def test_radam_1d_sharding(self): "eps": 1e-6, "weight_decay": 0.05, "decoupled_weight_decay": True, - "foreach": True, }, ] @@ -462,23 +476,27 @@ def test_adamax_1d_sharding(self): mesh = DeviceMesh(self.device_type, list(range(self.world_size))) adamax_configs = [ - {"lr": 0.1}, - {"lr": 0.1, "betas": (0.6, 0.66)}, - {"lr": 0.1, "betas": (0.6, 0.66), "eps": 1e-6}, - {"lr": 0.1, "betas": (0.6, 0.66), "eps": 1e-6, "weight_decay": 0.05}, + {"lr": 0.1, 
"foreach": False}, + {"lr": 0.1, "betas": (0.6, 0.66), "foreach": False}, + {"lr": 0.1, "betas": (0.6, 0.66), "eps": 1e-6, "foreach": False}, + { + "lr": 0.1, + "betas": (0.6, 0.66), + "eps": 1e-6, + "weight_decay": 0.05, + "foreach": False, + }, { "lr": 0.1, "betas": (0.6, 0.66), "eps": 1e-6, "weight_decay": 0.05, - "foreach": True, }, { "lr": 0.1, "betas": (0.6, 0.66), "eps": 1e-6, "weight_decay": 0.05, - "foreach": True, "maximize": True, }, ] @@ -502,11 +520,18 @@ def test_asgd_1d_sharding(self): mesh = DeviceMesh(self.device_type, list(range(self.world_size))) asgd_configs = [ - {"lr": 0.1}, - {"lr": 0.1, "lambd": 0.001}, - {"lr": 0.1, "lambd": 0.001, "alpha": 0.85}, - {"lr": 0.1, "lambd": 0.001, "alpha": 0.85, "t0": 1e5}, - {"lr": 0.1, "lambd": 0.001, "alpha": 0.85, "t0": 1e5, "weight_decay": 0.05}, + {"lr": 0.1, "foreach": False}, + {"lr": 0.1, "lambd": 0.001, "foreach": False}, + {"lr": 0.1, "lambd": 0.001, "alpha": 0.85, "foreach": False}, + {"lr": 0.1, "lambd": 0.001, "alpha": 0.85, "t0": 1e5, "foreach": False}, + { + "lr": 0.1, + "lambd": 0.001, + "alpha": 0.85, + "t0": 1e5, + "weight_decay": 0.05, + "foreach": False, + }, { "lr": 0.1, "lambd": 0.001, diff --git a/test/distributed/elastic/agent/server/test/api_test.py b/test/distributed/elastic/agent/server/test/api_test.py index e1dd16bcf9650..5b5ae3e3cb620 100644 --- a/test/distributed/elastic/agent/server/test/api_test.py +++ b/test/distributed/elastic/agent/server/test/api_test.py @@ -126,7 +126,9 @@ def __init__(self, spec): self.stop_workers_call_count = 0 self.start_workers_call_count = 0 - def _stop_workers(self, worker_group: WorkerGroup) -> None: + def _stop_workers( + self, worker_group: WorkerGroup, is_restart: bool = False + ) -> None: # workers are fake, nothing to stop; just clear the rdzv info worker_group.group_rank = None worker_group.group_world_size = None diff --git a/test/distributed/elastic/multiprocessing/api_test.py b/test/distributed/elastic/multiprocessing/api_test.py index 9658ed087ab05..75e903807ff9b 100644 --- a/test/distributed/elastic/multiprocessing/api_test.py +++ b/test/distributed/elastic/multiprocessing/api_test.py @@ -465,6 +465,30 @@ def test_function_raise(self): self.assertTrue(pc._stderr_tail.stopped()) self.assertTrue(pc._stdout_tail.stopped()) + def test_wait_for_all_child_procs_to_exit(self): + """ + Tests that MultiprocessingContext actually waits for + the child process to exit (not just that the entrypoint fn has + finished running). 
+ """ + + mpc = MultiprocessContext( + name="echo", + entrypoint=echo0, + args={}, + envs={}, + start_method="spawn", + logs_specs=DefaultLogsSpecs(log_dir=self.log_dir()), + ) + + with mock.patch.object( + mpc, "_is_done", return_value=True + ), mock.patch.object(mpc, "_pc"), mock.patch.object( + mpc._pc, "join", side_effect=[True, False, False, True] + ) as mock_join: + mpc._poll() + self.assertEqual(4, mock_join.call_count) + ######################################## # start_processes as binary tests ######################################## diff --git a/test/dynamo_expected_failures/TestAOTAutograd.test_set__and_data_mutation_good b/test/distributed/pipelining/__init__.py similarity index 100% rename from test/dynamo_expected_failures/TestAOTAutograd.test_set__and_data_mutation_good rename to test/distributed/pipelining/__init__.py diff --git a/test/distributed/pipelining/model_registry.py b/test/distributed/pipelining/model_registry.py new file mode 100644 index 0000000000000..f88bebd3a5598 --- /dev/null +++ b/test/distributed/pipelining/model_registry.py @@ -0,0 +1,61 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# Owner(s): ["oncall: distributed"] +# This file is a model zoo for testing torch.distributed.pipelining. +import torch +from torch.distributed.pipelining import pipe_split + + +class ExampleCode(torch.nn.Module): + default_dhid = 512 + default_batch_size = 256 + + def __init__(self, d_hid: int = default_dhid): + super().__init__() + self.mm_param0 = torch.nn.Parameter(torch.randn(d_hid, d_hid)) + self.mm_param1 = torch.nn.Parameter(torch.randn(d_hid, d_hid)) + self.register_buffer("cval", torch.randn((d_hid,), requires_grad=False)) + self.lin0 = torch.nn.Linear(d_hid, d_hid) + self.lin1 = torch.nn.Linear(d_hid, d_hid) + + def forward(self, x, y=torch.zeros(default_batch_size, default_dhid)): + x = torch.mm(x, self.mm_param0) + x = x + y + x = torch.relu(x) + # try passing a value that doesn't require_grad across skip boundaries + a_constant = self.cval.clone() + x = self.lin0(x) + pipe_split() + x = torch.relu(x) + a_constant + x = torch.mm(x, self.mm_param1) + x = self.lin1(x) + x = torch.relu(x) + return x + + +# MLP Layer +class MLPModule(torch.nn.Module): + def __init__(self, d_hid): + super().__init__() + self.net1 = torch.nn.Linear(d_hid, d_hid) + self.relu = torch.nn.ReLU() + self.net2 = torch.nn.Linear(d_hid, d_hid) + + def forward(self, x): + x = self.net1(x) + x = self.relu(x) + x = self.net2(x) + return x + + +# Multi-MLP model +class MultiMLP(torch.nn.Module): + def __init__(self, d_hid): + super().__init__() + self.mlp0 = MLPModule(d_hid) + self.mlp1 = MLPModule(d_hid) + + def forward(self, x): + x = self.mlp0(x) + pipe_split() + x = self.mlp1(x) + return x diff --git a/test/distributed/pipelining/test_pipe.py b/test/distributed/pipelining/test_pipe.py index c966a20b3cbc0..74d13111bec73 100644 --- a/test/distributed/pipelining/test_pipe.py +++ b/test/distributed/pipelining/test_pipe.py @@ -1,8 +1,15 @@ # Copyright (c) Meta Platforms, Inc. 
and affiliates # Owner(s): ["oncall: distributed"] import torch + +from model_registry import MLPModule from torch.distributed.pipelining import pipe_split, pipeline -from torch.testing._internal.common_utils import run_tests, TestCase +from torch.testing._internal.common_utils import ( + instantiate_parametrized_tests, + parametrize, + run_tests, + TestCase, +) d_hid = 512 @@ -39,21 +46,6 @@ def forward(self, x, y): return x -# MLP example -class MLPModule(torch.nn.Module): - def __init__(self, d_hid): - super().__init__() - self.net1 = torch.nn.Linear(d_hid, d_hid) - self.relu = torch.nn.ReLU() - self.net2 = torch.nn.Linear(d_hid, d_hid) - - def forward(self, x): - x = self.net1(x) - x = self.relu(x) - x = self.net2(x) - return x - - class MultiMLP(torch.nn.Module): def __init__(self): super().__init__() @@ -74,8 +66,9 @@ def forward(self, x, y): class PipeTests(TestCase): - def _test_model_split(self, model_class): - mod = model_class() + @parametrize("ModelClass", [ExampleCode, MultiMLP]) + def test_model_split(self, ModelClass): + mod = ModelClass() x = torch.randn(batch_size, d_hid) y = torch.randn(batch_size, d_hid) @@ -108,12 +101,8 @@ def _test_model_split(self, model_class): """ print("Qualname check passed") - def test_example_code(self): - self._test_model_split(ExampleCode) - - def test_multi_mlp(self): - self._test_model_split(MultiMLP) +instantiate_parametrized_tests(PipeTests) if __name__ == "__main__": run_tests() diff --git a/test/distributed/pipelining/test_schedule.py b/test/distributed/pipelining/test_schedule.py index e8e37bcf208f8..8357f3b66108d 100644 --- a/test/distributed/pipelining/test_schedule.py +++ b/test/distributed/pipelining/test_schedule.py @@ -1,13 +1,15 @@ # Copyright (c) Meta Platforms, Inc. and affiliates # Owner(s): ["oncall: distributed"] +import copy import os import sys import tempfile import torch import torch.distributed as dist + +from model_registry import ExampleCode, MultiMLP from torch.distributed.pipelining import ( - pipe_split, pipeline, PipelineStage, Schedule1F1B, @@ -32,30 +34,6 @@ torch.manual_seed(0) -class ExampleCode(torch.nn.Module): - def __init__(self): - super().__init__() - self.mm_param0 = torch.nn.Parameter(torch.randn(d_hid, d_hid)) - self.mm_param1 = torch.nn.Parameter(torch.randn(d_hid, d_hid)) - self.register_buffer("cval", torch.randn((d_hid,), requires_grad=False)) - self.lin0 = torch.nn.Linear(d_hid, d_hid) - self.lin1 = torch.nn.Linear(d_hid, d_hid) - - def forward(self, x, y=torch.zeros(batch_size, d_hid)): - x = torch.mm(x, self.mm_param0) - x = x + y - x = torch.relu(x) - # try passing a value that doesn't require_grad across skip boundaries - a_constant = self.cval.clone() - x = self.lin0(x) - pipe_split() - x = torch.relu(x) + a_constant - x = torch.mm(x, self.mm_param1) - x = self.lin1(x) - x = torch.relu(x) - return x - - class ScheduleTest(MultiProcContinousTest): @classmethod def backend_str(cls) -> str: @@ -78,7 +56,7 @@ def test_ec_forward(self): # Setting this flag for numerical stability torch.distributed.pipelining.microbatch._debug_mask_minibatches = True - mod = ExampleCode() + mod = ExampleCode(d_hid) mod.to(self.device) x = torch.randn(batch_size, d_hid, device=self.device) @@ -125,7 +103,7 @@ def test_ec_forward(self): @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") @parametrize("ScheduleClass", [ScheduleGPipe, Schedule1F1B]) def test_ec_backward(self, ScheduleClass): - mod = ExampleCode() + mod = ExampleCode(d_hid) mod.to(self.device) x = torch.randn(batch_size, 
d_hid, device=self.device) @@ -168,6 +146,79 @@ def test_ec_backward(self, ScheduleClass): torch.testing.assert_close(out, ref_out, rtol=1e-2, atol=5e-3) torch.testing.assert_close(pipe_loss, ref_loss) + @requires_nccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") + @parametrize("ScheduleClass", [ScheduleGPipe, Schedule1F1B]) + def test_grad(self, ScheduleClass): + mod = MultiMLP(d_hid) + mod.to(self.device) + + ref_mod = copy.deepcopy(mod) + x = torch.randn(batch_size, d_hid, device=self.device) + with torch.no_grad(): + y = ref_mod(x) + # Add a small perturbation + target = y + torch.randn(batch_size, d_hid, device=self.device) + + loss_fn = torch.nn.MSELoss(reduction="sum") + + # Run reference + for _ in range(2): + ref_mod.zero_grad() + ref_out = ref_mod(x) + ref_loss = loss_fn(ref_out, target) + ref_loss.backward() + + # Create a pipeline + pipe = pipeline( + mod, + chunks, + example_args=(x,), + ) + + stage = PipelineStage( + pipe, + self.rank, + device=self.device, + ) + + # Attach to a schedule + schedule = ScheduleClass(stage, chunks, loss_fn=loss_fn) + + # Run + stage_module = pipe.get_stage_module(self.rank) + for _ in range(2): + # Zero gradients + stage_module.zero_grad() + if self.rank == 0: + schedule.step(x) + elif self.rank == self.world_size - 1: + losses = [] + out = schedule.step(target=target, losses=losses) + else: + schedule.step() + + dist.barrier() + + # Last rank checks result + if self.rank == self.world_size - 1: + # Check output + torch.testing.assert_close(out, ref_out) + # Check loss + # Since the reduction used in the loss function above is "sum", we use + # "sum" here to reduce microbatch losses into a single value too. + pipe_loss = sum(losses) + torch.testing.assert_close(pipe_loss, ref_loss) + + # Every rank checks gradients + for name, p in stage_module.named_parameters(): + ref_p = ref_mod.get_parameter(name) + try: + torch.testing.assert_close(p.grad, ref_p.grad, rtol=1e-5, atol=4e-5) + except AssertionError: + print(f"Gradient test failed for {name}: {p.grad} vs {ref_p.grad}") + raise + instantiate_parametrized_tests(ScheduleTest) diff --git a/test/distributed/pipelining/test_stage_backward.py b/test/distributed/pipelining/test_stage_backward.py index 358607ab91c3f..5791f40c6102a 100644 --- a/test/distributed/pipelining/test_stage_backward.py +++ b/test/distributed/pipelining/test_stage_backward.py @@ -3,6 +3,8 @@ import copy import torch + +from model_registry import MLPModule from torch.distributed.pipelining._backward import stage_backward from torch.testing._internal.common_utils import run_tests, TestCase @@ -11,20 +13,6 @@ batch_size = 256 -class MLPModule(torch.nn.Module): - def __init__(self, d_hid): - super().__init__() - self.net1 = torch.nn.Linear(d_hid, d_hid) - self.relu = torch.nn.ReLU() - self.net2 = torch.nn.Linear(d_hid, d_hid) - - def forward(self, x): - x = self.net1(x) - x = self.relu(x) - x = self.net2(x) - return x - - class StageBackwardTests(TestCase): def test_stage_backward(self): # MLP as a stage module @@ -65,8 +53,6 @@ def test_stage_backward(self): print(f"Gradient test failed for {name}: {p.grad} vs {ref_p.grad}") raise - print("Stage backward test passed") - if __name__ == "__main__": run_tests() diff --git a/test/distributed/tensor/parallel/test_tp_examples.py b/test/distributed/tensor/parallel/test_tp_examples.py index cd038dbbb2737..eb5e6b5e5a1df 100644 --- a/test/distributed/tensor/parallel/test_tp_examples.py +++ b/test/distributed/tensor/parallel/test_tp_examples.py @@ 
-262,8 +262,11 @@ def test_transformer_training(self, is_seq_parallel=False): # Ensure model weights are still the same after update. optim.step() - with CommDebugMode() as comm_mode: - optim_tp.step() + from torch.distributed._tensor.experimental import implicit_replication + + with implicit_replication(): + with CommDebugMode() as comm_mode: + optim_tp.step() self._check_module(model, model_tp) if is_seq_parallel: self.assertDictEqual( diff --git a/test/dynamo/test_functions.py b/test/dynamo/test_functions.py index 393f81cb5e7ca..fdb23e3f590f3 100644 --- a/test/dynamo/test_functions.py +++ b/test/dynamo/test_functions.py @@ -642,6 +642,13 @@ def test_get_autocast_gpu_dtype(x): dtype = torch.get_autocast_gpu_dtype() return x.type(dtype) + @make_test + def test_is_any_autocast_enabled(x): + if torch._C._is_any_autocast_enabled(): + return x + 1 + else: + return x - 1 + @make_test def test_list_compare_polyfill(x): for a, b, c in [ @@ -1170,6 +1177,19 @@ def test_set_contains(a, b): y = a - b return x, y + def test_set_isdisjoint(self): + x = {"apple", "banana", "cherry"} + y = {"google", "microsoft", "apple"} + + def fn(a): + if x.isdisjoint(y): + return a + 1 + else: + return a - 1 + + test = make_test(fn) + test(self) + @make_test def test_tuple_iadd(a, b): output = (a, b) @@ -1317,6 +1337,13 @@ def isinstance_namedtuple(obj) -> bool: else: return a - b + @make_test + def test_torch_size_hasattr(x): + if hasattr(x.shape, "_fields"): + return x + 1 + else: + return x - 1 + @make_test def test_is_quantized(a, b): if not a.is_quantized: diff --git a/test/dynamo/test_higher_order_ops.py b/test/dynamo/test_higher_order_ops.py index c0b5bfc595363..880e761037cd9 100644 --- a/test/dynamo/test_higher_order_ops.py +++ b/test/dynamo/test_higher_order_ops.py @@ -5894,7 +5894,7 @@ def wrapper_fn(x, in_dims): actual = opt(x, 0), opt(x, 1), opt(x, 2) self.assertEqual(expected, actual) self.assertEqual(cnt.frame_count, 3) - self.assertEqual(cnt.op_count, 33) + self.assertEqual(cnt.op_count, 27) def test_vmap_multiple_invocation_out_dims(self): counters.clear() @@ -5910,7 +5910,7 @@ def wrapper_fn(x, out_dims): actual = opt(x, 0), opt(x, 1), opt(x, 2) self.assertEqual(expected, actual) self.assertEqual(cnt.frame_count, 3) - self.assertEqual(cnt.op_count, 30) + self.assertEqual(cnt.op_count, 27) def test_vmap_new_tensor_in_body(self): def fn(x): diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py index b46ab432831dc..a70e5767f3d64 100644 --- a/test/dynamo/test_misc.py +++ b/test/dynamo/test_misc.py @@ -32,6 +32,7 @@ import torch.onnx.operators import torch.utils._pytree as pytree +from torch import Tensor from torch._C import FileCheck from torch._dynamo import allow_in_graph, bytecode_analysis, bytecode_transformation from torch._dynamo.eval_frame import _debug_get_cache_entry_list @@ -864,7 +865,7 @@ def fn(x): return x + y torch._dynamo.testing.standard_test( - self, fn, 1, expected_ops=1, expected_ops_dynamic=ifdynstaticdefault(1, 10) + self, fn, 1, expected_ops=1, expected_ops_dynamic=ifdynstaticdefault(1, 4) ) def test_int_int_comparisons(self): @@ -907,10 +908,8 @@ def fn(x): out = 1 return x + out - # expect for dynamic: size, index, 6 comparison ops, add - torch._dynamo.testing.standard_test( - self, fn, 1, expected_ops=1, expected_ops_dynamic=ifdynstaticdefault(1, 9) - ) + # TODO: Test the guards maybe? 
+ torch._dynamo.testing.standard_test(self, fn, 1, expected_ops=1) def test_int_shape_comparisons(self): def fn(x): @@ -932,10 +931,8 @@ def fn(x): out = 1 return x + out - # expect for dynamic: size, index, 6 comparison ops, add - torch._dynamo.testing.standard_test( - self, fn, 1, expected_ops=1, expected_ops_dynamic=ifdynstaticdefault(1, 9) - ) + # TODO: Test the guards maybe? + torch._dynamo.testing.standard_test(self, fn, 1, expected_ops=1) def test_param_shape_binops(self): class MyModule(torch.nn.Module): @@ -1262,13 +1259,11 @@ def fn(x): y.add_(1.0) return y - # expect extra size node for dynamic torch._dynamo.testing.standard_test( self, fn, 1, expected_ops=20, - expected_ops_dynamic=ifdynstaticdefault(20, 21), ) def test_empty_list(self): @@ -1658,7 +1653,7 @@ def fn(a, b): opt_fn = torch._dynamo.optimize(cnts)(fn) self.assertEqual(opt_fn(v1, v2), correct) self.assertEqual(cnts.frame_count, 1) - self.assertEqual(cnts.op_count, 3) + self.assertEqual(cnts.op_count, 4) @patch.object(torch._dynamo.config, "capture_scalar_outputs", False) def test_tensor_item_no_capture(self): @@ -1738,13 +1733,11 @@ def fn(a): a += 1 return a - # expect 1 more op (size call) for dynamic return torch._dynamo.testing.standard_test( self, fn=fn, nargs=1, expected_ops=9, - expected_ops_dynamic=ifdynstaticdefault(9, 10), ) def test_build_tuple_unpack(self): @@ -4337,7 +4330,7 @@ def forward(self, x, ref_id): if torch._dynamo.config.assume_static_by_default: self.assertExpectedInline(cnts.op_count, """2""") else: - self.assertExpectedInline(cnts.op_count, """3""") + self.assertExpectedInline(cnts.op_count, """2""") torch._dynamo.reset() cnts = torch._dynamo.testing.CompileCounter() @@ -4347,7 +4340,7 @@ def forward(self, x, ref_id): if torch._dynamo.config.assume_static_by_default: self.assertExpectedInline(cnts.op_count, """1""") else: - self.assertExpectedInline(cnts.op_count, """2""") + self.assertExpectedInline(cnts.op_count, """1""") def test_inline_func_jump_on_tensor_condition(self): def f1(input): @@ -8518,6 +8511,28 @@ def f(lengths, values): f(torch.tensor([2, 3, 4]), torch.randn(9)) + @torch._dynamo.config.patch( + capture_scalar_outputs=True, capture_dynamic_output_shape_ops=True + ) + def test_unbacked_auto_functionalize_op(self): + @torch.library.custom_op( + "mylib::mk_image", mutates_args=("decoder",), device_types=["cpu"] + ) + def mk_image(decoder: Tensor) -> Tensor: + return torch.randn(2, 3, 4, 5) + + @torch.library.register_fake("mylib::mk_image") + def _(decoder: Tensor) -> Tensor: + image_size = [torch.library.get_ctx().new_dynamic_size() for _ in range(4)] + return torch.empty(image_size) + + @torch.compile(fullgraph=True) + def f(x): + return torch.ops.mylib.mk_image.default(x) + + x = torch.zeros(100, dtype=torch.int64) + f(x) + @torch._dynamo.config.patch(capture_scalar_outputs=True) def test_runtime_assert_replacement(self): @torch.compile(backend="aot_eager") @@ -9614,7 +9629,7 @@ def test_shape_env_equal_unbacked(self): ShapeEnv not equal: field values don't match: ==> name_to_node: values don't match. - > Left: {f0, u0, u1} + > Left: {u0, u1, zuf0} > Right: {} ==> unbacked_symfloat_counter: values don't match. > Left: 1 @@ -9623,7 +9638,7 @@ def test_shape_env_equal_unbacked(self): > Left: 2 > Right: 0 ==> var_to_range: values don't match. 
- > Left: {f0: ValueRanges(lower=-oo, upper=oo, is_bool=False), u0: ValueRanges(lower=-9223372036854775808, upper=9223372036854775807, is_bool=False), u1: ValueRanges(lower=0, upper=1, is_bool=False)} + > Left: {u0: ValueRanges(lower=-9223372036854775808, upper=9223372036854775807, is_bool=False), u1: ValueRanges(lower=0, upper=1, is_bool=False), zuf0: ValueRanges(lower=-oo, upper=oo, is_bool=False)} > Right: {} """, ) diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py index d4233ac8e0e3b..8ecfe493650d7 100644 --- a/test/dynamo/test_repros.py +++ b/test/dynamo/test_repros.py @@ -962,7 +962,7 @@ def test_do_paste_mask(self): ) # (dynamic shapes, static shapes) self.assertIn(cnt.frame_count, (5, 7)) - self.assertIn(cnt.op_count, (106, 127)) + self.assertIn(cnt.op_count, (104, 106, 127)) def test_convert_boxes_to_pooler_format(self): boxes1 = [ @@ -989,7 +989,7 @@ def test_convert_boxes_to_pooler_format(self): self.assertExpectedInline(cnt.op_count, """10""") else: self.assertExpectedInline(cnt.frame_count, """4""") - self.assertExpectedInline(cnt.op_count, """16""") + self.assertExpectedInline(cnt.op_count, """14""") def test_boxes_len(self): def fn(boxes): @@ -1194,7 +1194,7 @@ def test_hf_t5_forward(self): self.assertExpectedInline(cnt.op_count, """11""") else: self.assertExpectedInline(cnt.frame_count, """1""") - self.assertExpectedInline(cnt.op_count, """12""") + self.assertExpectedInline(cnt.op_count, """11""") def test_module_in_skipfiles(self): model = nn.Linear(10, 10) @@ -4540,29 +4540,10 @@ def f(x): """\ def forward(self, s0 : torch.SymInt, s1 : torch.SymInt, L_x_ : torch.Tensor): l_x_ = L_x_ - size = l_x_.size() - getitem = size[0]; size = None - gt = getitem > 3; getitem = None getitem_2 = l_x_[0] sum_1 = getitem_2.sum(); getitem_2 = None gt_1 = sum_1 > 0; sum_1 = None _assert_async = torch._assert_async(gt_1, 'assertion error'); gt_1 = None - size_1 = l_x_.size() - getitem_3 = size_1[0]; size_1 = None - floordiv = getitem_3 // 2; getitem_3 = None - mod = 1 % floordiv; floordiv = None - ne = mod != 0; mod = None - size_2 = l_x_.size() - getitem_5 = size_2[0]; size_2 = None - floordiv_1 = getitem_5 // 2; getitem_5 = None - pow_1 = floordiv_1 ** 2; floordiv_1 = None - mul = 32 * pow_1; pow_1 = None - size_3 = l_x_.size() - getitem_7 = size_3[0]; size_3 = None - floordiv_2 = getitem_7 // 2; getitem_7 = None - mul_1 = 16 * floordiv_2; floordiv_2 = None - sub = mul - mul_1; mul = mul_1 = None - ne_1 = sub != 0; sub = None cos = l_x_.cos(); l_x_ = None return (cos,)""", ) @@ -4933,6 +4914,40 @@ def ladder(x): opt_ladder = torch.compile(ladder, fullgraph=True, backend="eager") self.assertEqual(opt_ladder(data), ladder(data)) + @unittest.expectedFailure + def test_trace_functional_tensor_with_error(self): + from torch._subclasses.fake_tensor import FakeTensorMode + from torch._subclasses.functional_tensor import ( + FunctionalTensor, + FunctionalTensorMode, + ) + + def f(a, tmp): + a_view = a.view(-1) + with torch.no_grad(): + a.set_(tmp) + a_view.mul_(2) + return a + tmp + + fake_mode = FakeTensorMode() + with FunctionalTensorMode(): + inp = torch.ones(3, 3, requires_grad=True) + inp = fake_mode.from_tensor(inp, static_shapes=True) + inp = FunctionalTensor.to_functional(inp) + + tmp = torch.ones(3, 3, requires_grad=True) + tmp = fake_mode.from_tensor(tmp, static_shapes=True) + tmp = FunctionalTensor.to_functional(tmp) + + opt_f = torch.compile(f, backend="eager") + with self.assertRaisesRegex( + RuntimeError, "cannot mutate tensors with frozen storage" + ): + 
opt_f(inp, tmp) + + # grad state may not be properly reset after the error + self.assertTrue(torch.is_grad_enabled()) + def test_const_dict_keyerror(self): d = {} diff --git a/test/dynamo/test_subgraphs.py b/test/dynamo/test_subgraphs.py index babc33d29a96d..aae13c92e0586 100644 --- a/test/dynamo/test_subgraphs.py +++ b/test/dynamo/test_subgraphs.py @@ -439,7 +439,7 @@ def fn(a, b): x = x / (a + b) return x - self._common(fn, 1, 6) + self._common(fn, 1, 5) # item gets DCE'd @patch.object(torch._dynamo.config, "capture_scalar_outputs", False) def test_graph_break_on_item(self): diff --git a/test/dynamo/test_unspec.py b/test/dynamo/test_unspec.py index 83443a5a55763..9b49f5ff8bb6a 100644 --- a/test/dynamo/test_unspec.py +++ b/test/dynamo/test_unspec.py @@ -11,6 +11,7 @@ from torch._dynamo.comptime import comptime from torch._dynamo.testing import CompileCounter, same +from torch.testing._internal.logging_utils import logs_to_string # The intention of this test file is you should put test cases specifically @@ -485,6 +486,41 @@ def fn(x): compl_fn = torch.compile(fn, dynamic=True, backend="eager") self.assertEqual(compl_fn(inputs), fn(inputs)) + @torch._dynamo.config.patch(specialize_float=False, assume_static_by_default=True) + def test_unspec_float_input(self): + cnts = torch._dynamo.testing.CompileCounter() + + def f(x, y): + if y == 5.0: + return x + 2 + else: + return x + y + + cf = torch.compile(backend=cnts, fullgraph=True)(f) + + x = torch.randn(3) + self.assertEqual(f(x, 3.0), cf(x, 3.0)) + self.assertEqual(f(x, 4.0), cf(x, 4.0)) + self.assertExpectedInline(cnts.frame_count, """1""") # no recompile + self.assertEqual(f(x, 5.0), cf(x, 5.0)) + self.assertExpectedInline(cnts.frame_count, """2""") # guard worked + self.assertEqual(f(x, math.nan), cf(x, math.nan)) + self.assertExpectedInline(cnts.frame_count, """3""") # nan always recompiles + + @torch._dynamo.config.patch(specialize_float=False, assume_static_by_default=True) + def test_unspec_float_output(self): + cnts = torch._dynamo.testing.CompileCounter() + + def f(x, y): + return x + 1, y * 2 + + cf = torch.compile(backend=cnts, fullgraph=True)(f) + x = torch.randn(3) + + self.assertEqual(f(x, 3.0), cf(x, 3.0)) + self.assertEqual(f(x, 4.0), cf(x, 4.0)) + self.assertEqual(f(x, 5.0), cf(x, 5.0)) + @torch._dynamo.config.patch(capture_scalar_outputs=True) def test_data_dependent_evaluate_expr_graph_break(self): cnts = torch._dynamo.testing.CompileCounter() @@ -511,7 +547,26 @@ def fn(x): fn(x) self.assertExpectedInline(cnts.frame_count, """2""") - self.assertExpectedInline(cnts.op_count, """3""") + self.assertExpectedInline(cnts.op_count, """4""") + + def test_prune_torch_check(self): + log_stream, ctx = logs_to_string("torch._dynamo.output_graph", "graph_code") + + @torch.compile(fullgraph=True, dynamic=True, backend="eager") + def f(x, y): + torch._check(y + 5 == 85) + torch._check(x.size(0) == 80) + + with ctx(): + f(torch.randn(80, 100), 80) + + out = "\n".join(log_stream.getvalue().strip().split("\n")[3:]).strip() + self.assertExpectedInline( + out, + """\ +def forward(self): + return ()""", + ) @torch._dynamo.config.patch(capture_scalar_outputs=True) def test_split_aot_autograd(self): diff --git a/test/dynamo_expected_failures/TestOldSerialization.test_serialization_filelike_api_requirements b/test/dynamo_expected_failures/TestOldSerialization.test_serialization_filelike_api_requirements new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/test/dynamo_expected_failures/TestProfiler.test_profiler_strides 
b/test/dynamo_expected_failures/TestProfiler.test_profiler_strides new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/test/dynamo_expected_failures/TestSerialization.test_serialization_filelike_api_requirements b/test/dynamo_expected_failures/TestSerialization.test_serialization_filelike_api_requirements new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/test/expect/HasDecompTest.test_has_decomposition.expect b/test/expect/HasDecompTest.test_has_decomposition.expect index 98c8fa664cb98..bbd8475fd6802 100644 --- a/test/expect/HasDecompTest.test_has_decomposition.expect +++ b/test/expect/HasDecompTest.test_has_decomposition.expect @@ -335,9 +335,6 @@ aten::_functional_assert_async.msg aten::_functional_assert_scalar aten::_functional_sym_constrain_range aten::_functional_sym_constrain_range_for_size -aten::_fused_adagrad -aten::_fused_adagrad.out -aten::_fused_adagrad_ aten::_fused_adam aten::_fused_adam.out aten::_fused_adam.tensor_lr @@ -934,8 +931,6 @@ aten::min.dim_min aten::min.unary_out aten::miopen_batch_norm aten::miopen_batch_norm.out -aten::miopen_batch_norm_backward -aten::miopen_batch_norm_backward.out aten::miopen_convolution aten::miopen_convolution.out aten::miopen_convolution_add_relu diff --git a/test/export/opinfo_schema.py b/test/export/opinfo_schema.py new file mode 100644 index 0000000000000..06e0445a5fa2a --- /dev/null +++ b/test/export/opinfo_schema.py @@ -0,0 +1,108 @@ +# Owner(s): ["oncall: export"] + +import torch +from torch._dispatch.python import enable_python_dispatcher +from torch._subclasses.schema_check_mode import SchemaCheckMode +from torch.fx.operator_schemas import normalize_function +from torch.testing._internal.common_device_type import ( + instantiate_device_type_tests, + ops, +) +from torch.testing._internal.common_methods_invocations import op_db +from torch.testing._internal.common_utils import TestCase +from torch.utils._pytree import tree_map + +# Simplified naming for C++ classes +SchemaArgument = torch._C._SchemaArgument +SchemaArgType = torch._C._SchemaArgType +SchemaInfo = torch._C._SchemaInfo + +test_classes = {} + + +class PreDispatchSchemaCheckMode(SchemaCheckMode): + """ + Dispatch mode built on top of SchemaCheckMode that checks for incorrect op schemas + for PreDispatch IR. This is meant to run ops in eager mode on concrete inputs, to + see if they incorrectly claim to be functional (aliasing or mutating). + + If an op is claimed to be functional and either is detected, an error is raised. + Errors will be silenced if the schema admits aliasing or mutation - the op may + later decompose and become functional. 
+ """ + + def __init__(self): + self._dispatch_key = torch._C.DispatchKey.PreDispatch + super().__init__() + + def _may_alias_or_mutate(self, func, types, args, kwargs): + def unwrap(e): + if isinstance(e, torch.Tensor) and not type(e) == torch.Tensor: + try: + return e.elem + except AttributeError as t: + return e + return e + + # get arguments, outputs + schema_info = SchemaInfo(func._schema) + pre_arguments = normalize_function( + func, args, kwargs, normalize_to_only_use_kwargs=True + ).kwargs + schema_info.add_argument_values(pre_arguments) + out = func(*args, **kwargs) + tuple_out = out if isinstance(out, tuple) else (out,) + tuple_out = tree_map(unwrap, tuple_out) + + # check schema + for i in range(len(func._schema.arguments)): + for j in range(len(tuple_out)): + if schema_info.may_contain_alias( + SchemaArgument(SchemaArgType.output, j), + SchemaArgument(SchemaArgType.input, i), + ): + return True + if schema_info.is_mutable( + SchemaArgument(SchemaArgType.input, i), + ): + return True + + return False + + # creating this just so we have access to the offending op + def __torch_dispatch__(self, func, types, args=(), kwargs=None): + try: + return super().__torch_dispatch__(func, types, args=args, kwargs=kwargs) + except RuntimeError as e: + # check if schema claims to be either aliasing or mutating + alias_or_mutate = self._may_alias_or_mutate(func, types, args, kwargs) + if ( + not alias_or_mutate + ): # if schema is aliasing or mutating, will decompose further + msg = e.args[0] + e.args = ( + f"""SchemaCheckMode failed with the following error on op <{func}>, meaning + this op contains aliasing or mutations, despite claiming to be functional:\n\n""" + + msg, + ) + raise e + + +class TestOpInfo(TestCase): + @ops(op_db, allowed_dtypes=(torch.float, torch.int)) + def test_schema_check_op(self, device, dtype, op): + sample_inputs_itr = op.sample_inputs(device, dtype, requires_grad=False) + inputs = next(sample_inputs_itr) + args = [inputs.input] + list(inputs.args) + kwargs = inputs.kwargs + with enable_python_dispatcher(): + with PreDispatchSchemaCheckMode(): + op.op(*args, **kwargs) + + +instantiate_device_type_tests(TestOpInfo, globals()) + +if __name__ == "__main__": + from torch._dynamo.test_case import run_tests + + run_tests() diff --git a/test/export/test_export.py b/test/export/test_export.py index 586fc403da9a9..cec463fa3dc0e 100644 --- a/test/export/test_export.py +++ b/test/export/test_export.py @@ -631,7 +631,13 @@ def forward(self, x, weight, bias): self.assertEqual(actual_result, expected_result) # TODO(yidi) - @unittest.expectedFailure + # Expected failure for test cases that calls run_decomposition(). + # The top-level cond node has pre-existing metadata, + # which overrides the metadata for operators in subgraph due to interpreter.run(), + # where cond is a single node in the interpreter.run(). And we preserve metadata + # by copying current node's metadata for all nodes created during interpreting. 
+ @testing.expectedFailurePreDispatchRunDecomp + @testing.expectedFailureRetraceability def test_export_cond_preserve_torch_fn_for_subgraphs(self): class MySubModule(torch.nn.Module): def foo(self, x): @@ -2091,6 +2097,32 @@ def forward(self, x): ): export(Module(), (torch.tensor(1, device="cpu"),)) + def test_float_conversion(self): + class Module(torch.nn.Module): + def forward(self, x): + return x.float() + + ep = export(Module(), (torch.tensor(1, dtype=torch.float),)) + ops = [] + for node in ep.graph.nodes: + if node.op == "call_function": + ops.append(node.target) + self.assertGreater(len(ops), 0) + for op in ops: + self.assertIn(op, (torch.ops.aten._to_copy.default,)) + + def test_device_to_mutation_float(self): + class Module(torch.nn.Module): + def forward(self, x): + y = x.float() + y.add_(1) + return y, x + + with self.assertRaisesRegex( + RuntimeError, "cannot mutate tensors with frozen storage" + ): + export(Module(), (torch.tensor(1, dtype=torch.float),)) + def test_module(self): class MyLinear(torch.nn.Module): def __init__(self): @@ -3301,6 +3333,23 @@ def forward(self, x): test_inp = torch.ones(8, 4) self.assertTrue(torch.allclose(ep.module()(test_inp), Foo().forward(test_inp))) + @testing.expectedFailureRetraceability + def test_runtime_assert_with_size(self): + class M(torch.nn.Module): + def forward(self, x, y): + a = x.item() + torch._check_is_size(a) + torch._check(a <= y.size(0)) + return y[:a] + + ep = export( + M(), + (torch.tensor(5), torch.ones(10)), + dynamic_shapes={"x": None, "y": {0: torch.export.Dim("t")}}, + ) + inp = (torch.tensor(6), torch.randn(13)) + self.assertTrue(torch.allclose(ep.module()(*inp), M()(*inp))) + def test_issue_113041(self): class TestModule(torch.nn.Module): def __init__(self): @@ -4873,6 +4922,31 @@ def forward(self, x): unflattened = unflatten(ep) self.assertTrue(torch.allclose(m1(*inps), unflattened(*inps))) + @testing.expectedFailureRetraceability + def test_unused_aliases(self): + class Foo(torch.nn.Module): + def __init__(self): + super().__init__() + # param + self.alpha = torch.nn.Parameter(torch.randn(4)) + self.beta = self.alpha + self.gamma = self.alpha + + def forward(self, x): + return x + self.gamma + + inps = (torch.randn(4),) + ep = export(Foo(), inps) + # placeholder nodes will be deduplicated in strict-mode, + # but check that all params still appear in state dict + for param in ["alpha", "beta", "gamma"]: + self.assertTrue(param in ep.state_dict) + + # check that they also appear in unflattened state dict + unep = unflatten(ep) + for param in ["alpha", "beta", "gamma"]: + self.assertTrue(param in unep.state_dict()) + @unittest.skipIf(not torchdynamo.is_dynamo_supported(), "dynamo isn't support") class TestOneOffModelExportResult(TestCase): diff --git a/test/export/test_unflatten.py b/test/export/test_unflatten.py index b8ff48334f011..19c55982d590d 100644 --- a/test/export/test_unflatten.py +++ b/test/export/test_unflatten.py @@ -708,6 +708,44 @@ def forward(self, input_): umod = unflatten(ep_non_strict) self.assertTrue(torch.allclose(umod(input_), mod(input_))) + def test_simple_alias(self): + # handle weight sharing, check tensor ids after unflattening + class Foo(torch.nn.Module): + def __init__(self): + super().__init__() + # alias param + self.bias = torch.nn.Parameter(torch.randn(4)) + self.m = torch.nn.Linear(4, 4) + self.m.bias = self.bias + + def forward(self, x): + return self.m(x) + self.bias + + m = Foo() + inps = (torch.randn(4, 4),) + ep = export(m, inps) + unep = unflatten(ep) + 
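The new test_runtime_assert_with_size above leans on the data-dependent-size idiom: a scalar read off a tensor with .item() is opaque to the tracer, so torch._check_is_size and torch._check supply the facts needed to slice with it. A small eager-mode sketch of the idiom (illustrative only; under export or torch.compile the same calls become assertions on the unbacked symbol rather than plain runtime checks):

import torch


def take_prefix(length: torch.Tensor, values: torch.Tensor) -> torch.Tensor:
    n = length.item()                  # data-dependent scalar; an unbacked SymInt when traced
    torch._check_is_size(n)            # promises n is a valid size (n >= 0)
    torch._check(n <= values.size(0))  # promises the slice below stays in bounds
    return values[:n]


print(take_prefix(torch.tensor(3), torch.arange(10)))  # tensor([0, 1, 2])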
self.assertTrue(id(unep.m.bias) == id(unep.bias)) + + # handle aliasing where one alias is unused + class Foo(torch.nn.Module): + def __init__(self): + super().__init__() + self.bias = torch.nn.Parameter(torch.randn(4)) + self.m = torch.nn.Linear(4, 4) + self.m.bias = ( + self.bias + ) # self.bias is unused, aliasing should be handled + + def forward(self, x): + return self.m(x) + + m = Foo() + inps = (torch.randn(4, 4),) + ep = export(m, inps) + unep = unflatten(ep) + self.assertTrue(torch.allclose(unep(*inps), m(*inps))) + if __name__ == "__main__": run_tests() diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py index 5c17b7f84d0d4..ffa71a7e905b5 100644 --- a/test/functorch/test_aotdispatch.py +++ b/test/functorch/test_aotdispatch.py @@ -103,8 +103,13 @@ class AOTTestCase(TestCase): def setUp(self): + self.prev_grad_state = torch.is_grad_enabled() super().setUp() + def tearDown(self): + torch.set_grad_enabled(self.prev_grad_state) + super().tearDown() + class TestPythonKey(AOTTestCase): def test_make_fx(self, device): diff --git a/test/functorch/test_control_flow.py b/test/functorch/test_control_flow.py index a1aeb8c1de7d3..92a988d83db39 100644 --- a/test/functorch/test_control_flow.py +++ b/test/functorch/test_control_flow.py @@ -890,6 +890,42 @@ def f(x): )(inp) self.assertEqual(gm_functional(torch.zeros(1, 2)), f(torch.zeros(1, 2))) + def test_cond_subgraph_same_shape_env_as_parent(self): + def true_fn(x): + return x.sin() + 10 + + def false_fn(x): + return x.cos() - 20 + + def f(x, pred): + y = cond(pred, true_fn, false_fn, [x]) + z = torch.add(y, y) + return z + + symbolic_traced_graph = self._check_tracing(f, (torch.ones(4), True))[ + "symbolic" + ] + graph_shape_env = symbolic_traced_graph.shape_env + + def _node_shape_env_iter(gm): + for node in symbolic_traced_graph.graph.nodes: + if node.op == "call_function": + val = node.meta.get("val") + if isinstance(val, tuple): + for v in val: + yield v.fake_mode.shape_env + else: + yield val.fake_mode.shape_env + + for shape_env in _node_shape_env_iter(symbolic_traced_graph): + self.assertTrue(shape_env is graph_shape_env) + + for shape_env in _node_shape_env_iter(symbolic_traced_graph.true_graph_0): + self.assertTrue(shape_env is graph_shape_env) + + for shape_env in _node_shape_env_iter(symbolic_traced_graph.false_graph_0): + self.assertTrue(shape_env is graph_shape_env) + def test_cond_functionalized_nested(self): def true_true_fn(x): y = x.cos() diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py index e913d90dde48c..068123a78e8c5 100644 --- a/test/functorch/test_ops.py +++ b/test/functorch/test_ops.py @@ -349,8 +349,6 @@ def is_inplace(op, variant): vjp_fail = { xfail("tensor_split"), # data_ptr composite compliance - decorate("nn.functional.batch_norm", decorator=skipIfRocm), - decorate("nn.functional.instance_norm", decorator=skipIfRocm), # https://github.com/pytorch/pytorch/issues/96560 decorate("nn.functional.scaled_dot_product_attention", decorator=skipIfRocm), } @@ -569,11 +567,6 @@ def abs_if_complex(t): xfail( "NumpyExpMarkDirtyAutogradFunction" ), # TODO: https://github.com/pytorch/pytorch/issues/91280 - # https://github.com/pytorch/pytorch/issues/96560 - # ROCm: NotImplementedError - decorate("nn.functional.batch_norm", decorator=skipIfRocm), - # ROCm: NotImplementedError - decorate("nn.functional.instance_norm", decorator=skipIfRocm), # --- Non-Contiguous Failures! 
--- # This is expected to fail as the operator # expects last dim to have stride=1 @@ -1282,9 +1275,6 @@ def test_vmapvjp(self, device, dtype, op): xfail("_native_batch_norm_legit"), # TODO: implement batching rule xfail("_batch_norm_with_update"), - # https://github.com/pytorch/pytorch/issues/96560 - # ROCm: NotImplementedError - decorate("nn.functional.instance_norm", decorator=skipIfRocm), # ---------------------------------------------------------------------- } diff --git a/test/functorch/test_vmap.py b/test/functorch/test_vmap.py index a23b51da923f9..7735594c58230 100644 --- a/test/functorch/test_vmap.py +++ b/test/functorch/test_vmap.py @@ -59,7 +59,6 @@ markDynamoStrictTest, parametrize, run_tests, - skipIfRocm, skipIfTorchDynamo, subtest, TEST_WITH_TORCHDYNAMO, @@ -4200,8 +4199,6 @@ def test(): xfail("tril"), # Exception not raised on error input xfail("triu"), # Exception not raised on error input xfail("as_strided", "partial_views"), - # https://github.com/pytorch/pytorch/issues/96560 - decorate("nn.functional.batch_norm", decorator=skipIfRocm), # RuntimeError: output with shape [4, 4] doesn't match the broadcast shape [1, 4, 4] xfail("addcdiv"), xfail("addcmul"), @@ -4375,8 +4372,6 @@ def test_vmap_exhaustive(self, device, dtype, op): xfail("linalg.lu", ""), skip("linalg.ldl_solve", ""), skip("_softmax_backward_data"), - # https://github.com/pytorch/pytorch/issues/96560 - decorate("nn.functional.batch_norm", decorator=skipIfRocm), # One or more of the overload doesn't have a Batch rule. xfail("bincount"), # RuntimeError: Expected all tensors to be on the same device, diff --git a/test/inductor/extension_backends/cpp/extension_device.cpp b/test/inductor/extension_backends/cpp/extension_device.cpp index 71f3f5919a9b2..c801f9ea06837 100644 --- a/test/inductor/extension_backends/cpp/extension_device.cpp +++ b/test/inductor/extension_backends/cpp/extension_device.cpp @@ -44,7 +44,7 @@ at::Tensor custom_to_device( at::ScalarType dtype, bool non_blocking, bool copy, - c10::optional memory_format) { + std::optional memory_format) { TORCH_CHECK(self.is_cpu() || self.device().type() == c10::DeviceType::PrivateUse1, "Dummy test only allows copy from cpu -> dummy device."); TORCH_CHECK(device.is_cpu() || device.type() == c10::DeviceType::PrivateUse1, "Dummy test only allows copy from cpu -> dummy device."); // Some dummy asserts for the basic use case: inputs are the same size / dtype, all contiguous. 
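Returning briefly to test_cond_subgraph_same_shape_env_as_parent in test/functorch/test_control_flow.py above: the recipe it uses, reading node.meta["val"].fake_mode.shape_env off traced nodes, works for any symbolically traced graph, not just cond subgraphs. A compact sketch on a plain function, assuming make_fx in symbolic mode attaches FakeTensor vals the way that test relies on:

import torch
from torch.fx.experimental.proxy_tensor import make_fx


def f(x):
    return x.sin() + 1


gm = make_fx(f, tracing_mode="symbolic")(torch.ones(4))
shape_envs = {
    node.meta["val"].fake_mode.shape_env
    for node in gm.graph.nodes
    if node.op == "call_function" and isinstance(node.meta.get("val"), torch.Tensor)
}
assert len(shape_envs) == 1  # every traced op shares the graph's single ShapeEnv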
@@ -121,11 +121,11 @@ at::Tensor custom__copy_from(const at::Tensor& self, const at::Tensor& dst, bool } at::Tensor custom_empty_memory_format(at::IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional memory_format) { constexpr c10::DispatchKeySet private_use_ks(c10::DispatchKey::PrivateUse1); return at::detail::empty_generic(size, &global_custom_alloc, @@ -134,7 +134,7 @@ at::Tensor custom_empty_memory_format(at::IntArrayRef size, memory_format); } -at::Tensor custom_empty_strided(c10::IntArrayRef size, c10::IntArrayRef stride, c10::optional dtype_opt, c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) { +at::Tensor custom_empty_strided(c10::IntArrayRef size, c10::IntArrayRef stride, std::optional dtype_opt, c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) { op_counter += 1; constexpr c10::DispatchKeySet private_use_ks(c10::DispatchKey::PrivateUse1); diff --git a/test/inductor/test_benchmark_fusion.py b/test/inductor/test_benchmark_fusion.py index c2f435e9bf94e..3970148b2747f 100644 --- a/test/inductor/test_benchmark_fusion.py +++ b/test/inductor/test_benchmark_fusion.py @@ -203,7 +203,7 @@ def setUpClass(cls): { "benchmark_kernel": True, "benchmark_fusion": True, - "benchmark_multi_templates": True, + "benchmark_epilogue_fusion": True, } ) ) @@ -231,7 +231,7 @@ def foo(m, inp): torch._dynamo.reset() with unittest.mock.patch.object( - torch._inductor.config, "benchmark_multi_templates", False + torch._inductor.config, "benchmark_epilogue_fusion", False ): foo_c = torch.compile(mode="max-autotune-no-cudagraphs")(foo) with torch.no_grad(): diff --git a/test/inductor/test_compiled_optimizers.py b/test/inductor/test_compiled_optimizers.py index a8fabacd742ec..b2d0ed91809f9 100644 --- a/test/inductor/test_compiled_optimizers.py +++ b/test/inductor/test_compiled_optimizers.py @@ -46,7 +46,6 @@ OneCycleLR, PolynomialLR, ReduceLROnPlateau, - SequentialLR, StepLR, ) @@ -73,9 +72,11 @@ StepLR: {"step_size": 1, "gamma": 100}, MultiStepLR: {"milestones": [1, 2], "gamma": 100}, ExponentialLR: {"gamma": 100}, - SequentialLR: {"schedulers": None, "milestones": [1, 2]}, CosineAnnealingLR: {"T_max": 7}, - ChainedScheduler: {"schedulers": None}, + # These schedulers have memory leaks in eager + # https://github.com/pytorch/pytorch/issues/126131 + # SequentialLR: {"schedulers": None, "milestones": [1, 2]}, + # ChainedScheduler: {"schedulers": None}, CyclicLR: {"base_lr": 0.001, "max_lr": 0.02, "cycle_momentum": False}, CosineAnnealingWarmRestarts: {"T_0": 1}, OneCycleLR: { @@ -766,6 +767,25 @@ def test_get_value_on_static_address(self): self.assertEqual(ret_val, x) + # compile a large foreach op and verify + # that the time taken is within an expected range + @requires_cuda + def test_compile_time_smoketest(self): + import time + + xs = [torch.ones(2, 2, device="cuda") for _ in range(100)] + ys = [torch.ones(2, 2, device="cuda") for _ in range(100)] + + @torch.compile + def fn(xs, ys): + return torch._foreach_add(xs, ys) + + start = time.perf_counter() + fn(xs, ys) + end = time.perf_counter() + + self.assertLess(end - start, 90) + for optim_cls, name, kwargs, scheduler_cls in COMPILED_OPT_KWARG_DB: setattr( diff --git a/test/inductor/test_cuda_cpp_wrapper.py b/test/inductor/test_cuda_cpp_wrapper.py index 42df6813c63e5..5bbe588d3a84e 
100644 --- a/test/inductor/test_cuda_cpp_wrapper.py +++ b/test/inductor/test_cuda_cpp_wrapper.py @@ -99,7 +99,6 @@ class DynamicShapesCudaWrapperCudaTests(InductorTestCase): xfail_list = [ "test_bernoulli1_cuda", # cpp fallback op naming issue "test_profiler_mark_wrapper_call_cuda", - "test_randint_cuda", "test_scaled_dot_product_attention_cuda_dynamic_shapes", ] for test_name in xfail_list: diff --git a/test/inductor/test_cudagraph_trees.py b/test/inductor/test_cudagraph_trees.py index c8877d4a8e978..e0e1395c4908b 100644 --- a/test/inductor/test_cudagraph_trees.py +++ b/test/inductor/test_cudagraph_trees.py @@ -1629,9 +1629,13 @@ def test_incompatible_cudagraph_ops_item(self): def foo(x): return x.item() + # NB: This doesn't work with float, because float unbacked codegen + # is currently broken. But testing the float case here is also + # awkward, because we plan to Tensor-ify the float compute, and as + # a result we'd actually expect this to work with cuda graphs! with capture_stderr() as captured_output: - self.assertEqual(foo(torch.tensor(3.0, device="cuda")), 3.0) - self.assertEqual(foo(torch.tensor(6.0, device="cuda")), 6.0) + self.assertEqual(foo(torch.tensor(3, device="cuda")), 3) + self.assertEqual(foo(torch.tensor(6, device="cuda")), 6) # NOTE: this test is named after incompatible ops, but is not skipping due to incompatible ops. # This should get fixed. diff --git a/test/inductor/test_flex_attention.py b/test/inductor/test_flex_attention.py index 37461bc2c50a8..9df905d2ad547 100644 --- a/test/inductor/test_flex_attention.py +++ b/test/inductor/test_flex_attention.py @@ -126,6 +126,19 @@ def score_mod(score, b, h, m, n): class TestTemplatedSDPA(InductorTestCase): + def _check_equal(self, golden_out, ref_out, compiled_out, dtype): + compiled_error = (golden_out - compiled_out).abs().mean() + ref_error = (golden_out - ref_out).abs().mean() + # Note, it seems like we really are less accurate than the float32 + # computation, likely due to the online softmax + if dtype == torch.float32: + fudge_factor = 10.0 + else: + fudge_factor = 1.1 + if compiled_error > ref_error * fudge_factor: + msg = f"Compiled error {compiled_error} is greater than ref error {ref_error} by more than {fudge_factor}X." + self.assertTrue(False, msg) + def run_test( self, score_mod: Callable, @@ -145,18 +158,114 @@ def run_test( ) ref_out = sdpa_partial(q, k, v) compiled_out = compiled_sdpa(q, k, v) + self._check_equal(golden_out, ref_out, compiled_out, dtype) - compiled_error = (golden_out - compiled_out).abs().mean() - ref_error = (golden_out - ref_out).abs().mean() - # Note, it seems like we really are less accurate than the float32 - # computation, likely due to the online softmax - if dtype == torch.float32: - fudge_factor = 10.0 - else: - fudge_factor = 1.1 - if compiled_error > ref_error * fudge_factor: - msg = f"Compiled error {compiled_error} is greater than ref error {ref_error} by more than {fudge_factor}X." 
- self.assertTrue(False, msg) + def run_dynamic_test( + self, + score_mod: Callable, + dtype: torch.dtype = torch.float16, + B: int = B, + H: int = H, + S: int = S, + D: int = D, + ): + sdpa_partial = create_attention(score_mod) + # The first eager batch, shape (B, H, S, D) + q1 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + k1 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + v1 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + golden_out1 = sdpa_partial( + q1.to(torch.float64), k1.to(torch.float64), v1.to(torch.float64) + ) + ref_out1 = sdpa_partial(q1, k1, v1) + + # The second eager batch, shape (B * 2, H, S / 2, D) + B = int(B * 2) + S = int(S / 2) + q2 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + k2 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + v2 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + golden_out2 = sdpa_partial( + q2.to(torch.float64), k2.to(torch.float64), v2.to(torch.float64) + ) + ref_out2 = sdpa_partial(q2, k2, v2) + + # Need to clear dynamo counters, since flex attention eager mode also uses dynamo tracing. + # We check dynamo counters["frames"]["ok"] to ensure there is no re-compilation. + torch._dynamo.reset() + # Compiling with dynamic shape in the first batch. + compiled_sdpa = torch.compile(sdpa_partial, dynamic=True) + compiled_out1 = compiled_sdpa(q1, k1, v1) + self._check_equal(golden_out1, ref_out1, compiled_out1, dtype) + self.assertEqual(torch._dynamo.utils.counters["frames"]["ok"], 1) + + # No re-compilation, use the compiled dynamic shape version. + compiled_out2 = compiled_sdpa(q2, k2, v2) + self._check_equal(golden_out2, ref_out2, compiled_out2, dtype) + self.assertEqual(torch._dynamo.utils.counters["frames"]["ok"], 1) + + def run_automatic_dynamic_test( + self, + score_mod: Callable, + dtype: torch.dtype = torch.float16, + B: int = B, + H: int = H, + S: int = S, + D: int = D, + ): + sdpa_partial = create_attention(score_mod) + # The first eager batch, shape (B, H, S, D) + q1 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + k1 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + v1 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + golden_out1 = sdpa_partial( + q1.to(torch.float64), k1.to(torch.float64), v1.to(torch.float64) + ) + ref_out1 = sdpa_partial(q1, k1, v1) + + # The second eager batch, shape (B * 2, H, S / 2, D) + B = int(B * 2) + S = int(S / 2) + q2 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + k2 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + v2 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + golden_out2 = sdpa_partial( + q2.to(torch.float64), k2.to(torch.float64), v2.to(torch.float64) + ) + ref_out2 = sdpa_partial(q2, k2, v2) + + # The third eager batch, shape (B * 4, H, S / 4, D) + B = int(B * 2) + S = int(S / 2) + q3 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + k3 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + v3 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + golden_out3 = sdpa_partial( + q3.to(torch.float64), k3.to(torch.float64), v3.to(torch.float64) + ) + ref_out3 = sdpa_partial(q3, k3, v3) + + # Need to clear dynamo counters, since flex attention eager mode also uses dynamo tracing. + # We check dynamo counters["frames"]["ok"] to ensure: + # 1, the first batch is compiled with static shape + # 2, the second batch is compiled with dynamic shape + # 3, no re-compilation in the third batch + torch._dynamo.reset() + # The first batch. 
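The counter checks in run_dynamic_test and run_automatic_dynamic_test above generalize beyond flex attention: torch._dynamo.utils.counters["frames"]["ok"] increments once per successful compile, so it distinguishes "reused the dynamic graph" from "recompiled". A self-contained sketch of the same bookkeeping, assuming the default automatic-dynamic-shapes behavior (the first-batch compile of the test resumes just below):

import torch
import torch._dynamo
from torch._dynamo.utils import counters

torch._dynamo.reset()  # clear compile caches and counters


@torch.compile(backend="eager")
def g(x):
    return x.sin() + 1


g(torch.randn(4, 8))
assert counters["frames"]["ok"] == 1   # first call: static compile
g(torch.randn(8, 4))
assert counters["frames"]["ok"] == 2   # new sizes: one automatic-dynamic recompile
g(torch.randn(16, 2))
assert counters["frames"]["ok"] == 2   # further sizes reuse the dynamic graph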
+ compiled_sdpa = torch.compile(sdpa_partial) + compiled_out1 = compiled_sdpa(q1, k1, v1) + self._check_equal(golden_out1, ref_out1, compiled_out1, dtype) + self.assertEqual(torch._dynamo.utils.counters["frames"]["ok"], 1) + + # The second batch (automatic dynamic). + compiled_out2 = compiled_sdpa(q2, k2, v2) + self._check_equal(golden_out2, ref_out2, compiled_out2, dtype) + self.assertEqual(torch._dynamo.utils.counters["frames"]["ok"], 2) + + # The third batch (no re-compilation). + compiled_out3 = compiled_sdpa(q3, k3, v3) + self._check_equal(golden_out3, ref_out3, compiled_out3, dtype) + self.assertEqual(torch._dynamo.utils.counters["frames"]["ok"], 2) @supported_platform @common_utils.parametrize("dtype", test_dtypes) @@ -164,6 +273,20 @@ def run_test( def test_builtin_score_mods(self, dtype: torch.dtype, score_mod: Callable): self.run_test(score_mod, dtype) + @supported_platform + @common_utils.parametrize("dtype", test_dtypes) + @common_utils.parametrize("score_mod", test_score_mods) + def test_builtin_score_mods_dynamic(self, dtype: torch.dtype, score_mod: Callable): + self.run_dynamic_test(score_mod, dtype) + + @supported_platform + @common_utils.parametrize("dtype", test_dtypes) + @common_utils.parametrize("score_mod", test_score_mods) + def test_builtin_score_mods_automatic_dynamic( + self, dtype: torch.dtype, score_mod: Callable + ): + self.run_automatic_dynamic_test(score_mod, dtype) + @supported_platform @common_utils.parametrize("dtype", test_dtypes) def test_skip_odd_keys(self, dtype: torch.dtype): @@ -289,7 +412,51 @@ def natten_mask(score, b, h, q, kv): self.run_test(natten_mask, dtype) @supported_platform - @expectedFailure + @common_utils.parametrize("dtype", test_dtypes_fast) + def test_subgraph_respect_decompostion(self, dtype): + from torch._decomp import core_aten_decompositions + from torch.fx.experimental.proxy_tensor import make_fx + + def score_mod_func(score, b, h, q, kv): + return score - q // (1 + kv) + + make_tensor = functools.partial( + torch.randn, + (2, 2, 8, 4), + device="cuda", + dtype=torch.float64, + requires_grad=True, + ) + query, key, value = make_tensor(), make_tensor(), make_tensor() + # floor_div is not decomposed in decompostion_table is empty + gm = make_fx(_flex_attention, decomposition_table={})( + query, key, value, score_mod_func + ) + self.assertExpectedInline( + gm.sdpa_score0.code.strip(), + """\ +def forward(self, arg0_1, arg1_1, arg2_1, arg3_1, arg4_1): + add = torch.ops.aten.add.Tensor(arg4_1, 1); arg4_1 = None + floor_divide = torch.ops.aten.floor_divide.default(arg3_1, add); arg3_1 = add = None + sub = torch.ops.aten.sub.Tensor(arg0_1, floor_divide); arg0_1 = floor_divide = None + return sub""", + ) + + # floor_div is decomposed for core_aten_decompositions + gm = make_fx(_flex_attention, decomposition_table=core_aten_decompositions())( + query, key, value, score_mod_func + ) + self.assertExpectedInline( + gm.sdpa_score0.code.strip(), + """\ +def forward(self, arg0_1, arg1_1, arg2_1, arg3_1, arg4_1): + add = torch.ops.aten.add.Tensor(arg4_1, 1); arg4_1 = None + div = torch.ops.aten.div.Tensor_mode(arg3_1, add, rounding_mode = 'floor'); arg3_1 = add = None + sub = torch.ops.aten.sub.Tensor(arg0_1, div); arg0_1 = div = None + return sub""", + ) + + @supported_platform @common_utils.parametrize("dtype", test_dtypes_fast) def test_silu_on_score(self, dtype): def silu_score(score, b, h, q, kv): diff --git a/test/inductor/test_max_autotune.py b/test/inductor/test_max_autotune.py index c8622de6faf8f..c5f0afa118f87 100644 --- 
a/test/inductor/test_max_autotune.py +++ b/test/inductor/test_max_autotune.py @@ -622,6 +622,82 @@ def f(x, weight): def test_empty_conv_input_with_1x1_kernel(self): self.test_empty_conv_input(kernel_size=1) + def test_non_contiguous_input_mm(self): + """ + Make sure the triton template can work with non-contiguous inputs without crash. + Check https://github.com/pytorch/pytorch/issues/125437 for more details. + """ + x = torch.empty_strided( + (50257, 32768), (1, 50304), dtype=torch.bfloat16, device="cuda" + ) + y = torch.empty_strided( + (32768, 768), (768, 1), dtype=torch.bfloat16, device="cuda" + ) + + @torch.compile(mode="max-autotune") + def f(x, y): + return x @ y + + ref = x @ y + act = f(x, y) + self.assertTrue(torch.allclose(ref, act, atol=4 * 1e-3, rtol=4 * 1e-3)) + + def test_non_contiguous_input_addmm(self): + b = torch.empty((768), dtype=torch.bfloat16, device="cuda") + x = torch.empty_strided( + (50257, 32768), (1, 50304), dtype=torch.bfloat16, device="cuda" + ) + y = torch.empty_strided( + (32768, 768), (768, 1), dtype=torch.bfloat16, device="cuda" + ) + + @torch.compile(mode="max-autotune") + def f(x, y): + return torch.addmm(b, x, y) + + ref = torch.addmm(b, x, y) + act = f(x, y) + self.assertTrue(torch.allclose(ref, act, atol=4 * 1e-3, rtol=4 * 1e-3)) + + def test_non_contiguous_input_bmm(self): + x = torch.empty_strided( + (1, 50257, 32768), (0, 1, 50304), dtype=torch.bfloat16, device="cuda" + ) + y = torch.empty_strided( + (1, 32768, 768), (0, 768, 1), dtype=torch.bfloat16, device="cuda" + ) + + @torch.compile(mode="max-autotune") + def f(x, y): + return torch.bmm(x, y) + + ref = torch.bmm(x, y) + act = f(x, y) + self.assertTrue(torch.allclose(ref, act, atol=4 * 1e-3, rtol=4 * 1e-3)) + + def test_non_contiguous_input_mm_plus_mm(self): + x1 = torch.empty_strided( + (50257, 32768), (1, 50304), dtype=torch.bfloat16, device="cuda" + ) + y1 = torch.empty_strided( + (32768, 768), (768, 1), dtype=torch.bfloat16, device="cuda" + ) + + x2 = torch.empty_strided( + (50257, 32768), (1, 50304), dtype=torch.bfloat16, device="cuda" + ) + y2 = torch.empty_strided( + (32768, 768), (768, 1), dtype=torch.bfloat16, device="cuda" + ) + + @torch.compile(mode="max-autotune") + def f(x1, y1, x2, y2): + return x1 @ y1 + x2 @ y2 + + ref = x1 @ y1 + x2 @ y2 + act = f(x1, y1, x2, y2) + self.assertTrue(torch.allclose(ref, act, atol=4 * 1e-3, rtol=4 * 1e-3)) + class TestBenchmarkRequest(BenchmarkRequest): def __init__( diff --git a/test/inductor/test_pad_mm.py b/test/inductor/test_pad_mm.py index 5aeb4d01edbd0..b16e5e5d62edf 100644 --- a/test/inductor/test_pad_mm.py +++ b/test/inductor/test_pad_mm.py @@ -5,10 +5,15 @@ import torch._inductor.config as inductor_config from torch._dynamo.testing import rand_strided -from torch._inductor.fx_passes.pad_mm import get_alignment_size, get_padded_length +from torch._inductor.fx_passes.pad_mm import ( + get_alignment_size, + get_pad_cache, + get_padded_length, + should_pad_common, +) from torch._inductor.test_case import run_tests, TestCase -from torch._inductor.utils import run_and_get_code +from torch._inductor.utils import fresh_inductor_cache, run_and_get_code from torch.testing import FileCheck from torch.testing._internal.inductor_utils import HAS_CUDA @@ -125,7 +130,7 @@ def forward(self, a, b): b = rand_strided((K, N), (1, K), device="cuda", dtype=torch.float32) # TODO: Getting the alignment right requires pattern matcher to # run on newly added nodes - aligned_m = get_padded_length(M, get_alignment_size(a)) + M - 3 + aligned_m = get_padded_length(M, 
get_alignment_size(a)) + M torch._dynamo.mark_dynamic(a, 1) torch._dynamo.mark_dynamic(b, 0) with unittest.mock.patch( @@ -312,6 +317,103 @@ def forward(self, a, b, c): FileCheck().check(f"K = {K}").run(code) self.assertEqual(res1, res2) + @inductor_config.patch(force_shape_pad=True) + def test_pad_single_cat(self): + @torch.compile() + def foo(x, y): + return x @ y + + inps = [torch.rand([5, 5], device="cuda") for _ in range(2)] + out = foo(*inps) + self.assertEqual(out, inps[0] @ inps[1]) + + @inductor_config.patch(force_shape_pad=True) + @fresh_inductor_cache() + def test_pad_addmm_2d_bias(self): + @torch.compile() + def foo(input, x, y): + return torch.ops.aten.addmm(input, x, y) + + for a in [1, 4]: + for b in [1, 6]: + inps = ( + torch.rand([a, b], device="cuda"), + torch.rand([4, 5], device="cuda"), + torch.rand([5, 6], device="cuda"), + ) + out = foo(*inps) + out_eager = torch.ops.aten.addmm(*inps) + self.assertEqual(out, out_eager) + + for a in [1, 6]: + inps = ( + torch.rand([a], device="cuda"), + torch.rand([4, 5], device="cuda"), + torch.rand([5, 6], device="cuda"), + ) + out = foo(*inps) + out_eager = torch.ops.aten.addmm(*inps) + self.assertEqual(out, out_eager) + + @inductor_config.patch(force_shape_pad=True) + def test_pad_batch(self): + m = 6 + n = 9 + k = 11 + batch_size = 3 + mat1 = torch.ones((batch_size, m, k), device="cuda", dtype=torch.float16) + mat2 = torch.ones((batch_size, k, n), device="cuda", dtype=torch.float16) + expected_alignment = get_alignment_size(mat1) + + assert expected_alignment == 8, "Alignment for float16 should be 8" + assert should_pad_common( + mat1, mat2 + ), "This should pass the common padding criteria" + + @torch.compile() + def bmm(mat1, mat2): + return torch.bmm(mat1, mat2) + + res2, (code,) = run_and_get_code(bmm, mat1, mat2) + bmm_expected_result = torch.bmm(mat1, mat2) + # in call code, expect to see a single pad per input, and then we should see padded allocation for output + FileCheck().check("del async_compile").check_count( + ".run(", 2, exactly=True + ).check("empty_strided_cuda((3, 8, 16)").run(code) + + assert torch.allclose( + res2, bmm_expected_result + ), "BMM results are not identical" + + @fresh_inductor_cache() + def test_exclude_padding(self): + @torch.compile() + def mm(a, b): + return a @ b + + mm(torch.rand([25, 25], device="cuda"), torch.rand([25, 25], device="cuda")) + local_cache = get_pad_cache().get_local_cache() + self.assertTrue(len(local_cache) == 2) + FileCheck().check_count("exclude_pad:False", 2, exactly=True).run( + repr(local_cache) + ) + + @torch.compile() + def mm(a, b): + return (a + 1) @ b + + mm(torch.rand([25, 25], device="cuda"), torch.rand([25, 25], device="cuda")) + local_cache = get_pad_cache().get_local_cache() + # reuse original base timing + self.assertTrue(len(local_cache) == 3) + + FileCheck().check_count("exclude_pad:False", 3, exactly=True).run( + repr(local_cache) + ) + FileCheck().check_count("exclude_pad:True", 1, exactly=True).run( + repr(local_cache) + ) + if __name__ == "__main__": if HAS_CUDA: diff --git a/test/inductor/test_padding.py b/test/inductor/test_padding.py index 7aef585842e61..e08ac285801d7 100644 --- a/test/inductor/test_padding.py +++ b/test/inductor/test_padding.py @@ -7,9 +7,9 @@ import torch from torch import nn, Tensor +from torch._dynamo.convert_frame import maybe_cprofile from torch._dynamo.test_case import run_tests, TestCase from torch._dynamo.testing import rand_strided, reduce_to_scalar_loss -from torch._dynamo.utils import maybe_cprofile from torch._inductor 
import config, ir, metrics from torch._inductor.fx_passes import pad_mm as pad_mm_pass from torch._inductor.runtime.runtime_utils import do_bench diff --git a/test/inductor/test_pattern_matcher.py b/test/inductor/test_pattern_matcher.py index fc1a9a5ec507d..cc7c3f7084c88 100644 --- a/test/inductor/test_pattern_matcher.py +++ b/test/inductor/test_pattern_matcher.py @@ -6,6 +6,7 @@ import torch import torch._dynamo.config as dynamo_config import torch._inductor.config as inductor_config +import torch.nn.functional as F from torch._dynamo.utils import count_calls, counters from torch._higher_order_ops.out_dtype import out_dtype from torch._inductor.fx_passes import joint_graph @@ -28,6 +29,7 @@ from torch.testing._internal.common_cuda import SM80OrLater from torch.testing._internal.common_utils import IS_LINUX, skipIfRocm from torch.testing._internal.inductor_utils import HAS_CUDA +from torch.utils import _pytree as pytree class TestPatternMatcher(TestCase): @@ -38,15 +40,22 @@ def common( expected_matches, expected_nodes, additional_check=lambda code: None, + reference_in_float=False, ): counters.clear() torch.manual_seed(42) - expected = fn(*args) + if reference_in_float: + ref_inputs = pytree.tree_map_only( + torch.Tensor, lambda x: x.to(torch.float32), args + ) + else: + ref_inputs = args + expected = fn(*ref_inputs) torch.manual_seed(42) actual, codes = run_and_get_code(torch.compile(fn), *args) if len(codes) == 1: codes = codes[0] - torch.testing.assert_close(actual, expected) + torch.testing.assert_close(actual, expected, check_dtype=not reference_in_float) self.assertEqual( counters["inductor"]["pattern_matcher_count"], expected_matches @@ -1170,6 +1179,46 @@ def fn1(a, b): stable_topological_sort(graph) self.assertEqual(list(graph.nodes), [b, a, c]) + def test_scaled_softmax(self): + def mul_softmax(a, b): + return F.softmax(a * b, dim=0) + + def div_softmax(x, inv_scale): + return F.softmax(x / inv_scale, dim=0) + + x = torch.randn(10, 10) + scale = 1e6 + inv_scale = 1 / scale + self.common(mul_softmax, (x, scale), 1, 3) + self.common(mul_softmax, (scale, x), 1, 3) + self.common(div_softmax, (x, inv_scale), 1, 3) + + scale = torch.randn(10) * 1e6 + inv_scale = 1 / scale + self.common(mul_softmax, (x, scale), 1, 3) + self.common(mul_softmax, (scale, x), 1, 3) + self.common(div_softmax, (x, inv_scale), 1, 3) + + scale = torch.randn(1, 10) * 1e6 + inv_scale = 1 / scale + self.common(mul_softmax, (x, scale), 1, 3) + self.common(mul_softmax, (scale, x), 1, 3) + self.common(div_softmax, (x, inv_scale), 1, 3) + + # Test matching with type promotion + x = torch.randn(10, 10, dtype=torch.bfloat16) + scale = torch.randn(10, dtype=torch.bfloat16) * 1e6 + inv_scale = 1 / scale + self.common(mul_softmax, (x, scale), 1, 4, reference_in_float=True) + self.common(mul_softmax, (scale, x), 1, 4, reference_in_float=True) + self.common(div_softmax, (x, inv_scale), 1, 4, reference_in_float=True) + + # No match if scale changes in softmax dim + scale = torch.randn(10, 10) + self.common(mul_softmax, (x, scale), 0, 0) + self.common(mul_softmax, (scale, x), 0, 0) + self.common(div_softmax, (x, scale), 0, 0) + if __name__ == "__main__": if IS_LINUX and HAS_CUDA: diff --git a/test/inductor/test_perf.py b/test/inductor/test_perf.py index c4394b3964865..09e913350e143 100644 --- a/test/inductor/test_perf.py +++ b/test/inductor/test_perf.py @@ -240,9 +240,8 @@ def f(a, b): def f(a, b): return torch.cat([torch.softmax(a, dim=-1), torch.softmax(b, dim=-1)]).cos() - # potentially beneficial to fuse but we exclude 
reductions from pointwise cat inp = (T(10, 10), T(10, 10)) - self.assertExpectedInline(count_numel(f, *inp), """800""") + self.assertExpectedInline(count_numel(f, *inp), """680""") # Should turn into pointwise even if only some of inputs are pointwise. def f(a, b): @@ -267,6 +266,13 @@ def f(a, b): inp = (T(10, 10), T(10, 10)) self.assertExpectedInline(count_numel(f, *inp), """400""") + def f(a, b): + b = b.cos() + return torch.cat([a, b]) + + inp = (T(10, 10), T(10, 10)) + self.assertExpectedInline(count_numel(f, *inp), """400""") + @patch.object(config, "split_cat_fx_passes", False) @patch.object( config, diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py index c74f776c1527a..3a7b66d660658 100644 --- a/test/inductor/test_torchinductor.py +++ b/test/inductor/test_torchinductor.py @@ -17,7 +17,9 @@ import time import typing import unittest +import unittest.mock import weakref +from pathlib import Path from typing import Tuple from unittest.mock import patch @@ -40,6 +42,9 @@ from torch._inductor.test_case import TestCase as InductorTestCase from torch._inductor.utils import ( add_scheduler_init_hook, + aoti_compile_with_persistent_cache, + aoti_eager_cache_dir, + load_aoti_eager_cache, run_and_get_code, run_and_get_triton_code, ) @@ -81,7 +86,6 @@ from torch.utils import _pytree as pytree from torch.utils._python_dispatch import TorchDispatchMode from torch.utils._pytree import tree_flatten, tree_unflatten -from torch.utils._triton import has_triton from torch.utils.weak import WeakTensorKeyDictionary DO_PERF_TEST = os.environ.get("DO_PERF_TEST") == "1" @@ -761,6 +765,102 @@ def fn(a, b): ), ) + @skipCUDAIf(not SM80OrLater, "Requires sm80") + def test_eager_aoti_cache_hit(self): + ns = "aten" + op_name = "abs" + dispatch_key = "CPU" + device = "cpu" + if self.device.lower() == "cuda": + dispatch_key = "CUDA" + device = "cuda" + + input_tensor = torch.randn(128, dtype=torch.float, device=device) + kernel_lib_path = aoti_compile_with_persistent_cache( + ns, + op_name, + device, + False, + getattr(torch.ops.aten, op_name), + (input_tensor,), + {}, + ) + self.assertTrue(Path(kernel_lib_path).exists()) + + from unittest import mock + + # Patch the aoti_compile_with_persistent_cache as None to ensure no new kernel is generated + with mock.patch( + "torch._inductor.utils.aoti_compile_with_persistent_cache", None + ): + qualified_op_name = f"{ns}::{op_name}" + _, overload_names = torch._C._jit_get_operation(qualified_op_name) + + with _scoped_library("aten", "IMPL") as torch_compile_op_lib_impl: + # Get ref result from eager + ref_value = getattr(torch.ops.aten, op_name)(input_tensor) + + for overload_name in overload_names: + try: + reg_op_name = qualified_op_name + schema = torch._C._get_schema(qualified_op_name, overload_name) + if schema.overload_name: + reg_op_name = f"{qualified_op_name}.{schema.overload_name}" + torch_compile_op_lib_impl._impl_with_aoti_compile( # noqa: F821 + reg_op_name, dispatch_key + ) + except Exception as e: + continue + + # Invoke the pre-compiled kernel and get result. 
+ res_value = getattr(torch.ops.aten, op_name)(input_tensor) + + self.assertEqual(ref_value, res_value) + + @skipCUDAIf(not SM80OrLater, "Requires sm80") + def test_aoti_compile_with_persistent_cache(self): + def fn(a): + return torch.abs(a) + + ns = "aten" + op_name = "abs" + + device = "cpu" + if self.device.lower() == "cuda": + device = "cuda" + + input_tensor = torch.randn(128, dtype=torch.float, device=device) + kernel_lib_path = aoti_compile_with_persistent_cache( + ns, + op_name, + input_tensor.device.type, + False, + fn, + args=(input_tensor,), + kwargs={}, + ) + self.assertTrue(len(kernel_lib_path) > 0) + + device_kernel_cache = aoti_eager_cache_dir(ns, device) + kernel_conf = device_kernel_cache / f"{op_name}.json" + self.assertTrue(kernel_conf.exists()) + + json_data = load_aoti_eager_cache("aten", "abs", input_tensor.device.type) + self.assertTrue(json_data is not None) + self.assertTrue(isinstance(json_data, list)) + self.assertTrue(len(json_data) > 0) + + op_info = json_data[0] + self.assertTrue(isinstance(op_info, dict)) + self.assertTrue("meta_info" in op_info) + self.assertTrue("kernel_path" in op_info) + kernel_libs_abs_path = [] + for item in json_data: + kernel_path = device_kernel_cache / item["kernel_path"] + kernel_libs_abs_path.append(kernel_path.as_posix()) + + self.assertTrue(kernel_lib_path in kernel_libs_abs_path) + @skipCUDAIf(not SM80OrLater, "Requires sm80") def test_torch_compile_override_registration(self): dynamic = False @@ -1105,6 +1205,22 @@ def repeat(x, n): self.assertEqual(expect, actual) self.assertEqual(actual, repeat(x, 3)) + def test_index_propagation_abs(self): + def reflection_pad_left(x, n): + # e.g. x=[1, 2, 3], n=2 => returns [3, 2, 1, 2, 3] + i = torch.arange(x.shape[0] + n, device=x.device) + return x[(i - n).abs()] + + x = torch.randn(8, device=self.device) + opt_fn = torch._dynamo.optimize("inductor")(reflection_pad_left) + + # this should be collapsed to direct indexing + actual = _run_and_assert_no_indirect_indexing( + self, opt_fn, x, 3, has_wrapping=False + ) + expect = reflection_pad_left(x, 3) + self.assertEqual(expect, actual) + @skipIfRocm @config.patch(debug_index_asserts=False) def test_neg_index(self): @@ -2222,14 +2338,74 @@ def fn_int_input(a, i): def test_div_precision(self): # Reproducer for https://github.com/pytorch/pytorch/issues/101039 - def forward(y): - z = y.div(1e-06) + def forward(x, y): + z = x.div(y) return F.softmax(z, dim=-1) query = torch.randn(1, 10, 40) key = torch.randn(1, 2, 40) - y = torch.matmul(query, key.transpose(-2, -1)) - self.common(forward, (y,)) + x = torch.matmul(query, key.transpose(-2, -1)) + self.common(forward, (x, 1e-6)) + + x = torch.tensor( + [ + [ + [ + [-16.1649, 5.6846, -5.1022, -9.1134], + [-11.5552, -2.2615, -12.8913, 10.6538], + [-7.1666, -5.3333, 2.0776, -9.7984], + [7.4469, -2.3948, 2.7371, 0.9201], + ], + [ + [-8.0361, -16.3771, 22.7741, 4.4685], + [20.8047, -0.7771, -2.4355, -2.2299], + [3.8343, -2.0914, -2.4077, 2.2740], + [-15.8663, -2.7015, -12.5241, -3.0040], + ], + [ + [-2.5139, 14.4393, -3.7186, 1.2255], + [5.6742, 14.1842, -8.5976, 16.8366], + [-9.7358, -3.0279, 11.8164, -4.0787], + [-9.0621, 8.2580, 29.9486, -2.4107], + ], + [ + [7.3622, 12.5640, -20.5592, 13.6237], + [-11.5640, 0.8832, 16.7275, -2.5009], + [-2.0953, -12.2276, -26.2633, 4.5268], + [15.3329, -11.7492, 6.5650, -9.2483], + ], + ], + [ + [ + [7.9980, -4.9369, 3.1508, 5.2994], + [3.8052, 3.9514, 8.4987, -10.5045], + [-2.6827, -4.0010, -4.0611, 6.4091], + [-19.0318, 6.4073, 2.8923, 8.0250], + ], + [ + 
[7.1650, -3.4585, 5.7720, -5.0305], + [-0.9765, -3.0086, 11.7114, 8.0555], + [-3.1027, -3.5514, 9.6182, -8.8526], + [-9.2348, -6.0239, 6.2528, -6.7221], + ], + [ + [11.5936, 22.4139, -0.4089, -4.9889], + [14.8217, -2.3426, -17.6189, 3.7427], + [1.9546, -13.0902, 8.6293, -7.2457], + [-7.6900, -4.5796, 9.6332, -10.2631], + ], + [ + [0.8027, -1.0955, 14.8404, -0.2673], + [3.2143, -1.8640, -2.9678, 6.5165], + [-3.9865, 6.5230, 6.3019, -0.4247], + [8.3185, -13.5076, 27.0986, -1.6792], + ], + ], + ] + ) + x = torch.matmul(x, x) + y = torch.tensor([[[0.6331]], [[1.6358]], [[-0.3459]], [[1.0196]]]) + self.common(forward, (x, y)) def test_div_by_zero(self): def fn(x, runtime_zero, runtime_neg_zero): @@ -5949,6 +6125,7 @@ def fn(a, b): (a, b), ) + @skipIfXpu def test_nll_loss_backward(self): def fn(a, b, c): return aten.nll_loss_backward( @@ -9786,6 +9963,7 @@ def fn(n): res = torch.compile(fn)(20) self.assertTrue(torch.all((0 <= res) & (res < 10)).item()) + @torch._inductor.config.patch(force_shape_pad=True) def test_should_pad_bench_for_bmm(self): B = 2 M = 1024 @@ -9795,25 +9973,9 @@ def test_should_pad_bench_for_bmm(self): mat1 = torch.rand(B, M, K, device=self.device) mat2 = torch.rand(B, K, N, device=self.device) - def return_true(*args, **kwargs): - return True - - # return value of is_mm_compute_bound depends on flops and membw of - # the GPU. Mock it so the test does not becomes flaky when running - # on different GPUs. - patch1 = patch.object(pad_mm, "is_mm_compute_bound", return_true) - # mock get_cached_should_pad so the test does not rely on benchmarking - # result. - patch2 = patch.object(pad_mm, "get_cached_should_pad", return_true) + should_pad = pad_mm.should_pad_bench(None, mat1, mat2, torch.ops.aten.bmm) - with patch1, patch2: - should_pad = pad_mm.should_pad_bench(mat1, mat2, torch.ops.aten.bmm) - - if has_triton(): - self.assertTrue(should_pad) - else: - # should_pad_bench always returns False if has_triton returns False - self.assertFalse(should_pad) + self.assertTrue(should_pad) @parametrize( "name, op", diff --git a/test/inductor/test_torchinductor_dynamic_shapes.py b/test/inductor/test_torchinductor_dynamic_shapes.py index c5200a6014241..9ee63752f8e0d 100644 --- a/test/inductor/test_torchinductor_dynamic_shapes.py +++ b/test/inductor/test_torchinductor_dynamic_shapes.py @@ -315,6 +315,7 @@ def f(x, r): f(torch.tensor([3], device=device), torch.randn(10, device=device)) + @unittest.expectedFailure @torch._dynamo.config.patch( capture_scalar_outputs=True, capture_dynamic_output_shape_ops=True ) diff --git a/test/profiler/test_profiler.py b/test/profiler/test_profiler.py index 87df89f04d574..c9591a747d77e 100644 --- a/test/profiler/test_profiler.py +++ b/test/profiler/test_profiler.py @@ -16,9 +16,12 @@ import collections import gc import json +import mmap import os import pickle +import random import re +import struct import subprocess import sys import threading @@ -64,7 +67,9 @@ from torch.testing._internal.common_device_type import skipCUDAVersionIn from torch.testing._internal.common_utils import ( instantiate_parametrized_tests, + IS_ARM64, IS_JETSON, + IS_LINUX, IS_WINDOWS, parametrize, run_tests, @@ -1216,6 +1221,26 @@ def test_profiler_op_event_args(self): f"Failed finding record funciont for op = {e}", ) + def test_profiler_strides(self): + torch._C._profiler._set_record_concrete_inputs_enabled_val(True) + base_tensor = torch.randn(1024, dtype=torch.float32) + a = base_tensor.as_strided((16, 16), (17, 1), 0) + b = base_tensor.as_strided((16, 16), (25, 2), 272) + with 
_profile(record_shapes=True) as prof: + c = torch.add(a, b) + + with TemporaryFileName(mode="w+") as fname: + prof.export_chrome_trace(fname) + with open(fname) as f: + j = json.load(f) + op_events = [ + e for e in j["traceEvents"] if e.get("cat", "") == "cpu_op" + ] + for e in op_events: + args = e["args"] + if e["name"] == "aten::add": + self.assertEqual(args["Input Strides"], [[17, 1], [25, 2], []]) + def test_profiler_fwd_bwd_link(self): with _profile(use_kineto=True) as prof: t1, t2 = torch.ones(1, requires_grad=True), torch.ones( @@ -2416,6 +2441,70 @@ def test_profiler_pattern_matcher_json_report(self): finally: os.remove("torchtidy_report.json") + @unittest.skipIf(IS_ARM64 or not IS_LINUX, "x86 linux only cpp unwinding") + def test_fuzz_symbolize(self): + # generate some random addresses in the text section and make sure the + # symbolizers do not throw exceptions/crash + def get_text_sections(): + text_sections = [] + seen = set() + for filename in os.listdir("/proc/self/map_files"): + library = os.readlink("/proc/self/map_files/" + filename) + if ".so" not in library or library in seen: + continue + seen.add(library) + with open(os.path.join("/proc/self/map_files", library), "rb") as f: + mm = mmap.mmap(f.fileno(), 0, prot=mmap.PROT_READ) + + def unpack(fmt, offset): + return struct.unpack( + fmt, mm[offset : offset + struct.calcsize(fmt)] + ) + + if mm[:4] != b"\x7fELF": + continue + (section_headers_start,) = unpack("Q", 40) + (section_header_size,) = unpack("H", 58) + (num_section_headers,) = unpack("H", 60) + (shstrndx,) = unpack("H", 62) + (shstrtab_offset,) = unpack( + "Q", section_headers_start + shstrndx * section_header_size + 24 + ) + for i in range(num_section_headers): + (section_name_offset,) = unpack( + "I", section_headers_start + i * section_header_size + ) + name_start = shstrtab_offset + section_name_offset + section_name = mm[name_start : name_start + 6] + if section_name != b".text\0": + continue + (section_offset,) = unpack( + "Q", section_headers_start + i * section_header_size + 24 + ) + (section_size,) = unpack( + "Q", section_headers_start + i * section_header_size + 32 + ) + start = int(filename.split("-")[0], 16) + section_offset + text_sections.append((start, section_size)) + break + mm.close() + return text_sections + + r = random.Random() + r.seed(1) + text_sections = get_text_sections() + addrs = [] + for i in range(200): + s = r.randrange(0, len(text_sections)) + start, size = text_sections[s] + addr = r.randrange(start, start + size) + addrs.append(addr) + fast = torch._C._profiler.symbolize_addresses(addrs, "fast") + dladdr = torch._C._profiler.symbolize_addresses(addrs, "dladdr") + addr2line = torch._C._profiler.symbolize_addresses(addrs, "addr2line") + self.assertEqual(len(fast), len(addrs)) + self.assertEqual(len(addr2line), len(fast)) + if __name__ == "__main__": run_tests() diff --git a/test/quantization/core/test_quantized_tensor.py b/test/quantization/core/test_quantized_tensor.py index 00462023b6b5b..64c939c7e6846 100644 --- a/test/quantization/core/test_quantized_tensor.py +++ b/test/quantization/core/test_quantized_tensor.py @@ -1615,6 +1615,58 @@ def test_decomposed_choose_qparams_per_token_asymmetric_backward(self): out = x.div(s).add(zp).round() out.sum().backward() + def test_decomposed_quantize_per_channel_group(self): + # register the ops + import torch.ao.quantization.fx._decomposed + qmin, qmax = (-8, 7) + group_size = 128 + x = torch.randn(100, 256) + s = torch.randn(100, 2) + zp = torch.randint(qmax, size=(100, 2), 
dtype=torch.int32) + + # simulate fake quantize per channel group with qdq + q = torch.ops.quantized_decomposed.quantize_per_channel_group( + x, s, zp, qmin, qmax, torch.int8, group_size, + ) + dq = torch.ops.quantized_decomposed.dequantize_per_channel_group( + q, s, zp, qmin, qmax, torch.int8, group_size, torch.float32 + ) + + # express per group fake quant using `torch.fake_quantize_per_channel_affine` + x_grouped = x.reshape(-1, group_size) + s_flattened = s.flatten() + zp_flattened = zp.flatten() + fq = torch.fake_quantize_per_channel_affine( + x_grouped, s_flattened, zp_flattened, 0, qmin, qmax, + ) + fq = fq.reshape_as(x) + torch.testing.assert_close(dq, fq, rtol=0, atol=0) + + def test_decomposed_quantize_per_token(self): + # register the ops + import torch.ao.quantization.fx._decomposed + qmin, qmax = (-8, 7) + x = torch.randn(100, 256) + s = torch.randn(100, 1) + zp = torch.randint(qmax, size=(100, 1), dtype=torch.int32) + + # simulate fake quantize per token with qdq + q = torch.ops.quantized_decomposed.quantize_per_token( + x, s, zp, qmin, qmax, torch.int8, + ) + dq = torch.ops.quantized_decomposed.dequantize_per_token( + q, s, zp, qmin, qmax, torch.int8, torch.float32 + ) + + # express per token fake quant using `torch.fake_quantize_per_channel_affine` + s_flattened = s.flatten() + zp_flattened = zp.flatten() + fq = torch.fake_quantize_per_channel_affine( + x, s_flattened, zp_flattened, 0, qmin, qmax, + ) + torch.testing.assert_close(dq, fq, rtol=0, atol=0) + + if __name__ == '__main__': raise RuntimeError("This test file is not meant to be run directly, use:\n\n" "\tpython test/test_quantization.py TESTNAME\n\n" diff --git a/test/run_test.py b/test/run_test.py index af3b4d6866730..5b24a00789964 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -37,7 +37,6 @@ TEST_WITH_ASAN, TEST_WITH_CROSSREF, TEST_WITH_ROCM, - TEST_WITH_SLOW, TEST_WITH_SLOW_GRADCHECK, ) @@ -76,9 +75,11 @@ sys.path.remove(str(REPO_ROOT)) TEST_CONFIG = os.getenv("TEST_CONFIG", "") +BUILD_ENVIRONMENT = os.getenv("BUILD_ENVIRONMENT", "") RERUN_DISABLED_TESTS = os.getenv("PYTORCH_TEST_RERUN_DISABLED_TESTS", "0") == "1" DISTRIBUTED_TEST_PREFIX = "distributed" INDUCTOR_TEST_PREFIX = "inductor" +IS_SLOW = "slow" in TEST_CONFIG or "slow" in BUILD_ENVIRONMENT # Note [ROCm parallel CI testing] @@ -494,7 +495,7 @@ def run_test( None if not options.enable_timeout else THRESHOLD * 6 - if TEST_WITH_SLOW + if IS_SLOW else THRESHOLD * 3 if should_retry and isinstance(test_module, ShardedTest) @@ -1180,18 +1181,25 @@ def parse_args(): and ( TEST_WITH_CROSSREF or TEST_WITH_ASAN - or ( - strtobool(os.environ.get("TD_DISTRIBUTED", "False")) - and TEST_CONFIG == "distributed" - and TEST_CUDA - ) + or (TEST_CONFIG == "distributed" and TEST_CUDA) or (IS_WINDOWS and not TEST_CUDA) or TEST_CONFIG == "nogpu_AVX512" or TEST_CONFIG == "nogpu_NO_AVX2" + or ( + "sm86" not in BUILD_ENVIRONMENT + and TEST_CONFIG == "default" + and TEST_CUDA + ) + or (not TEST_CUDA and TEST_CONFIG == "default") ) and get_pr_number() is not None and not strtobool(os.environ.get("NO_TD", "False")) - and not TEST_WITH_SLOW, + and not IS_SLOW + and not TEST_WITH_ROCM + and not IS_MACOS + and "onnx" not in BUILD_ENVIRONMENT + and "debug" not in BUILD_ENVIRONMENT + and "parallelnative" not in BUILD_ENVIRONMENT, ) parser.add_argument( "additional_unittest_args", diff --git a/test/test_cuda.py b/test/test_cuda.py index 3f5cb476cb292..1872faee6a281 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -4703,7 +4703,7 @@ class TestCudaOptims(TestCase): [ 
optim for optim in optim_db - if "foreach" in optim.supported_impls and "cuda" in optim.supports_fused_on + if "foreach" in optim.supported_impls and "fused" in optim.supported_impls ], dtypes=[torch.float32], ) diff --git a/test/test_fake_tensor.py b/test/test_fake_tensor.py index 40075eb24e04c..d4953101d26b9 100644 --- a/test/test_fake_tensor.py +++ b/test/test_fake_tensor.py @@ -1,76 +1,93 @@ # Owner(s): ["module: meta tensors"] -from torch.testing._internal.common_utils import ( - TestCase, TEST_WITH_TORCHDYNAMO, run_tests, skipIfCrossRef, skipIfRocm, skipIfTorchDynamo, parametrize, - instantiate_parametrized_tests, TemporaryFileName) -import torch -import torch._dynamo -from torch._dynamo.testing import make_test_cls_with_patches +import contextlib +import copy +import dataclasses +import inspect import itertools +import pickle +import unittest +import weakref +from unittest.mock import patch + import numpy as np -from torch.testing._internal.jit_utils import RUN_CUDA +import torch +import torch._dynamo +import torch._functorch.config +import torch._prims as prims +import torch.testing._internal.optests as optests +import torch.utils._pytree as pytree + +from torch import distributed as dist +from torch._C._functorch import _add_batch_dim, get_unwrapped, is_batchedtensor +from torch._dynamo.testing import make_test_cls_with_patches, rand_strided from torch._guards import tracing, TracingContext from torch._subclasses.fake_tensor import ( + DynamicOutputShapeException, extract_tensor_metadata, FakeTensor, - FakeTensorMode, FakeTensorConverter, - DynamicOutputShapeException, - UnsupportedOperatorException, + FakeTensorMode, unset_fake_temporarily, + UnsupportedOperatorException, ) +from torch.fx.experimental.proxy_tensor import make_fx from torch.fx.experimental.symbolic_shapes import ( - ShapeEnv, DimDynamic, free_symbols, StatelessSymbolicContext, ShapeEnvSettings, statically_known_true + DimDynamic, + free_symbols, + ShapeEnv, + ShapeEnvSettings, + StatelessSymbolicContext, + statically_known_true, ) -from torch.testing._internal.custom_op_db import custom_op_db -from torch.testing._internal.common_device_type import ops -from torch.testing._internal.common_device_type import instantiate_device_type_tests, OpDTypes -from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FLASH_ATTENTION from torch.fx.passes.fake_tensor_prop import FakeTensorProp -from torch._dynamo.testing import rand_strided -from torch._C._functorch import is_batchedtensor, _add_batch_dim, get_unwrapped from torch.testing import FileCheck -import dataclasses -import inspect -import unittest -import torch._prims as prims -import contextlib -import weakref -import copy -import pickle -import torch._functorch.config -import torch.testing._internal.optests as optests -from unittest.mock import patch - -from torch import distributed as dist +from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FLASH_ATTENTION +from torch.testing._internal.common_device_type import ( + instantiate_device_type_tests, + OpDTypes, + ops, +) +from torch.testing._internal.common_utils import ( + instantiate_parametrized_tests, + parametrize, + run_tests, + skipIfCrossRef, + skipIfRocm, + skipIfTorchDynamo, + TemporaryFileName, + TEST_WITH_TORCHDYNAMO, + TestCase, +) +from torch.testing._internal.custom_op_db import custom_op_db +from torch.testing._internal.jit_utils import RUN_CUDA from torch.utils._mode_utils import no_dispatch from torch.utils._python_dispatch import TorchDispatchMode -import torch.utils._pytree as 
pytree -from torch.fx.experimental.proxy_tensor import make_fx aten = torch.ops.aten torch._dynamo.config.fake_tensor_cache_enabled = True torch._dynamo.config.fake_tensor_cache_crosscheck_enabled = True + def expectedFailurePropagateRealTensors(fn): fn._expected_failure_propagate_real_tensors = True return fn + class FakeTensorTest(TestCase): def checkType(self, t, device_str, size): self.assertTrue(isinstance(t, FakeTensor)) self.assertEqual(t.device.type, device_str) self.assertEqual(list(t.size()), size) - @unittest.skipIf(not RUN_CUDA, "requires cuda") def test_cuda_initialized(self): # doesnt error with FakeTensorMode(): - p = torch.randn(4, 2, requires_grad=True, device='cuda') - x = torch.randn(8, 4, device='cuda') + p = torch.randn(4, 2, requires_grad=True, device="cuda") + x = torch.randn(8, 4, device="cuda") y = torch.mm(x, p).square().sum() y.backward() @@ -86,18 +103,20 @@ def test_basic(self): self.assertTrue(isinstance(z, FakeTensor)) def test_custom_op_fallback(self): - from torch.library import Library, impl + from torch.library import impl, Library try: test_lib = Library("my_test_op", "DEF") # noqa: TOR901 - test_lib.define('foo(Tensor self) -> Tensor') + test_lib.define("foo(Tensor self) -> Tensor") - @impl(test_lib, 'foo', 'CPU') + @impl(test_lib, "foo", "CPU") def foo_impl(self): return self.cos() x = torch.empty(2, 2, device="cpu") - with self.assertRaisesRegex(UnsupportedOperatorException, "my_test_op.foo.default"): + with self.assertRaisesRegex( + UnsupportedOperatorException, "my_test_op.foo.default" + ): with FakeTensorMode(allow_fallback_kernels=True) as mode: x = mode.from_tensor(x) torch.ops.my_test_op.foo(x) @@ -114,6 +133,7 @@ def test_parameter_instantiation(self): @unittest.skipIf(not dist.is_available(), "requires distributed") def test_fsdp_flat_param(self): from torch.distributed.fsdp._flat_param import FlatParameter + with FakeTensorMode() as m: data = torch.randn(2, 2) param = FlatParameter(data, requires_grad=True) @@ -127,11 +147,13 @@ def test_non_parameter_grad(self): fake_t = mode.from_tensor(t) self.assertEqual(fake_t.requires_grad, t.requires_grad) - @unittest.skipIf(TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile") + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile" + ) @unittest.skipIf(not RUN_CUDA, "requires cuda") def test_index_cuda_with_cpu(self): with FakeTensorMode(): - x = torch.rand([2048], device='cuda') + x = torch.rand([2048], device="cuda") out = x[torch.zeros([36], dtype=torch.int64)] self.checkType(out, "cuda", [36]) @@ -148,14 +170,14 @@ def test_shape_take_not_device(self): def test_repr(self): with FakeTensorMode(): x = torch.empty(2, 2, device="cpu") - self.assertEqual(repr(x), 'FakeTensor(..., size=(2, 2))') + self.assertEqual(repr(x), "FakeTensor(..., size=(2, 2))") x = torch.empty(2, 2, device="meta") self.assertEqual(repr(x), "FakeTensor(..., device='meta', size=(2, 2))") @unittest.skipIf(not RUN_CUDA, "requires cuda") def test_zero_dim(self): with FakeTensorMode() as mode: - x = torch.tensor(0.) + x = torch.tensor(0.0) y = torch.rand([4, 4], device="cuda") out = x + y self.assertEqual(out.shape, (4, 4)) @@ -173,7 +195,7 @@ def test_nan_to_num(self): @unittest.skipIf(not RUN_CUDA, "requires cuda") def test_throw(self): - x = torch.tensor(0.) 
# TODO: tensor() errors + x = torch.tensor(0.0) # TODO: tensor() errors with FakeTensorMode() as mode: x_conv = mode.from_tensor(x) y = torch.rand([4, 4], device="cuda") @@ -207,17 +229,25 @@ def test_device_inplace_copy(self): def test_fake_dispatch_keys(self): with FakeTensorMode(): x = torch.rand([4]) - f = FileCheck().check("CPU").check("ADInplaceOrView").check("AutogradCPU").check("AutocastCPU") + f = ( + FileCheck() + .check("CPU") + .check("ADInplaceOrView") + .check("AutogradCPU") + .check("AutocastCPU") + ) f.run(torch._C._dispatch_key_set(x)) with torch.inference_mode(): x = torch.rand([4]) y = x + x - FileCheck().check("CPU").check("AutocastCPU").run(torch._C._dispatch_key_set(y)) - FileCheck().check_not("ADInplaceOrView").check_not("Autograd").run(torch._C._dispatch_key_set(y)) + FileCheck().check("CPU").check("AutocastCPU").run( + torch._C._dispatch_key_set(y) + ) + FileCheck().check_not("ADInplaceOrView").check_not("Autograd").run( + torch._C._dispatch_key_set(y) + ) - # TODO: functorch support for propagate real tensors - @expectedFailurePropagateRealTensors def test_batch_tensor(self): x = torch.rand((3, 4, 5)) b = _add_batch_dim(x, 0, 0) @@ -289,7 +319,9 @@ def test_fake_mode_error(self): with FakeTensorMode(): y = x[0] - @unittest.skipIf(TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile") + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile" + ) def test_fake_grad_copy(self): x = torch.rand([4, 4], requires_grad=True) x.grad = torch.rand([4, 4]) @@ -306,7 +338,7 @@ def test_index_put_error(self): for context in [contextlib.nullcontext, lambda: mode]: with context(): y = torch.randn(2, 2, 3) - x = torch.randn(2, 2, 3).to('cuda') + x = torch.randn(2, 2, 3).to("cuda") with self.assertRaises(RuntimeError): x[[1, 1]] = y @@ -314,10 +346,12 @@ def test_index_put_error(self): torch.ops.aten.index_put(x, torch.tensor([1, 1], device="cuda"), y) # no error - torch.ops.aten.index_put(x, torch.tensor([1, 1], device="cuda"), torch.tensor(5.)) - torch.ops.aten.index_put_(x, torch.tensor([1, 1], device="cuda"), torch.tensor(5.)) - - + torch.ops.aten.index_put( + x, torch.tensor([1, 1], device="cuda"), torch.tensor(5.0) + ) + torch.ops.aten.index_put_( + x, torch.tensor([1, 1], device="cuda"), torch.tensor(5.0) + ) @unittest.skipIf(not RUN_CUDA, "requires cuda") def test_like_constructor(self): @@ -338,7 +372,9 @@ def test_binary_op_type_promotion(self): self.assertEqual(out.dtype, torch.float) self.assertEqual(out.device.type, "cpu") - @unittest.skipIf(TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile") + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile" + ) def test_from_numpy(self): with FakeTensorMode(): x = torch.tensor(np.zeros([4, 4])) @@ -366,9 +402,15 @@ def test_upsample_bilinear_small_channels(self): mode = FakeTensorMode() for i, context in enumerate([contextlib.nullcontext, lambda: mode]): with context(): - arg0_1 = torch.empty_strided((3, 427, 640), (1, 1920, 3), dtype=torch.float32, device='cuda') + arg0_1 = torch.empty_strided( + (3, 427, 640), (1, 1920, 3), dtype=torch.float32, device="cuda" + ) unsqueeze = torch.ops.aten.unsqueeze.default(arg0_1, 0) - out.append(torch.ops.aten.upsample_bilinear2d.default(unsqueeze, [800, 1199], False)) + out.append( + torch.ops.aten.upsample_bilinear2d.default( + unsqueeze, [800, 1199], False + ) + ) self.assertTrue(out[1].is_contiguous()) self.checkMetaProps(out[0], out[1]) @@ 
-409,8 +451,9 @@ def test_out_multi_device(self): with self.assertRaisesRegex(Exception, "found.+two.+devices"): x.add_(y) - - @unittest.skipIf(TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile") + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile" + ) @unittest.skipIf(not RUN_CUDA, "requires cuda") def test_normalize_device(self): with FakeTensorMode(): @@ -427,10 +470,15 @@ def test_recursive_invocation(self): y = x + x self.assertTrue(mode.in_kernel_invocation) - @unittest.skipIf(TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile") + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile" + ) @skipIfRocm - @parametrize("allow_fallback_kernels", [False, True], - lambda a: 'with_fallback' if a else 'without_fallback') + @parametrize( + "allow_fallback_kernels", + [False, True], + lambda a: "with_fallback" if a else "without_fallback", + ) @unittest.skipIf(not RUN_CUDA, "requires cuda") def test_cudnn_rnn(self, allow_fallback_kernels): def fn( @@ -526,7 +574,7 @@ def fn( for ten in out: if i == 1: self.assertTrue(isinstance(ten, FakeTensor)) - self.assertEqual(ten.device.type, 'cuda') + self.assertEqual(ten.device.type, "cuda") @unittest.skipIf(not RUN_CUDA, "requires cuda") def test_cuda_lstm(self): @@ -544,13 +592,20 @@ def test_cuda_lstm(self): D = 2 if bidir else 1 H_out = proj_size if proj_size > 0 else hidden_size - lstm = torch.nn.LSTM(input_size=H_in, hidden_size=hidden_size, - num_layers=num_layers, proj_size=proj_size, batch_first=False, - bias=True, bidirectional=bidir, device='cuda') + lstm = torch.nn.LSTM( + input_size=H_in, + hidden_size=hidden_size, + num_layers=num_layers, + proj_size=proj_size, + batch_first=False, + bias=True, + bidirectional=bidir, + device="cuda", + ) - h_0 = torch.randn((num_layers * D, N, H_out), device='cuda') - c_0 = torch.randn((num_layers * D, N, hidden_size), device='cuda') - inp = torch.randn((L, N, H_in), device='cuda') + h_0 = torch.randn((num_layers * D, N, H_out), device="cuda") + c_0 = torch.randn((num_layers * D, N, hidden_size), device="cuda") + inp = torch.randn((L, N, H_in), device="cuda") (output, (h_n, c_n)) = lstm(inp, (h_0, c_0)) output.sum().backward() @@ -578,9 +633,8 @@ def test_same_shape_env_preserved(self): t1 = mode1.from_tensor( torch.randn(10), symbolic_context=StatelessSymbolicContext( - dynamic_sizes=[DimDynamic.DYNAMIC], - constraint_sizes=[None] - ) + dynamic_sizes=[DimDynamic.DYNAMIC], constraint_sizes=[None] + ), ) mode2 = FakeTensorMode(shape_env=shape_env) t2 = mode2.from_tensor(t1) @@ -630,11 +684,16 @@ def test_deepcopy(self): mod_copied = copy.deepcopy(mod) def check_copy(mod, mod_copied): - for name, param in itertools.chain(mod.named_parameters(), mod.named_buffers()): + for name, param in itertools.chain( + mod.named_parameters(), mod.named_buffers() + ): param_copied = getattr(mod_copied, name) self.checkMetaProps(param, param_copied) self.assertTrue(isinstance(param_copied, FakeTensor)) - self.assertEqual(isinstance(param, torch.nn.Parameter), isinstance(param_copied, torch.nn.Parameter)) + self.assertEqual( + isinstance(param, torch.nn.Parameter), + isinstance(param_copied, torch.nn.Parameter), + ) self.assertEqual(param.requires_grad, param_copied.requires_grad) check_copy(mod, mod_copied) @@ -653,18 +712,22 @@ def __init__(self): self.assertIs(mod_copied.a, mod_copied.b) self.assertEqual(mod_copied.b.storage()._cdata, mod_copied.a.storage()._cdata) - 
@unittest.skipIf(TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile") + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile" + ) @unittest.skipIf(not RUN_CUDA, "requires cuda") def test_new(self): with FakeTensorMode(): a = torch.rand([16, 1]) self.checkType(a.new(10, 10), "cpu", [10, 10]) self.checkType(a.new([1, 2, 3, 4]), "cpu", [4]) - b = torch.rand([4, 4], device='cuda') - self.checkType(b.new(device='cuda'), "cuda", [0]) + b = torch.rand([4, 4], device="cuda") + self.checkType(b.new(device="cuda"), "cuda", [0]) self.checkType(a.new(torch.rand([1])), "cpu", [1]) - @unittest.skipIf(TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile") + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile" + ) def test_scalar_inputs(self): with FakeTensorMode(): self.checkType(torch.div(3, 2), "cpu", []) @@ -672,7 +735,9 @@ def test_scalar_inputs(self): self.assertEqual(ten.dtype, torch.float) self.checkType(ten, "cpu", [2]) - @unittest.skipIf(TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile") + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile" + ) def test_allow_meta(self): def run_meta(): with FakeTensorMode(): @@ -688,7 +753,7 @@ def test_embedding_bag_meta(self): def f(): # This behavior was originally unintentional but we see people # relying on it - embedding = torch.nn.EmbeddingBag(10, 3, mode='sum', device='meta') + embedding = torch.nn.EmbeddingBag(10, 3, mode="sum", device="meta") input = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9], dtype=torch.long) offsets = torch.tensor([0, 4], dtype=torch.long) return embedding(input, offsets) @@ -701,7 +766,9 @@ def f(): self.assertEqual(r.size(), f.size()) self.assertEqual(r.device, f.device) - @unittest.skipIf(TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile") + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile" + ) def test_mixed_real_and_fake_inputs(self): class _TestPattern(torch.nn.Module): def __init__(self): @@ -730,7 +797,9 @@ def forward(self, input): out = mod(torch.randn(1, 1, 3, 3)) self.checkType(out, "cpu", (1, 1, 3, 3)) - @unittest.skipIf(TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile") + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile" + ) @unittest.skipIf(not RUN_CUDA, "requires cuda") def test_aten_copy_multi_device(self): with FakeTensorMode(): @@ -744,7 +813,9 @@ def test_aten_copy_multi_device(self): self.checkType(copy2, "cuda", (4,)) self.checkType(out, "cpu", (4,)) - @unittest.skipIf(TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile") + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile" + ) @unittest.skipIf(not RUN_CUDA, "requires cuda") def test_aten_index_multi_device(self): with FakeTensorMode(): @@ -768,7 +839,9 @@ def test_aten_index_multi_device(self): self.checkType(r3, "cpu", (4, 4)) self.checkType(r4, "cuda", (4, 4)) - @unittest.skipIf(TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile") + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile" + ) @unittest.skipIf(not RUN_CUDA, "requires cuda") def test_aten_slice_scatter_multi_device(self): with FakeTensorMode(): @@ -790,7 +863,10 
@@ def test__adaptive_avg_pool2d_backward(self): grad_out = torch.rand(2, 3, 4, 4) inp = torch.rand(2, 3, 4, 4).to(memory_format=torch.channels_last) grad_in = torch.ops.aten._adaptive_avg_pool2d_backward(grad_out, inp) - self.assertTrue(torch._prims_common.suggest_memory_format(grad_in) == torch.channels_last) + self.assertTrue( + torch._prims_common.suggest_memory_format(grad_in) + == torch.channels_last + ) # Propagate real tensors doesn't work when original input arguments are # fake @@ -805,6 +881,23 @@ def forward(self, input): ep = torch.export.export(MyNumpyModel(), args=(torch.randn(1000),)) self.assertTrue(isinstance(ep, torch.export.ExportedProgram)) + def test_unsqueeze_copy(self): + shape_env = ShapeEnv() + t1 = torch.ones(2, 2, 768) + with FakeTensorMode(shape_env=shape_env) as fake_mode: + t = fake_mode.from_tensor( + t1, + symbolic_context=StatelessSymbolicContext( + dynamic_sizes=[ + DimDynamic.DYNAMIC, + DimDynamic.STATIC, + DimDynamic.STATIC, + ], + ), + ) + + self.assertEqual(t.shape[0], torch.ops.aten.unsqueeze_copy(t, 1).shape[0]) + def test_alias_call(self): fwAD = torch.autograd.forward_ad @@ -853,20 +946,20 @@ def assertNotConst(self, *args): def test_simple(self): with FakeTensorMode(): - x = torch.tensor(4.) - self.assertEqual(x.item(), 4.) + x = torch.tensor(4.0) + self.assertEqual(x.item(), 4.0) def test_inplace_add(self): with FakeTensorMode(): - x = torch.tensor(4.) + x = torch.tensor(4.0) y = x.add_(1) - self.assertEqual(x.item(), 5.) - self.assertEqual(y.item(), 5.) + self.assertEqual(x.item(), 5.0) + self.assertEqual(y.item(), 5.0) self.assertConst(x, y) def test_shared_storages(self): with FakeTensorMode(): - x = torch.tensor([4.]) + x = torch.tensor([4.0]) y = x[:] self.assertEqual(x.storage()._cdata, y.storage()._cdata) @@ -874,7 +967,7 @@ def test_shared_storages(self): def test_constant_invalidation(self): with FakeTensorMode(): - x = torch.tensor([1.]) + x = torch.tensor([1.0]) self.assertConst(x) y = torch.rand([1]) x.add_(y) @@ -889,13 +982,14 @@ def test_inplace_view_invalidation(self): self.assertNotConst(x) def test_fake_tensor_in_intlist_repro(self): - def fn(tensors): max_size = torch.tensor([800, 1216], dtype=torch.int64) batch_shape = [len(tensors)] + list(tensors[0].shape[:-2]) + list(max_size) return tensors[0].new_full(batch_shape, 0.0) - with self.assertRaises(torch._subclasses.fake_tensor.DataDependentOutputException): + with self.assertRaises( + torch._subclasses.fake_tensor.DataDependentOutputException + ): with torch._subclasses.fake_tensor.FakeTensorMode(): a = torch.randn(3, 800, 1199) b = torch.randn(3, 800, 800) @@ -913,7 +1007,7 @@ def test_fake_tensor_batch_norm_cpu(self): def test_shared_storage_invalidation(self): with FakeTensorMode(): - x = torch.tensor([1.]) + x = torch.tensor([1.0]) y = x[:] self.assertConst(x, y) y.add_(torch.rand([1])) @@ -929,7 +1023,7 @@ def test_aliased_const_write(self): def test_constant_propagate_through_functions(self): with FakeTensorMode(): - y = torch.div(4, 4, rounding_mode='trunc') + y = torch.div(4, 4, rounding_mode="trunc") self.assertConst(y) @@ -954,7 +1048,9 @@ def test_fake(self, device, dtype, op): make_propagate_real_tensors_cls(FakeTensorOpInfoTest) instantiate_device_type_tests(FakeTensorOpInfoTest, globals(), only_for=("cpu", "cuda")) -instantiate_device_type_tests(PropagateRealTensorsFakeTensorOpInfoTest, globals(), only_for=("cpu",)) # noqa: F821 +instantiate_device_type_tests( + PropagateRealTensorsFakeTensorOpInfoTest, globals(), only_for=("cpu",) # noqa: F821 +) class 
FakeTensorConverterTest(TestCase): @@ -967,7 +1063,10 @@ def test_memoized_conversion_from_meta(self): x = torch.rand(2, 2).to(device="meta") mode = FakeTensorMode() converter = mode.fake_tensor_converter - self.assertTrue(converter.from_meta_and_device(mode, x, "cpu") is converter.from_meta_and_device(mode, x, "cpu")) + self.assertTrue( + converter.from_meta_and_device(mode, x, "cpu") + is converter.from_meta_and_device(mode, x, "cpu") + ) def test_separate_tensor_storages_view(self): x = torch.rand(2, 2, 2) @@ -998,7 +1097,6 @@ def test_separate_tensor_storages_non_view(self): self.assertEqual(len(converter.tensor_memo), 0) self.assertEqual(len(converter.meta_converter.storage_memo), 0) - @skipIfTorchDynamo("https://github.com/pytorch/torchdynamo/issues/1991") def test_dead_weak_ref(self): x = torch.rand(2, 2, 2) @@ -1101,7 +1199,8 @@ def test_non_kwarg_only_device(self): ) if has_non_kwarg_device: self.assertTrue( - self.get_aten_op(schema) in torch._subclasses.fake_tensor._device_not_kwarg_ops + self.get_aten_op(schema) + in torch._subclasses.fake_tensor._device_not_kwarg_ops ) def test_tensor_constructors_all_have_kwarg_device(self): @@ -1140,24 +1239,35 @@ def test_like_ops(self): for schema in self.get_all_aten_schemas(): if "_like" == schema.name[-5:]: op = self.get_aten_op(schema) - self.assertIn(op, torch._subclasses.fake_tensor._like_tensor_constructors) + self.assertIn( + op, torch._subclasses.fake_tensor._like_tensor_constructors + ) def test_str_storage(self): x = torch.zeros(3) with FakeTensorMode() as m: y = m.from_tensor(x) - self.assertExpectedInline(str(x.storage()), '''\ + self.assertExpectedInline( + str(x.storage()), + """\ 0.0 0.0 0.0 -[torch.storage.TypedStorage(dtype=torch.float32, device=cpu) of size 3]''') - self.assertExpectedInline(str(y.storage()), '''\ +[torch.storage.TypedStorage(dtype=torch.float32, device=cpu) of size 3]""", + ) + self.assertExpectedInline( + str(y.storage()), + """\ ... -[torch.storage.TypedStorage(dtype=torch.float32, device=meta) of size 3]''') +[torch.storage.TypedStorage(dtype=torch.float32, device=meta) of size 3]""", + ) - self.assertExpectedInline(str(y.storage()), '''\ + self.assertExpectedInline( + str(y.storage()), + """\ ... 
-[torch.storage.TypedStorage(dtype=torch.float32, device=meta) of size 3]''') +[torch.storage.TypedStorage(dtype=torch.float32, device=meta) of size 3]""", + ) # at::_embedding_bag has no op info, # and returns extra tensors that at::embedding bag throws away @@ -1172,7 +1282,9 @@ def test_embedding_bag_private(self): ref_out = torch.ops.aten._embedding_bag(*args) with FakeTensorMode() as m: - meta_args = [m.from_tensor(a) if isinstance(a, torch.Tensor) else a for a in args] + meta_args = [ + m.from_tensor(a) if isinstance(a, torch.Tensor) else a for a in args + ] meta_out = torch.ops.aten._embedding_bag(*meta_args) self.assertEqual(len(ref_out), len(meta_out)) @@ -1188,20 +1300,29 @@ def test_cross_entropy_loss(self): args = (inp, target, w) ref = fn(*args) with FakeTensorMode() as m: - meta_args = [m.from_tensor(a) if isinstance(a, torch.Tensor) else a for a in args] - meta_out = torch.nn.functional.cross_entropy(*meta_args, label_smoothing=0.5) + meta_args = [ + m.from_tensor(a) if isinstance(a, torch.Tensor) else a for a in args + ] + meta_out = torch.nn.functional.cross_entropy( + *meta_args, label_smoothing=0.5 + ) self.assertEqual(ref.size(), meta_out.size()) @skipIfRocm - @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Does not support SDPA or pre-SM80 hardware") + @unittest.skipIf( + not PLATFORM_SUPPORTS_FLASH_ATTENTION, + "Does not support SDPA or pre-SM80 hardware", + ) def test_flash_attention(self): class Repro(torch.nn.Module): def __init__(self): super().__init__() def forward(self, arg1, arg2, arg3): - torch.ops.aten._scaled_dot_product_flash_attention(arg1, arg2, arg3, scale=0.17677669529663687) + torch.ops.aten._scaled_dot_product_flash_attention( + arg1, arg2, arg3, scale=0.17677669529663687 + ) args_new = [ [ @@ -1213,11 +1334,13 @@ def forward(self, arg1, arg2, arg3): ((4, 2, 16, 32), (1024, 512, 32, 1), torch.float16, "cuda"), ((4, 2, 16, 32), (1024, 512, 32, 1), torch.float16, "cuda"), ((4, 2, 16, 32), (1024, 512, 32, 1), torch.float16, "cuda"), - ] + ], ] for args_list in args_new: - args = [rand_strided(bsz, num_heads, seq_len, head_dim) for - (bsz, num_heads, seq_len, head_dim) in args_list] + args = [ + rand_strided(bsz, num_heads, seq_len, head_dim) + for (bsz, num_heads, seq_len, head_dim) in args_list + ] try: with torch._subclasses.CrossRefFakeMode(): Repro()(*args) @@ -1225,7 +1348,10 @@ def forward(self, arg1, arg2, arg3): # We expect the cross ref to succed for the first output to fail # for the rng state, see Note [Seed and Offset] self.assertTrue("output[0]" not in str(e)) - self.assertTrue("found mismatched tensor metadata for output[6]: Devices cpu and cuda:0 are not equal!" in str(e)) + self.assertTrue( + "found mismatched tensor metadata for output[6]: Devices cpu and cuda:0 are not equal!" + in str(e) + ) # IMPORTANT!!! 
Always run even if CUDA is not available def test_fake_cuda_no_init(self): @@ -1234,12 +1360,12 @@ def test_fake_cuda_no_init(self): if torch._functorch.config.fake_tensor_propagate_real_tensors: return with FakeTensorMode(): - torch.empty(10, device='cuda') - torch.ones(10, device='cuda') - torch.zeros(10, device='cuda') - torch.rand(10, device='cuda') - torch.tensor(3.14, device='cuda') - torch.tensor([[3.14, 2], [1, 2]], device='cuda') + torch.empty(10, device="cuda") + torch.ones(10, device="cuda") + torch.zeros(10, device="cuda") + torch.rand(10, device="cuda") + torch.tensor(3.14, device="cuda") + torch.tensor([[3.14, 2], [1, 2]], device="cuda") @skipIfRocm @unittest.skipIf(not RUN_CUDA, "requires cuda") @@ -1351,7 +1477,6 @@ def to_fake_tensor(x): failed = True self.assertTrue(failed) - @expectedFailurePropagateRealTensors # Propagate real tensors doesn't work with fake-on-fake def test_fake_tensor_prop_on_nn_module_with_optional_args(self): class OptionalArgumentInBetween(torch.nn.Module): @@ -1371,14 +1496,19 @@ def forward(self, value, another_value=None, another_optional_value=None): value = value + another_value + another_optional_value return value * value - fake_mode = FakeTensorMode(allow_non_fake_inputs=True, allow_fallback_kernels=False) + fake_mode = FakeTensorMode( + allow_non_fake_inputs=True, allow_fallback_kernels=False + ) with fake_mode: model = OptionalArgumentInBetween() value = torch.randn(5, 4) another_optional_value = torch.randn(5, 4) - graph_model = torch.fx.symbolic_trace(model, (value, None, another_optional_value)) - FakeTensorProp(graph_model, fake_mode).propagate(value, None, another_optional_value) - + graph_model = torch.fx.symbolic_trace( + model, (value, None, another_optional_value) + ) + FakeTensorProp(graph_model, fake_mode).propagate( + value, None, another_optional_value + ) @expectedFailurePropagateRealTensors # TODO: not sure about this one, kinda strange def test_unbacked_shape_realloc(self): @@ -1390,12 +1520,14 @@ def f(x): with fake_mode: value = torch.randn(5) gm = make_fx(f)(value) - nonzero_nodes = [n for n in gm.graph.nodes if n.target is torch.ops.aten.nonzero.default] + nonzero_nodes = [ + n for n in gm.graph.nodes if n.target is torch.ops.aten.nonzero.default + ] self.assertEqual(len(nonzero_nodes), 1) - self.assertIsInstance(nonzero_nodes[0].meta['val'].shape[0], torch.SymInt) - u0 = nonzero_nodes[0].meta['val'].shape[0] + self.assertIsInstance(nonzero_nodes[0].meta["val"].shape[0], torch.SymInt) + u0 = nonzero_nodes[0].meta["val"].shape[0] FakeTensorProp(gm, fake_mode).propagate(value) - u1 = nonzero_nodes[0].meta['val'].shape[0] + u1 = nonzero_nodes[0].meta["val"].shape[0] # Test that this test is actually doing something in that the # FakeTensorProp actually triggered a reallocation. 
If this assert is # failing, it could be because we started memoizing the nnz count for @@ -1407,9 +1539,7 @@ def f(x): self.assertIsNot(u0, u1) self.assertTrue(statically_known_true(u0 == u1)) - def test_torch_load_with_fake_mode(self): - class TheModelClass(torch.nn.Module): def __init__(self): super().__init__() @@ -1462,7 +1592,8 @@ def test_shape_env_settings(self): """ init_sig = inspect.signature(ShapeEnv._init) args = [ - name for name, param in init_sig.parameters.items() + name + for name, param in init_sig.parameters.items() if type(param.default) is bool ] @@ -1770,5 +1901,6 @@ def test_inference_mode(self): extract_tensor_metadata(res4), ) + if __name__ == "__main__": run_tests() diff --git a/test/test_flop_counter.py b/test/test_flop_counter.py index 43f5cb9dadf4f..4f9c7020c0e60 100644 --- a/test/test_flop_counter.py +++ b/test/test_flop_counter.py @@ -1,15 +1,24 @@ # Owner(s): ["module: unknown"] +import functools +import unittest + import torch -from torch.testing._internal.common_utils import TestCase, run_tests, TEST_WITH_TORCHDYNAMO -from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FLASH_ATTENTION, PLATFORM_SUPPORTS_MEM_EFF_ATTENTION -import torch.utils.flop_counter import torch.nn.functional as F -import unittest -import functools +import torch.utils.flop_counter +from torch.testing._internal.common_cuda import ( + PLATFORM_SUPPORTS_FLASH_ATTENTION, + PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, +) +from torch.testing._internal.common_utils import ( + run_tests, + TEST_WITH_TORCHDYNAMO, + TestCase, +) try: from torchvision import models as torchvision_models + HAS_TORCHVISION = True except ImportError: HAS_TORCHVISION = False @@ -17,16 +26,22 @@ HAS_CUDA = torch.cuda.is_available() + def FlopCounterMode(*args, **kwargs): return torch.utils.flop_counter.FlopCounterMode(*args, **kwargs, display=False) + def get_total_flops(mode): return str(sum(v for _, v in mode.flop_counts["Global"].items())) + def T(*shape, requires_grad=False): return torch.randn(*shape, requires_grad=requires_grad) -@unittest.skipIf(TEST_WITH_TORCHDYNAMO, "torchdynamo doesn't work with __torch_dispatch__ right now") + +@unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "torchdynamo doesn't work with __torch_dispatch__ right now" +) class TestFlopCounter(TestCase): def test_flop_counter_variety(self): mod = torch.nn.Linear(9, 10) @@ -109,6 +124,7 @@ def test_backward_reset(self): def test_torchscript(self): def foo(x): return torch.mm(x, x) + with FlopCounterMode() as mode: foo(T(5, 5)) unscripted_flops = get_total_flops(mode) @@ -125,7 +141,9 @@ def forward(ctx, input: torch.Tensor) -> torch.Tensor: @staticmethod def backward(ctx, grad_output: torch.Tensor) -> torch.Tensor: - return torch.mm(grad_output, grad_output) + torch.mm(grad_output, grad_output) + return torch.mm(grad_output, grad_output) + torch.mm( + grad_output, grad_output + ) a = T(5, 5, requires_grad=True) with FlopCounterMode() as mode: @@ -160,11 +178,13 @@ def backward(ctx, grad_out): return grad_inp, grad_weight, None else: grad_inp = F.conv1d(grad_out, weight) - grad_weight = F.conv1d(grad_out.transpose(1, 0), inp.transpose(1, 0)) + grad_weight = F.conv1d( + grad_out.transpose(1, 0), inp.transpose(1, 0) + ) return grad_inp, grad_weight.transpose(1, 0), None - from torch.func import grad + x = torch.randn(2, 3, 16, dtype=torch.float64) weight = torch.randn(3, 4, 4, dtype=torch.float64) @@ -182,13 +202,16 @@ def only_convs(x, weight, transposed): self.assertEqual(boring_grads, fun_grads) - def test_convs(self): def 
assert_equivalence(f, expected_forward=None): with FlopCounterMode() as mode: f() - conv_forward_flops = mode.get_flop_counts()['Global'][torch.ops.aten.convolution] - conv_backward_flops = mode.get_flop_counts()['Global'][torch.ops.aten.convolution_backward] + conv_forward_flops = mode.get_flop_counts()["Global"][ + torch.ops.aten.convolution + ] + conv_backward_flops = mode.get_flop_counts()["Global"][ + torch.ops.aten.convolution_backward + ] self.assertEqual(conv_forward_flops * 2, conv_backward_flops) if expected_forward is not None: @@ -213,8 +236,12 @@ def assert_equivalence(f, expected_forward=None): x = torch.rand(1, in_channels, 4, 4, requires_grad=True) weight = torch.randn(out_channels, in_channels, 2, 2, requires_grad=True) assert_equivalence(lambda: F.conv2d(x, weight).sum().backward()) - transposed_weight = torch.randn(in_channels, out_channels, 2, 2, requires_grad=True) - assert_equivalence(lambda: F.conv_transpose2d(x, transposed_weight).sum().backward()) + transposed_weight = torch.randn( + in_channels, out_channels, 2, 2, requires_grad=True + ) + assert_equivalence( + lambda: F.conv_transpose2d(x, transposed_weight).sum().backward() + ) @skipIfNoTorchVision def test_module(self): @@ -224,12 +251,15 @@ def test_module(self): resnet18(a).sum().backward() self.assertExpectedInline(get_total_flops(mode), """10884440064""") - layer1_conv_flops = mode.flop_counts['ResNet.layer1'][torch.ops.aten.convolution] - layer1_conv_back_flops = mode.flop_counts['ResNet.layer1'][torch.ops.aten.convolution_backward] + layer1_conv_flops = mode.flop_counts["ResNet.layer1"][ + torch.ops.aten.convolution + ] + layer1_conv_back_flops = mode.flop_counts["ResNet.layer1"][ + torch.ops.aten.convolution_backward + ] self.assertExpectedInline(str(layer1_conv_flops), """924844032""") self.assertExpectedInline(str(layer1_conv_back_flops), """1849688064""") - def test_conv_transpose_loop(self): x = torch.rand(1, 4, 30, 2) model = torch.nn.ConvTranspose2d(4, 8, (2, 2), stride=2) @@ -241,7 +271,9 @@ def test_conv_transpose_loop(self): self.assertExpectedInline(str(mode.get_total_flops()), """1536000""") def test_custom(self): - mode = FlopCounterMode(custom_mapping={torch.ops.aten.add: lambda *args, out_shape: 5}) + mode = FlopCounterMode( + custom_mapping={torch.ops.aten.add: lambda *args, out_shape: 5} + ) with mode: a = T(4, 5) a + a @@ -250,6 +282,7 @@ def test_custom(self): def count(*args, out_val): return out_val.numel() + count._get_raw = True mode = FlopCounterMode(custom_mapping={torch.ops.aten.add: count}) @@ -264,8 +297,11 @@ def test_noop(self): T(4, 5).cos() @unittest.skipIf(not HAS_CUDA, "CUDA not available") - @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION or not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, - "Does not support all SDPA backends (pre-SM80 hardware on CUDA)") + @unittest.skipIf( + not PLATFORM_SUPPORTS_FLASH_ATTENTION + or not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, + "Does not support all SDPA backends (pre-SM80 hardware on CUDA)", + ) def test_sdpa(self): batch_size = 4 n_heads = 8 @@ -277,73 +313,154 @@ def test_sdpa(self): torch.manual_seed(0) - def get_flops(batch_size, n_heads, seq_len_q, seq_len_k, head_dim, head_dim_v, dtype, backend, with_backward=False): - query = torch.randn(batch_size, n_heads, seq_len_q, head_dim, device='cuda', dtype=dtype, requires_grad=True) - key = torch.randn(batch_size, n_heads, seq_len_k, head_dim, device='cuda', dtype=dtype, requires_grad=True) - value = torch.randn(batch_size, n_heads, seq_len_k, head_dim_v, device='cuda', dtype=dtype, 
requires_grad=True) + def get_flops( + batch_size, + n_heads, + seq_len_q, + seq_len_k, + head_dim, + head_dim_v, + dtype, + backend, + with_backward=False, + ): + query = torch.randn( + batch_size, + n_heads, + seq_len_q, + head_dim, + device="cuda", + dtype=dtype, + requires_grad=True, + ) + key = torch.randn( + batch_size, + n_heads, + seq_len_k, + head_dim, + device="cuda", + dtype=dtype, + requires_grad=True, + ) + value = torch.randn( + batch_size, + n_heads, + seq_len_k, + head_dim_v, + device="cuda", + dtype=dtype, + requires_grad=True, + ) if backend == "math": - backend = torch.backends.cuda.sdp_kernel(enable_flash=False, enable_math=True, enable_mem_efficient=False) + backend = torch.backends.cuda.sdp_kernel( + enable_flash=False, enable_math=True, enable_mem_efficient=False + ) elif backend == "flash": - backend = torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False) + backend = torch.backends.cuda.sdp_kernel( + enable_flash=True, enable_math=False, enable_mem_efficient=False + ) elif backend == "mem_efficient": - backend = torch.backends.cuda.sdp_kernel(enable_flash=False, enable_math=False, enable_mem_efficient=True) + backend = torch.backends.cuda.sdp_kernel( + enable_flash=False, enable_math=False, enable_mem_efficient=True + ) mode = FlopCounterMode() with backend, mode: - out = F.scaled_dot_product_attention(query, key, value, dropout_p=0, is_causal=True) + out = F.scaled_dot_product_attention( + query, key, value, dropout_p=0, is_causal=True + ) if with_backward: out.sum().backward() return int(get_total_flops(mode)) # Sets seq_len_q == seq_len_k and dim_q == dim_v - run_uniform_flops = functools.partial(get_flops, batch_size, n_heads, seq_len_q, seq_len_q, head_dim, head_dim, dtype) + run_uniform_flops = functools.partial( + get_flops, + batch_size, + n_heads, + seq_len_q, + seq_len_q, + head_dim, + head_dim, + dtype, + ) - flops = [run_uniform_flops(backend, with_backward=False) for backend in ["math", "flash", "mem_efficient"]] + flops = [ + run_uniform_flops(backend, with_backward=False) + for backend in ["math", "flash", "mem_efficient"] + ] flops_fw_math, flops_fw_flash, flops_fw_efficient = flops self.assertEqual(flops_fw_math, flops_fw_flash) self.assertEqual(flops_fw_math, flops_fw_efficient) self.assertExpectedInline(str(flops_fw_math), """134217728""") - flops = [run_uniform_flops(backend, with_backward=True) for backend in ["math", "flash", "mem_efficient"]] + flops = [ + run_uniform_flops(backend, with_backward=True) + for backend in ["math", "flash", "mem_efficient"] + ] flops_fw_bw_math, flops_fw_bw_flash, flops_fw_bw_efficient = flops self.assertEqual(flops_fw_math * 3, flops_fw_bw_math) self.assertEqual(flops_fw_math * 7 // 2, flops_fw_bw_flash) self.assertEqual(flops_fw_bw_flash, flops_fw_bw_efficient) - - run_nonuniform_flops = functools.partial(get_flops, batch_size, n_heads, seq_len_q, seq_len_k, head_dim, head_dim_v, dtype) + run_nonuniform_flops = functools.partial( + get_flops, + batch_size, + n_heads, + seq_len_q, + seq_len_k, + head_dim, + head_dim_v, + dtype, + ) # Flash does not support non-uniform attention, i.e. 
seq_len_q != seq_len_k or dim_q != dim_v" non_uniform_backends = ["math", "mem_efficient"] - flops = [run_nonuniform_flops(backend, with_backward=False) for backend in non_uniform_backends] + flops = [ + run_nonuniform_flops(backend, with_backward=False) + for backend in non_uniform_backends + ] flops_fw_math, flops_fw_efficient = flops self.assertEqual(flops_fw_math, flops_fw_efficient) self.assertExpectedInline(str(flops_fw_math), """268435456""") - flops = [run_nonuniform_flops(backend, with_backward=True) for backend in non_uniform_backends] + flops = [ + run_nonuniform_flops(backend, with_backward=True) + for backend in non_uniform_backends + ] flops_fw_bw_math, flops_fw_bw_efficient = flops self.assertExpectedInline(str(flops_fw_bw_math), """805306368""") self.assertExpectedInline(str(flops_fw_bw_efficient), """939524096""") @unittest.skipIf(not HAS_CUDA, "CUDA not available") - @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION or not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, - "Does not support all SDPA backends (pre-SM80 hardware on CUDA)") + @unittest.skipIf( + not PLATFORM_SUPPORTS_FLASH_ATTENTION + or not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, + "Does not support all SDPA backends (pre-SM80 hardware on CUDA)", + ) def test_sdpa_nested_tensor(self): - def get_flops(q, k, v, backend, with_backward=False): mode = FlopCounterMode() if backend == "math": - backend = torch.backends.cuda.sdp_kernel(enable_flash=False, enable_math=True, enable_mem_efficient=False) + backend = torch.backends.cuda.sdp_kernel( + enable_flash=False, enable_math=True, enable_mem_efficient=False + ) elif backend == "flash": - backend = torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False) + backend = torch.backends.cuda.sdp_kernel( + enable_flash=True, enable_math=False, enable_mem_efficient=False + ) elif backend == "mem_efficient": - backend = torch.backends.cuda.sdp_kernel(enable_flash=False, enable_math=False, enable_mem_efficient=True) + backend = torch.backends.cuda.sdp_kernel( + enable_flash=False, enable_math=False, enable_mem_efficient=True + ) with backend, mode: - out = F.scaled_dot_product_attention(q, k, v, dropout_p=0, is_causal=True) + out = F.scaled_dot_product_attention( + q, k, v, dropout_p=0, is_causal=True + ) if with_backward: if out.is_nested: out.values().sum().backward() @@ -361,25 +478,47 @@ def get_nested_inputs( head_dim_v, dtype, ): - q_lengths = torch.tensor([ - max_seq_len_q // 4, - max_seq_len_q // 4 * 2, - max_seq_len_q // 4 * 3, - max_seq_len_q // 4 * 4 - ]) - k_lengths = torch.tensor([ - max_seq_len_k // 4, - max_seq_len_k // 4 * 2, - max_seq_len_k // 4 * 3, - max_seq_len_k // 4 * 4 - ]) + q_lengths = torch.tensor( + [ + max_seq_len_q // 4, + max_seq_len_q // 4 * 2, + max_seq_len_q // 4 * 3, + max_seq_len_q // 4 * 4, + ] + ) + k_lengths = torch.tensor( + [ + max_seq_len_k // 4, + max_seq_len_k // 4 * 2, + max_seq_len_k // 4 * 3, + max_seq_len_k // 4 * 4, + ] + ) q_offsets, k_offsets = ( torch.cat((torch.tensor([0]), torch.cumsum(lengths, dim=0))).cuda() for lengths in (q_lengths, k_lengths) ) - q_values = torch.randn(q_offsets[-1], head_dim * n_heads, dtype=dtype, requires_grad=True, device="cuda") - k_values = torch.randn(k_offsets[-1], head_dim * n_heads, dtype=dtype, requires_grad=True, device="cuda") - v_values = torch.randn(k_offsets[-1], head_dim_v * n_heads, dtype=dtype, requires_grad=True, device="cuda") + q_values = torch.randn( + q_offsets[-1], + head_dim * n_heads, + dtype=dtype, + requires_grad=True, + device="cuda", + ) + 
k_values = torch.randn( + k_offsets[-1], + head_dim * n_heads, + dtype=dtype, + requires_grad=True, + device="cuda", + ) + v_values = torch.randn( + k_offsets[-1], + head_dim_v * n_heads, + dtype=dtype, + requires_grad=True, + device="cuda", + ) q = torch.nested.nested_tensor_from_jagged(q_values, q_offsets) k = torch.nested.nested_tensor_from_jagged(k_values, k_offsets) @@ -397,13 +536,16 @@ def split_tensor(x): y.unsqueeze(0).transpose(1, 2).detach().requires_grad_(True) for y in x.transpose(1, 2).unbind(0) ) + q_tensors = split_tensor(q) k_tensors = split_tensor(k) v_tensors = split_tensor(v) flops = 0 for q_i, k_i, v_i in zip(q_tensors, k_tensors, v_tensors): - flops += get_flops(q_i, k_i, v_i, backend=backend, with_backward=with_backward) + flops += get_flops( + q_i, k_i, v_i, backend=backend, with_backward=with_backward + ) return flops @@ -429,29 +571,77 @@ def split_tensor(x): } self.assertEqual( - get_dense_flops(*get_nested_inputs(**uniform_config), backend="flash", with_backward=False), - get_flops(*get_nested_inputs(**uniform_config), backend="flash", with_backward=False), + get_dense_flops( + *get_nested_inputs(**uniform_config), + backend="flash", + with_backward=False, + ), + get_flops( + *get_nested_inputs(**uniform_config), + backend="flash", + with_backward=False, + ), ) self.assertEqual( - get_dense_flops(*get_nested_inputs(**uniform_config), backend="mem_efficient", with_backward=False), - get_flops(*get_nested_inputs(**uniform_config), backend="mem_efficient", with_backward=False), + get_dense_flops( + *get_nested_inputs(**uniform_config), + backend="mem_efficient", + with_backward=False, + ), + get_flops( + *get_nested_inputs(**uniform_config), + backend="mem_efficient", + with_backward=False, + ), ) self.assertEqual( - get_dense_flops(*get_nested_inputs(**differing_config), backend="mem_efficient", with_backward=False), - get_flops(*get_nested_inputs(**differing_config), backend="mem_efficient", with_backward=False), + get_dense_flops( + *get_nested_inputs(**differing_config), + backend="mem_efficient", + with_backward=False, + ), + get_flops( + *get_nested_inputs(**differing_config), + backend="mem_efficient", + with_backward=False, + ), ) self.assertEqual( - get_dense_flops(*get_nested_inputs(**uniform_config), backend="flash", with_backward=True), - get_flops(*get_nested_inputs(**uniform_config), backend="flash", with_backward=True), + get_dense_flops( + *get_nested_inputs(**uniform_config), + backend="flash", + with_backward=True, + ), + get_flops( + *get_nested_inputs(**uniform_config), + backend="flash", + with_backward=True, + ), ) self.assertEqual( - get_dense_flops(*get_nested_inputs(**uniform_config), backend="mem_efficient", with_backward=True), - get_flops(*get_nested_inputs(**uniform_config), backend="mem_efficient", with_backward=True), + get_dense_flops( + *get_nested_inputs(**uniform_config), + backend="mem_efficient", + with_backward=True, + ), + get_flops( + *get_nested_inputs(**uniform_config), + backend="mem_efficient", + with_backward=True, + ), ) self.assertEqual( - get_dense_flops(*get_nested_inputs(**differing_config), backend="mem_efficient", with_backward=True), - get_flops(*get_nested_inputs(**differing_config), backend="mem_efficient", with_backward=True), + get_dense_flops( + *get_nested_inputs(**differing_config), + backend="mem_efficient", + with_backward=True, + ), + get_flops( + *get_nested_inputs(**differing_config), + backend="mem_efficient", + with_backward=True, + ), ) def test_addmm_out(self): @@ -479,8 +669,8 @@ def 
test_hook_registration(self): def test_pytrees(self): class Foo(torch.nn.Module): def forward(self, x): - x = x['a'].relu_() - return {'a': torch.mm(x, x)} + x = x["a"].relu_() + return {"a": torch.mm(x, x)} class Mod(torch.nn.Module): def __init__(self): @@ -493,8 +683,12 @@ def forward(self, x): mod = Mod() with FlopCounterMode() as mode: - mod({'a': torch.randn(10, 10, requires_grad=True).clone()})['a'].sum().backward() - self.assertExpectedInline((mode.flop_counts['Mod'][torch.ops.aten.mm]), """12000""") + mod({"a": torch.randn(10, 10, requires_grad=True).clone()})[ + "a" + ].sum().backward() + self.assertExpectedInline( + (mode.flop_counts["Mod"][torch.ops.aten.mm]), """12000""" + ) class Mod2(torch.nn.Module): def forward(self, x): @@ -503,7 +697,9 @@ def forward(self, x): mod = Mod2() with FlopCounterMode() as mode: mod(torch.randn(10, 10, requires_grad=True))[0].sum().backward() - self.assertExpectedInline((mode.flop_counts['Mod2'][torch.ops.aten.mm]), """6000""") + self.assertExpectedInline( + (mode.flop_counts["Mod2"][torch.ops.aten.mm]), """6000""" + ) def test_warning(self): mod = torch.nn.Linear(2, 2) @@ -511,5 +707,5 @@ def test_warning(self): FlopCounterMode(mod) -if __name__ == '__main__': +if __name__ == "__main__": run_tests() diff --git a/test/test_function_schema.py b/test/test_function_schema.py index 47586147dbbcb..439a3c66d3f02 100644 --- a/test/test_function_schema.py +++ b/test/test_function_schema.py @@ -1,8 +1,8 @@ # Owner(s): ["module: unknown"] import torch -from torch.testing._internal.common_utils import TestCase, run_tests from torch._C import parse_schema +from torch.testing._internal.common_utils import run_tests, TestCase class TestFunctionSchema(TestCase): @@ -16,216 +16,306 @@ def test_serialize_and_deserialize(self): self.assertTrue(parsed_schema.is_backward_compatible_with(schema)) def test_out_schema(self): - schema_with_out = parse_schema('any.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)') + schema_with_out = parse_schema( + "any.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)" + ) self.assertTrue(schema_with_out.arguments[-1].is_out) - schema_without_out = parse_schema('any.not_out(Tensor self, Tensor b) -> Tensor') + schema_without_out = parse_schema( + "any.not_out(Tensor self, Tensor b) -> Tensor" + ) self.assertFalse(schema_without_out.arguments[-1].is_out) def test_hash_schema(self): - schema1 = parse_schema('any.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)') - schema2 = parse_schema('any.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)') + schema1 = parse_schema("any.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)") + schema2 = parse_schema("any.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)") self.assertEqual(hash(schema1), hash(schema2)) - schema3 = parse_schema('any.not_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)') + schema3 = parse_schema( + "any.not_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)" + ) self.assertNotEqual(hash(schema2), hash(schema3)) - schema4 = parse_schema('foo(Tensor self, *, int a, Tensor(a!) out) -> Tensor(a!)') + schema4 = parse_schema( + "foo(Tensor self, *, int a, Tensor(a!) 
out) -> Tensor(a!)" + ) self.assertNotEqual(hash(schema2), hash(schema4)) # schemas with different default value, or different kw-only arg, should have different hash - default_val_schema0 = parse_schema('foo(Tensor self, int a = 2) -> Tensor(a!)') - default_val_schema1 = parse_schema('foo(Tensor self, int a = 3) -> Tensor(a!)') - default_val_schema2 = parse_schema('foo(Tensor self, *, int a = 2) -> Tensor(a!)') + default_val_schema0 = parse_schema("foo(Tensor self, int a = 2) -> Tensor(a!)") + default_val_schema1 = parse_schema("foo(Tensor self, int a = 3) -> Tensor(a!)") + default_val_schema2 = parse_schema( + "foo(Tensor self, *, int a = 2) -> Tensor(a!)" + ) self.assertNotEqual(hash(default_val_schema0), hash(default_val_schema1)) self.assertNotEqual(hash(default_val_schema0), hash(default_val_schema2)) # schema with different alias annotation should have different hash - alias_schema = parse_schema('foo(Tensor(a!) self, int a = 2) -> Tensor(a!)') + alias_schema = parse_schema("foo(Tensor(a!) self, int a = 2) -> Tensor(a!)") self.assertNotEqual(hash(default_val_schema0), hash(alias_schema)) - alias_schema2 = parse_schema('foo(Tensor(b!) self, int a = 2) -> Tensor(a!)') + alias_schema2 = parse_schema("foo(Tensor(b!) self, int a = 2) -> Tensor(a!)") self.assertNotEqual(hash(alias_schema), hash(alias_schema2)) # schema with different alias infos - alias_schema3 = parse_schema('foo(Tensor self, *, int a, int b=1, Tensor(a!) out, Tensor(b!) b) -> Tensor(a!)') - alias_schema4 = parse_schema('foo(Tensor self, *, int a, int b=1, Tensor(a!) out, Tensor(b!) b) -> Tensor(b!)') - alias_schema5 = parse_schema('foo(Tensor self, *, int a, int b=1, Tensor(b!) out, Tensor(a!) b) -> Tensor(a!)') + alias_schema3 = parse_schema( + "foo(Tensor self, *, int a, int b=1, Tensor(a!) out, Tensor(b!) b) -> Tensor(a!)" + ) + alias_schema4 = parse_schema( + "foo(Tensor self, *, int a, int b=1, Tensor(a!) out, Tensor(b!) b) -> Tensor(b!)" + ) + alias_schema5 = parse_schema( + "foo(Tensor self, *, int a, int b=1, Tensor(b!) out, Tensor(a!) b) -> Tensor(a!)" + ) self.assertNotEqual(hash(alias_schema3), hash(alias_schema4)) self.assertNotEqual(hash(alias_schema3), hash(alias_schema5)) def test_backward_compatible_structure(self): - old_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> Tensor') + old_schema = parse_schema("any.over(Tensor self, *, Tensor b) -> Tensor") # BC: A new schema without changes. - new_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> Tensor') + new_schema = parse_schema("any.over(Tensor self, *, Tensor b) -> Tensor") self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) # No-BC: A new schema with different name. - new_schema = parse_schema('any_.over(Tensor self, *, Tensor b) -> Tensor') + new_schema = parse_schema("any_.over(Tensor self, *, Tensor b) -> Tensor") self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) # No-BC: A new schema with different overload name. - new_schema = parse_schema('any.other(Tensor self, *, Tensor b) -> Tensor') + new_schema = parse_schema("any.other(Tensor self, *, Tensor b) -> Tensor") self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) # No-BC: A new schema that adds vararg. - new_schema = parse_schema('any.over(Tensor self, *, Tensor b, ...) -> Tensor') + new_schema = parse_schema("any.over(Tensor self, *, Tensor b, ...) -> Tensor") self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) # No-BC: A new schema with different number of outputs. 
- new_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> (Tensor, Tensor)') + new_schema = parse_schema( + "any.over(Tensor self, *, Tensor b) -> (Tensor, Tensor)" + ) self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) def test_backward_compatible_outputs(self): - old_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> Tensor') + old_schema = parse_schema("any.over(Tensor self, *, Tensor b) -> Tensor") # No-BC: A new schema with output becoming of optional type. - new_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> Tensor?') + new_schema = parse_schema("any.over(Tensor self, *, Tensor b) -> Tensor?") self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) # BC: (the opposite case) An schema where the output is not of optional type anymore. self.assertTrue(old_schema.is_backward_compatible_with(new_schema)) # No-BC: A new schema with a different output type. - new_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> int') + new_schema = parse_schema("any.over(Tensor self, *, Tensor b) -> int") self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) # No-BC: A new schema with a different output type. - new_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> Tensor out') + new_schema = parse_schema("any.over(Tensor self, *, Tensor b) -> Tensor out") self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) def test_backward_compatible_arguments(self): - old_schema = parse_schema('any(Tensor self, *, Tensor b, int c) -> Tensor') + old_schema = parse_schema("any(Tensor self, *, Tensor b, int c) -> Tensor") # No-BC: A new schema with less arguments. - new_schema = parse_schema('any(Tensor self, *, Tensor b) -> Tensor') + new_schema = parse_schema("any(Tensor self, *, Tensor b) -> Tensor") self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) # No-BC: A new schema with more arguments, appended, but no default value. - new_schema = parse_schema('any(Tensor self, *, Tensor b, int c, int d) -> Tensor') + new_schema = parse_schema( + "any(Tensor self, *, Tensor b, int c, int d) -> Tensor" + ) self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) # BC: A new schema with more arguments, appended, that have a default value. - new_schema = parse_schema('any(Tensor self, *, Tensor b, int c, int d=1) -> Tensor') + new_schema = parse_schema( + "any(Tensor self, *, Tensor b, int c, int d=1) -> Tensor" + ) self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) # No-BC: A new schema with more arguments, not-appended, that have a default value. - new_schema = parse_schema('any(Tensor self, int d=1, *, Tensor b, int c) -> Tensor') + new_schema = parse_schema( + "any(Tensor self, int d=1, *, Tensor b, int c) -> Tensor" + ) self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) # BC: A new schema where old kwargs becomes positional. - new_schema = parse_schema('any(Tensor self, Tensor b, *, int c) -> Tensor') + new_schema = parse_schema("any(Tensor self, Tensor b, *, int c) -> Tensor") self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) # BC: (the opposite case) A new schema where an old positional argument becomes kwarg. self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) # BC: A new schema where all old kwargs become positional. 
- new_schema = parse_schema('any(Tensor self, Tensor b, int c) -> Tensor') + new_schema = parse_schema("any(Tensor self, Tensor b, int c) -> Tensor") self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) # BC: (the opposite case) A new schema where all old positional arguments become kwarg. self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) # No-BC: A new schema where old kwargs appear in different order. - new_schema = parse_schema('any(Tensor self, *, int c, Tensor b) -> Tensor') + new_schema = parse_schema("any(Tensor self, *, int c, Tensor b) -> Tensor") self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) # BC: A new schema where argument becomes of type optional. - new_schema = parse_schema('any(Tensor self, *, Tensor b, int? c) -> Tensor') + new_schema = parse_schema("any(Tensor self, *, Tensor b, int? c) -> Tensor") self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) # BC: A new schema where argument gains a default value. - new_schema = parse_schema('any(Tensor self, *, Tensor b, int c=1) -> Tensor') + new_schema = parse_schema("any(Tensor self, *, Tensor b, int c=1) -> Tensor") self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) # No-BC: A new schema where argument is "renamed". - new_schema = parse_schema('any(Tensor self, *, Tensor b, int renamed) -> Tensor') + new_schema = parse_schema( + "any(Tensor self, *, Tensor b, int renamed) -> Tensor" + ) self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) # No-BC: A new schema where argument type changes to an incompatible type. - new_schema = parse_schema('any(Tensor self, *, Tensor b, int[] c) -> Tensor') + new_schema = parse_schema("any(Tensor self, *, Tensor b, int[] c) -> Tensor") self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) def test_backward_compatible_with_smart_serialization(self): # cases where out arg is provided - old_schema = parse_schema('foo(Tensor self, *, int a, Tensor(a!) out) -> Tensor(a!)') - new_schema_same_out = parse_schema('foo(Tensor self, *, int a, int b=1, Tensor(a!) out) -> Tensor(a!)') - new_schema_wrong_default = parse_schema('foo(Tensor self, *, int b=1, int a, Tensor(a!) out) -> Tensor(a!)') - new_schema_more_out = parse_schema('foo(Tensor self, *, int a, int b=1, Tensor(a!) out, Tensor(b!) b) -> Tensor(a!)') - new_schema_wrong_pos = parse_schema('foo(Tensor self, *, int a, int b=1, Tensor(b!) b, Tensor(a!) out) -> Tensor(a!)') + old_schema = parse_schema( + "foo(Tensor self, *, int a, Tensor(a!) out) -> Tensor(a!)" + ) + new_schema_same_out = parse_schema( + "foo(Tensor self, *, int a, int b=1, Tensor(a!) out) -> Tensor(a!)" + ) + new_schema_wrong_default = parse_schema( + "foo(Tensor self, *, int b=1, int a, Tensor(a!) out) -> Tensor(a!)" + ) + new_schema_more_out = parse_schema( + "foo(Tensor self, *, int a, int b=1, Tensor(a!) out, Tensor(b!) b) -> Tensor(a!)" + ) + new_schema_wrong_pos = parse_schema( + "foo(Tensor self, *, int a, int b=1, Tensor(b!) b, Tensor(a!) 
out) -> Tensor(a!)" + ) self.assertTrue(new_schema_same_out.is_backward_compatible_with(old_schema)) self.assertTrue(new_schema_more_out.is_backward_compatible_with(old_schema)) - self.assertFalse(new_schema_wrong_default.is_backward_compatible_with(old_schema)) + self.assertFalse( + new_schema_wrong_default.is_backward_compatible_with(old_schema) + ) self.assertFalse(new_schema_wrong_pos.is_backward_compatible_with(old_schema)) # cases where out arg is not provided - old_schema_without_arg = parse_schema('foo(Tensor self, int a, int b=1) -> int') - new_schema_without_arg = parse_schema('foo(Tensor self, int a, int b=1, int c=2) -> int') - new_schema_without_arg_multiple_default = parse_schema('foo(Tensor self, int a, int b=1, int c=2, int d=3) -> int') - new_schema_without_arg_wrong_pos = parse_schema('foo(Tensor self, int a, int c=2, int b=1) -> int') - self.assertTrue(new_schema_without_arg.is_backward_compatible_with(old_schema_without_arg)) - self.assertTrue(new_schema_without_arg_multiple_default.is_backward_compatible_with(old_schema_without_arg)) - self.assertFalse(new_schema_without_arg_wrong_pos.is_backward_compatible_with(old_schema_without_arg)) + old_schema_without_arg = parse_schema("foo(Tensor self, int a, int b=1) -> int") + new_schema_without_arg = parse_schema( + "foo(Tensor self, int a, int b=1, int c=2) -> int" + ) + new_schema_without_arg_multiple_default = parse_schema( + "foo(Tensor self, int a, int b=1, int c=2, int d=3) -> int" + ) + new_schema_without_arg_wrong_pos = parse_schema( + "foo(Tensor self, int a, int c=2, int b=1) -> int" + ) + self.assertTrue( + new_schema_without_arg.is_backward_compatible_with(old_schema_without_arg) + ) + self.assertTrue( + new_schema_without_arg_multiple_default.is_backward_compatible_with( + old_schema_without_arg + ) + ) + self.assertFalse( + new_schema_without_arg_wrong_pos.is_backward_compatible_with( + old_schema_without_arg + ) + ) def test_string_optional_parameter_default_value(self): - schema_a = parse_schema("example::op(str? order=\"NCHW\") -> (Tensor)") + schema_a = parse_schema('example::op(str? 
order="NCHW") -> (Tensor)') schema_b = parse_schema(str(schema_a)) self.assertEqual(schema_a, schema_b) def test_forward_compatible_arguments_without_out(self): - old_schema = parse_schema('any(Tensor self, int a, int b=1) -> Tensor') + old_schema = parse_schema("any(Tensor self, int a, int b=1) -> Tensor") # deleting default arg is FC compatible - new_schema = parse_schema('any(Tensor self, int a) -> Tensor') + new_schema = parse_schema("any(Tensor self, int a) -> Tensor") is_fc, _ = new_schema.check_forward_compatible_with(old_schema) self.assertTrue(is_fc) # adding default arg is FC compatible - new_schema = parse_schema('any(Tensor self, int a, int b=1, int c=1) -> Tensor') + new_schema = parse_schema("any(Tensor self, int a, int b=1, int c=1) -> Tensor") is_fc, _ = new_schema.check_forward_compatible_with(old_schema) self.assertTrue(is_fc) # adding default arg with container type is NOT FC compatible - new_schema = parse_schema('any(Tensor self, int a, int b=1, int[2] c=1) -> Tensor') + new_schema = parse_schema( + "any(Tensor self, int a, int b=1, int[2] c=1) -> Tensor" + ) is_fc, reason = new_schema.check_forward_compatible_with(old_schema) self.assertFalse(is_fc) - self.assertEqual(reason, "Function schema is not forward compatible since the new argument" - " \'c\' of type int[] has a container type as its default value.") + self.assertEqual( + reason, + "Function schema is not forward compatible since the new argument" + " 'c' of type int[] has a container type as its default value.", + ) # updating the default value of a default arg is NOT FC compatible - new_schema = parse_schema('any(Tensor self, int a, int b=4) -> Tensor') + new_schema = parse_schema("any(Tensor self, int a, int b=4) -> Tensor") is_fc, reason = new_schema.check_forward_compatible_with(old_schema) self.assertFalse(is_fc) - self.assertEqual(reason, "\'b\' is not forward compatible with the older version of the schema") + self.assertEqual( + reason, "'b' is not forward compatible with the older version of the schema" + ) # updating the arg name of a default arg is NOT FC compatible - new_schema = parse_schema('any(Tensor self, int a, int c=1) -> Tensor') + new_schema = parse_schema("any(Tensor self, int a, int c=1) -> Tensor") is_fc, reason = new_schema.check_forward_compatible_with(old_schema) self.assertFalse(is_fc) - self.assertEqual(reason, "\'c\' is not forward compatible with the older version of the schema") + self.assertEqual( + reason, "'c' is not forward compatible with the older version of the schema" + ) # not adding default arg in the end is NOT FC compatible - new_schema = parse_schema('any(Tensor self, int a, int c=1, int b=1) -> Tensor') + new_schema = parse_schema("any(Tensor self, int a, int c=1, int b=1) -> Tensor") is_fc, reason = new_schema.check_forward_compatible_with(old_schema) self.assertFalse(is_fc) - self.assertEqual(reason, "\'c\' is not forward compatible with the older version of the schema") + self.assertEqual( + reason, "'c' is not forward compatible with the older version of the schema" + ) # making default arg into positional arg is NOT FC compatible - new_schema = parse_schema('any(Tensor self, int a, int b) -> Tensor') + new_schema = parse_schema("any(Tensor self, int a, int b) -> Tensor") is_fc, reason = new_schema.check_forward_compatible_with(old_schema) self.assertFalse(is_fc) - self.assertEqual(reason, "\'b\' is not forward compatible with the older version of the schema") + self.assertEqual( + reason, "'b' is not forward compatible with the older version of the schema" + 
) # making positional arg into default arg is NOT FC compatible - new_schema = parse_schema('any(Tensor self, int a=1, int b=1) -> Tensor') + new_schema = parse_schema("any(Tensor self, int a=1, int b=1) -> Tensor") is_fc, reason = new_schema.check_forward_compatible_with(old_schema) self.assertFalse(is_fc) - self.assertEqual(reason, "\'a\' is not forward compatible with the older version of the schema") + self.assertEqual( + reason, "'a' is not forward compatible with the older version of the schema" + ) def test_forward_compatible_arguments_real_use_case(self): # this change introduced forward incompatibility in the past - old_slice_schema = parse_schema('slice(Tensor(a) self, int dim=0, int start=0, int end=0, int step=1) -> Tensor(a)') - new_slice_schema = parse_schema('slice(Tensor(a) self, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor(a)') + old_slice_schema = parse_schema( + "slice(Tensor(a) self, int dim=0, int start=0, int end=0, int step=1) -> Tensor(a)" + ) + new_slice_schema = parse_schema( + "slice(Tensor(a) self, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor(a)" + ) is_fc, reason = new_slice_schema.check_forward_compatible_with(old_slice_schema) self.assertFalse(is_fc) - self.assertEqual(reason, "\'start\' is not forward compatible with the older version of the schema") + self.assertEqual( + reason, + "'start' is not forward compatible with the older version of the schema", + ) def test_forward_compatible_arguments_with_out(self): - old_schema = parse_schema('any(Tensor self, *, int a, int b=1, Tensor(a!) out) -> Tensor(a!)') - new_schema = parse_schema('any(Tensor self, *, int a, Tensor(a!) out) -> Tensor(a!)') + old_schema = parse_schema( + "any(Tensor self, *, int a, int b=1, Tensor(a!) out) -> Tensor(a!)" + ) + new_schema = parse_schema( + "any(Tensor self, *, int a, Tensor(a!) out) -> Tensor(a!)" + ) is_fc, _ = new_schema.check_forward_compatible_with(old_schema) self.assertTrue(is_fc) - new_schema = parse_schema('any(Tensor self, *, int a, int b=1, int c=1, Tensor(a!) out) -> Tensor(a!)') + new_schema = parse_schema( + "any(Tensor self, *, int a, int b=1, int c=1, Tensor(a!) out) -> Tensor(a!)" + ) is_fc, _ = new_schema.check_forward_compatible_with(old_schema) self.assertTrue(is_fc) - new_schema = parse_schema('any(Tensor self, *, int a, Tensor(d!) d, int b=1, Tensor(a!) out) -> Tensor(a!)') + new_schema = parse_schema( + "any(Tensor self, *, int a, Tensor(d!) d, int b=1, Tensor(a!) out) -> Tensor(a!)" + ) is_fc, reason = new_schema.check_forward_compatible_with(old_schema) self.assertFalse(is_fc) - self.assertEqual(reason, "Function schema should have the same number of out arguments") + self.assertEqual( + reason, "Function schema should have the same number of out arguments" + ) def test_schema_error(self): - with self.assertRaisesRegex(RuntimeError, r"schemas with vararg \(...\) can't have default value args"): + with self.assertRaisesRegex( + RuntimeError, r"schemas with vararg \(...\) can't have default value args" + ): schema = parse_schema("any.foo(int arg1, int arg2=0, ...)") def test_tensor_list_alias_annotation_properly_parsed(self): - schema_str = 'foo(Tensor self, *, Tensor(a!)[] out) -> ()' + schema_str = "foo(Tensor self, *, Tensor(a!)[] out) -> ()" schema = parse_schema(schema_str) self.assertTrue(schema.arguments[-1].alias_info.is_write) self.assertEqual(str(schema), schema_str) def test_tensor_option_arguments_properly_parsed(self): - schema_str = '_to_copy(Tensor self, *, ScalarType? dtype=None, Layout? 
layout=None, Device? device=None, ' \ - 'bool? pin_memory=None, bool non_blocking=False, MemoryFormat? memory_format=None) -> Tensor' + schema_str = ( + "_to_copy(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, " + "bool? pin_memory=None, bool non_blocking=False, MemoryFormat? memory_format=None) -> Tensor" + ) schema = parse_schema(schema_str) # fake type of MemoryFormat? is int? self.assertEqual(schema.arguments[-1].type.str(), "int?") @@ -237,7 +327,7 @@ def test_tensor_option_arguments_properly_parsed(self): self.assertEqual(str(schema), schema_str) def test_sym_int_argument_properly_parsed(self): - schema_str = 'sym_size.int(Tensor self, int dim) -> SymInt' + schema_str = "sym_size.int(Tensor self, int dim) -> SymInt" schema = parse_schema(schema_str) # fake type of SymInt is int self.assertEqual(schema.returns[-1].type.str(), "int") @@ -247,5 +337,5 @@ def test_sym_int_argument_properly_parsed(self): self.assertEqual(str(schema), schema_str) -if __name__ == '__main__': +if __name__ == "__main__": run_tests() diff --git a/test/test_functional_autograd_benchmark.py b/test/test_functional_autograd_benchmark.py index 57a67ccead89b..b0141479dd38a 100644 --- a/test/test_functional_autograd_benchmark.py +++ b/test/test_functional_autograd_benchmark.py @@ -1,14 +1,21 @@ # Owner(s): ["module: autograd"] -from torch.testing._internal.common_utils import TestCase, run_tests, slowTest, IS_WINDOWS +import os import subprocess import tempfile -import os import unittest +from torch.testing._internal.common_utils import ( + IS_WINDOWS, + run_tests, + slowTest, + TestCase, +) + PYTORCH_COLLECT_COVERAGE = bool(os.environ.get("PYTORCH_COLLECT_COVERAGE")) + # This is a very simple smoke test for the functional autograd benchmarking script. class TestFunctionalAutogradBenchmark(TestCase): def _test_runner(self, model, disable_gpu=False): @@ -17,18 +24,20 @@ def _test_runner(self, model, disable_gpu=False): # is not allowed to open it again. As this is a simple smoke test, we choose for now # not to run this on windows and keep the code here simple. 
with tempfile.NamedTemporaryFile() as out_file: - cmd = ['python3', - '../benchmarks/functional_autograd_benchmark/functional_autograd_benchmark.py'] + cmd = [ + "python3", + "../benchmarks/functional_autograd_benchmark/functional_autograd_benchmark.py", + ] # Only run the warmup - cmd += ['--num-iters', '0'] + cmd += ["--num-iters", "0"] # Only run the vjp task (fastest one) - cmd += ['--task-filter', 'vjp'] + cmd += ["--task-filter", "vjp"] # Only run the specified model - cmd += ['--model-filter', model] + cmd += ["--model-filter", model] # Output file - cmd += ['--output', out_file.name] + cmd += ["--output", out_file.name] if disable_gpu: - cmd += ['--gpu', '-1'] + cmd += ["--gpu", "-1"] res = subprocess.run(cmd) @@ -37,20 +46,34 @@ def _test_runner(self, model, disable_gpu=False): out_file.seek(0, os.SEEK_END) self.assertTrue(out_file.tell() > 0) - - @unittest.skipIf(IS_WINDOWS, "NamedTemporaryFile on windows does not have all the features we need.") - @unittest.skipIf(PYTORCH_COLLECT_COVERAGE, "Can deadlocks with gcov, see https://github.com/pytorch/pytorch/issues/49656") + @unittest.skipIf( + IS_WINDOWS, + "NamedTemporaryFile on windows does not have all the features we need.", + ) + @unittest.skipIf( + PYTORCH_COLLECT_COVERAGE, + "Can deadlocks with gcov, see https://github.com/pytorch/pytorch/issues/49656", + ) def test_fast_tasks(self): - fast_tasks = ['resnet18', 'ppl_simple_reg', 'ppl_robust_reg', 'wav2letter', - 'transformer', 'multiheadattn'] + fast_tasks = [ + "resnet18", + "ppl_simple_reg", + "ppl_robust_reg", + "wav2letter", + "transformer", + "multiheadattn", + ] for task in fast_tasks: self._test_runner(task) @slowTest - @unittest.skipIf(IS_WINDOWS, "NamedTemporaryFile on windows does not have all the features we need.") + @unittest.skipIf( + IS_WINDOWS, + "NamedTemporaryFile on windows does not have all the features we need.", + ) def test_slow_tasks(self): - slow_tasks = ['fcn_resnet', 'detr'] + slow_tasks = ["fcn_resnet", "detr"] # deepspeech is voluntarily excluded as it takes too long to run without # proper tuning of the number of threads it should use. 
@@ -59,5 +82,5 @@ def test_slow_tasks(self): self._test_runner(task, disable_gpu=True) -if __name__ == '__main__': +if __name__ == "__main__": run_tests() diff --git a/test/test_functional_optim.py b/test/test_functional_optim.py index da3d40d305e34..5e2a1e67e0159 100644 --- a/test/test_functional_optim.py +++ b/test/test_functional_optim.py @@ -1,15 +1,16 @@ # Owner(s): ["oncall: distributed"] -from typing import List, Optional, Tuple import unittest +from typing import List, Optional, Tuple import torch import torch.distributed import torch.nn as nn import torch.nn.functional as F from torch import Tensor -from torch.optim import SGD, Adam, AdamW -from torch.testing._internal.common_utils import TestCase, run_tests +from torch.optim import Adam, AdamW, SGD +from torch.testing._internal.common_utils import run_tests, TestCase + class MyModule(torch.nn.Module): def __init__(self): @@ -21,6 +22,7 @@ def __init__(self): def forward(self, t1): return self.lin2(F.relu(self.lin1(t1))) + # dummy class to showcase custom optimizer registration with functional wrapper class MyDummyFnOptimizer: def __init__( @@ -32,7 +34,6 @@ def __init__( weight_decay: float = 0.0, _allow_empty_param_list: bool = False, ): - if not 0.0 <= lr: raise ValueError(f"Invalid learning rate: {lr}") if not 0.0 <= eps: @@ -58,17 +59,26 @@ def __init__( def step_param(self, param: Tensor, grad: Optional[Tensor]): # call the custom optimizer step_param implementation with torch.no_grad(): - raise RuntimeError("MyDummyFnOptimizer does not support step_param() as of now") + raise RuntimeError( + "MyDummyFnOptimizer does not support step_param() as of now" + ) def step(self, gradients: List[Optional[Tensor]]): # call the custom optimizer step implementation with torch.no_grad(): raise RuntimeError("MyDummyFnOptimizer does not support step() as of now") + if torch.distributed.is_available(): - from torch.distributed.optim.utils import functional_optim_map, register_functional_optim + from torch.distributed.optim.utils import ( + functional_optim_map, + register_functional_optim, + ) + -@unittest.skipIf(not torch.distributed.is_available(), "These are testing distributed functions") +@unittest.skipIf( + not torch.distributed.is_available(), "These are testing distributed functions" +) class TestFunctionalOptimParity(TestCase): def _validate_parameters(self, params_1, params_2): for p1, p2 in zip(params_1, params_2): diff --git a/test/test_functionalization_of_rng_ops.py b/test/test_functionalization_of_rng_ops.py index b2ac62e4f2786..bba22ff34a0b0 100644 --- a/test/test_functionalization_of_rng_ops.py +++ b/test/test_functionalization_of_rng_ops.py @@ -1,36 +1,34 @@ # Owner(s): ["oncall: pt2"] +import functools import sys import unittest -import torch -from torch.testing._internal.common_utils import ( - TestCase, - run_tests, -) - -from torch.testing._internal.common_device_type import instantiate_device_type_tests, dtypes -from functorch.compile import aot_function, nop, min_cut_rematerialization_partition from unittest.mock import patch -import functools -import torch.utils.checkpoint +import torch +import torch.utils.checkpoint +from functorch.compile import aot_function, min_cut_rematerialization_partition, nop -from torch.testing._internal.common_utils import ( - IS_CI, - IS_WINDOWS, +from torch.testing._internal.common_device_type import ( + dtypes, + instantiate_device_type_tests, ) +from torch.testing._internal.common_utils import IS_CI, IS_WINDOWS, run_tests, TestCase + if IS_WINDOWS and IS_CI: - sys.stderr.write( - 
"torch.compile not supported on windows" - ) + sys.stderr.write("torch.compile not supported on windows") if __name__ == "__main__": sys.exit(0) raise unittest.SkipTest("torch.compile not supported on windows") + def count_philox_rand(gm, args, freq): - assert [node.target for node in gm.graph.nodes].count(torch.ops.rngprims.philox_rand.default) == freq + assert [node.target for node in gm.graph.nodes].count( + torch.ops.rngprims.philox_rand.default + ) == freq return gm + class TestFunctionalizationRngOps(TestCase): @dtypes(torch.float32) @patch.object(torch._functorch.config, "functionalize_rng_ops", True) @@ -72,8 +70,6 @@ def fn(x): self.assertEqual(ref, res) - - @dtypes(torch.float32) @patch.object(torch._functorch.config, "functionalize_rng_ops", True) def test_rand_like_dynamic_bwd(self, dtype, device): @@ -96,7 +92,6 @@ def fn(x): self.assertEqual(ref, res) - @dtypes(torch.float32) @patch.object(torch._functorch.config, "functionalize_rng_ops", True) def test_rand(self, dtype, device): @@ -134,7 +129,7 @@ def forward(ctx, x): @staticmethod def backward(ctx, grad_out): - x, = ctx.saved_tensors + (x,) = ctx.saved_tensors return grad_out * torch.rand_like(grad_out) * torch.cos(x) custom = Custom.apply @@ -174,7 +169,7 @@ def forward(ctx, x): @staticmethod def backward(ctx, grad_out): - x, = ctx.saved_tensors + (x,) = ctx.saved_tensors return grad_out * torch.rand_like(grad_out) * torch.cos(x) class CustomOp2(torch.autograd.Function): @@ -186,10 +181,9 @@ def forward(ctx, x): @staticmethod def backward(ctx, grad_out): - x, = ctx.saved_tensors + (x,) = ctx.saved_tensors return grad_out * torch.rand_like(grad_out) * torch.rand_like(x) - custom_op1 = CustomOp1.apply custom_op2 = CustomOp2.apply @@ -210,7 +204,6 @@ def aot_fn(x): b = a.sin() return aot_custom_op2(b) - for seed in range(10): torch.cuda.manual_seed(seed) x = torch.rand(*shape, device=device, dtype=dtype, requires_grad=True) @@ -265,7 +258,6 @@ def fn(x): a = torch.sin(a) return a - x = torch.rand(*shape, device=device, dtype=dtype, requires_grad=True) x_clone = x.clone().detach().requires_grad_(True) @@ -277,7 +269,12 @@ def fn(x): torch.cuda.manual_seed(123) fwd_compiler = functools.partial(count_philox_rand, freq=2) bwd_compiler = functools.partial(count_philox_rand, freq=0) - aot_custom = aot_function(fn, fwd_compiler, bwd_compiler, partition_fn=min_cut_rematerialization_partition) + aot_custom = aot_function( + fn, + fwd_compiler, + bwd_compiler, + partition_fn=min_cut_rematerialization_partition, + ) # aot_custom = aot_function(fn, fwd_compiler, bwd_compiler) res = aot_custom(x_clone) res.sum().backward() diff --git a/test/test_optim.py b/test/test_optim.py index 13484b1d7876c..f875b4ed669ee 100644 --- a/test/test_optim.py +++ b/test/test_optim.py @@ -911,8 +911,6 @@ def test_fused_large_tensor(self, device, dtype, optim_info): @onlyCUDA @optims([optim for optim in optim_db if "fused" in optim.supported_impls], dtypes=[torch.float32]) def test_fused_does_not_step_if_foundinf(self, device, dtype, optim_info): - if device not in optim_info.supports_fused_on: - self.skipTest(f"{device} is not supported for fused on {optim_info.optim_cls.__name__}") optim_cls = optim_info.optim_cls optim_inputs = optim_info.optim_inputs_func(device=device) num_params = 5 @@ -942,12 +940,9 @@ def test_cpu_load_state_dict(self, device, dtype, impl, optim_info): # Since this is a unit test, it is more expedient to simulate what the state_dict # would look like, which is basically CPU tensors with fused/capturable flag = True. 
optim_cls = optim_info.optim_cls - opt_name = optim_cls.__name__ - if opt_name in ("SGD", "Adagrad", ) and impl == "capturable": - # Capturable SGD/Adagrad does not exist + if optim_cls.__name__ == "SGD" and impl == "capturable": + # Capturable SGD does not exist self.skipTest("SGD does not currently support capturable") - if impl == "fused" and device not in optim_info.supports_fused_on: - self.skipTest(f"{device} is not supported for fused on {opt_name}") cpu_optim_inputs = optim_info.optim_inputs_func(device="cpu") for optim_input in cpu_optim_inputs: @@ -1323,8 +1318,6 @@ def closure(): return closure_loss if optim_info.step_requires_closure else None for optim_input in cpu_optim_inputs: - if "fused" in optim_input.kwargs and "cuda" not in optim_info.supports_fused_on: - self.skipTest(f"cuda is not supported for fused on {optim_cls.__name__}") params = [Parameter(torch.randn(2, 3, device="cpu", dtype=dtype)) for _ in range(2)] for p in params: p.grad = torch.randn_like(p) diff --git a/test/test_serialization.py b/test/test_serialization.py index 2f7e6babdecfb..49f8880885ec4 100644 --- a/test/test_serialization.py +++ b/test/test_serialization.py @@ -26,7 +26,7 @@ from torch.testing._internal.common_utils import ( IS_FILESYSTEM_UTF8_ENCODING, TemporaryDirectoryName, - TestCase, IS_WINDOWS, TEST_DILL, run_tests, download_file, BytesIOContext, TemporaryFileName, + TestCase, IS_FBCODE, IS_WINDOWS, TEST_DILL, run_tests, download_file, BytesIOContext, TemporaryFileName, parametrize, instantiate_parametrized_tests, AlwaysWarnTypedStorageRemoval, serialTest) from torch.testing._internal.common_device_type import instantiate_device_type_tests from torch.testing._internal.common_dtype import all_types_and_complex_and @@ -4000,6 +4000,51 @@ def test_serialization_dtype(self, dtype, weights_only): y['even'][0] = torch.tensor(-0.25, dtype=dtype) self.assertEqual(y['x'][:2].to(dtype=torch.float32), torch.tensor([-0.25, 0.25])) + @parametrize('filename', (True, False)) + @unittest.skipIf(IS_WINDOWS, "NamedTemporaryFile on windows") + @unittest.skipIf(IS_FBCODE, "miniz version differs between fbcode and oss") + def test_filewriter_metadata_writing(self, filename): + sd = torch.nn.Linear(3, 5).state_dict() + weight_nbytes = sd['weight'].untyped_storage().nbytes() + bias_nbytes = sd['bias'].untyped_storage().nbytes() + # TemporaryFileName will give a string + # NamedTemporaryFile will be treated as a buffer + file_creation_func = TemporaryFileName if filename else tempfile.NamedTemporaryFile + + with file_creation_func() as f, file_creation_func() as g: + # save state_dict in f + torch.save(sd, f) + if not filename: + f.seek(0) + # extract 'data.pkl' for use in our fake checkpoint + with torch.serialization._open_file_like(f, 'rb') as opened_file: + with torch.serialization._open_zipfile_reader(opened_file) as zip_file: + data_file = io.BytesIO(zip_file.get_record('data.pkl')) + data_0_offset = zip_file.get_record_offset('data/0') + data_1_offset = zip_file.get_record_offset('data/1') + + # write nulls for 'data/0' and 'data/1' + with open(f if filename else f.name, 'rb+') as opened_f: + opened_f.seek(data_0_offset) + opened_f.write(b'0' * weight_nbytes) + opened_f.seek(data_1_offset) + opened_f.write(b'0' * bias_nbytes) + + with torch.serialization._open_zipfile_writer(g) as zip_file: + data_value = data_file.getvalue() + zip_file.write_record('data.pkl', data_value, len(data_value)) + zip_file.write_record('byteorder', sys.byteorder, len(sys.byteorder)) + # Only write metadata for storages + 
zip_file.write_record_metadata('data/0', weight_nbytes) + zip_file.write_record_metadata('data/1', bias_nbytes) + + if not filename: + f.seek(0) + g.seek(0) + sd_loaded = torch.load(g) + sd_loaded_ref = torch.load(f) + self.assertEqual(sd_loaded, sd_loaded_ref) + def run(self, *args, **kwargs): with serialization_method(use_zip=True): return super().run(*args, **kwargs) diff --git a/test/test_utils.py b/test/test_utils.py index b151b5141a280..66d66b8874f17 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1,37 +1,52 @@ # Owner(s): ["module: unknown"] -import sys import os +import random import re import shutil -import random import subprocess +import sys import tempfile -import traceback import textwrap +import traceback import unittest import warnings -from typing import Any, List, Dict +from typing import Any, Dict, List + import torch +import torch.cuda import torch.nn as nn +import torch.utils.cpp_extension import torch.utils.data -from torch.utils.data import DataLoader +from torch.autograd._functions.utils import check_onnx_broadcast +from torch.onnx.symbolic_opset9 import _prepare_onnx_paddings from torch.testing._internal.common_cuda import TEST_MULTIGPU from torch.testing._internal.common_device_type import ( - ops, - onlyCPU, instantiate_device_type_tests, + onlyCPU, + ops, ) from torch.testing._internal.common_methods_invocations import op_db -import torch.cuda -from torch.utils._pytree import tree_any, tree_all_only -from torch.utils.checkpoint import checkpoint, checkpoint_sequential, get_device_states, _infer_device_type +from torch.testing._internal.common_utils import ( # type: ignore[attr-defined] + IS_FBCODE, + IS_SANDCASTLE, + IS_WINDOWS, + load_tests, +) from torch.utils._device import set_device -from torch.utils._traceback import report_compile_source_on_error, format_traceback_short, CapturedTraceback -import torch.utils.cpp_extension -from torch.autograd._functions.utils import check_onnx_broadcast -from torch.onnx.symbolic_opset9 import _prepare_onnx_paddings -from torch.testing._internal.common_utils import load_tests, IS_FBCODE, IS_SANDCASTLE, IS_WINDOWS # type: ignore[attr-defined] +from torch.utils._pytree import tree_all_only, tree_any +from torch.utils._traceback import ( + CapturedTraceback, + format_traceback_short, + report_compile_source_on_error, +) +from torch.utils.checkpoint import ( + _infer_device_type, + checkpoint, + checkpoint_sequential, + get_device_states, +) +from torch.utils.data import DataLoader # load_tests from torch.testing._internal.common_utils is used to automatically filter tests for # sharding on sandcastle. This line silences flake warnings @@ -40,11 +55,10 @@ HAS_CUDA = torch.cuda.is_available() -from torch.testing._internal.common_utils import TestCase, run_tests +from torch.testing._internal.common_utils import run_tests, TestCase class RandomDatasetMock(torch.utils.data.Dataset): - def __getitem__(self, index): return torch.tensor([torch.rand(1).item(), random.uniform(0, 1)]) @@ -53,7 +67,6 @@ def __len__(self): class TestCheckpoint(TestCase): - # This runs checkpoint_sequential on each of the nets in # module_lists_to_compare, and compares them against the uncheckpointed model. # To compare, it checks outputs as well as input gradients and parameter gradients @@ -101,9 +114,7 @@ def _check_checkpoint_sequential( # Test whether checkpoint is being triggered or not. 
For this, we check # the number of times forward pass happens def test_checkpoint_trigger(self): - class Net(nn.Module): - def __init__(self): super().__init__() self.counter = 0 @@ -112,7 +123,7 @@ def forward(self, input_var): self.counter += 1 # For reentrant, need to have autograd actually # pack a tensor to trigger recomp - ret = input_var * torch.tensor(2.) + ret = input_var * torch.tensor(2.0) return ret # checkpointed @@ -122,13 +133,15 @@ def forward(self, input_var): for m in modules: self.assertEqual(m.counter, 0) input_var = torch.randn(3, 4, requires_grad=True) - out = checkpoint_sequential(modules, 2, input_var, use_reentrant=use_reentrant) + out = checkpoint_sequential( + modules, 2, input_var, use_reentrant=use_reentrant + ) for m in modules: self.assertEqual(m.counter, 1) out.sum().backward() - for m in modules[:(len(modules) // 2)]: + for m in modules[: (len(modules) // 2)]: self.assertEqual(m.counter, 2) - for m in modules[(len(modules) // 2):]: + for m in modules[(len(modules) // 2) :]: self.assertEqual(m.counter, 1) def test_checkpoint_valid(self): @@ -138,7 +151,7 @@ def test_checkpoint_valid(self): nn.Linear(50, 20), nn.ReLU(), nn.Linear(20, 5), - nn.ReLU() + nn.ReLU(), ) input_var = torch.randn(1, 100, requires_grad=True) @@ -147,20 +160,33 @@ def test_checkpoint_valid(self): chunks = 2 modules = list(model.children()) out = checkpoint_sequential(modules, chunks, input_var, use_reentrant=True) - with self.assertRaisesRegex(RuntimeError, "torch.utils.checkpoint is incompatible"): + with self.assertRaisesRegex( + RuntimeError, "torch.utils.checkpoint is incompatible" + ): torch.autograd.grad( - outputs=[out], grad_outputs=[torch.ones(1, 5)], inputs=[input_var], create_graph=True + outputs=[out], + grad_outputs=[torch.ones(1, 5)], + inputs=[input_var], + create_graph=True, ) # works with use_reentrant=False, and grads are the same out = model(input_var) grads_no_checkpoint = torch.autograd.grad( - outputs=[out], grad_outputs=[torch.ones(1, 5)], inputs=[input_var], create_graph=True, + outputs=[out], + grad_outputs=[torch.ones(1, 5)], + inputs=[input_var], + create_graph=True, + ) + out_checkpoint = checkpoint_sequential( + modules, chunks, input_var, use_reentrant=False ) - out_checkpoint = checkpoint_sequential(modules, chunks, input_var, use_reentrant=False) # check outputs are the same self.assertEqual(out_checkpoint, out) grads_checkpoint = torch.autograd.grad( - outputs=[out_checkpoint], grad_outputs=[torch.ones(1, 5)], inputs=[input_var], create_graph=True, + outputs=[out_checkpoint], + grad_outputs=[torch.ones(1, 5)], + inputs=[input_var], + create_graph=True, ) self.assertEqual(grads_no_checkpoint, grads_checkpoint) @@ -173,7 +199,7 @@ def test_checkpoint(self): nn.Linear(50, 20), nn.ReLU(), nn.Linear(20, 5), - nn.ReLU() + nn.ReLU(), ) # Compare uncheckpointed model with its checkpointed counterparts @@ -247,7 +273,7 @@ def forward(self): def test_checkpoint_rng_cpu(self): for _ in range(5): - inp = torch.randn(20000, device='cpu').requires_grad_() + inp = torch.randn(20000, device="cpu").requires_grad_() phase1 = torch.nn.Dropout() phase2 = torch.nn.Dropout() @@ -272,10 +298,10 @@ def run_fn(input): self.assertEqual(grad_with_checkpointing, grad_no_checkpointing) - @unittest.skipIf(not HAS_CUDA, 'No CUDA') + @unittest.skipIf(not HAS_CUDA, "No CUDA") def test_checkpoint_rng_cuda(self): for _ in range(5): - inp = torch.randn(20000, device='cuda').requires_grad_() + inp = torch.randn(20000, device="cuda").requires_grad_() phase1 = torch.nn.Dropout() phase2 = 
torch.nn.Dropout() @@ -300,9 +326,9 @@ def run_fn(input): self.assertEqual(grad_with_checkpointing, grad_no_checkpointing) - @unittest.skipIf(not HAS_CUDA, 'No CUDA') + @unittest.skipIf(not HAS_CUDA, "No CUDA") def test_checkpoint_not_preserve_rng_state_and_without_reentrant(self): - inp = torch.randn(2, device='cuda').requires_grad_() + inp = torch.randn(2, device="cuda").requires_grad_() layer = torch.nn.Dropout() def run_fn(input): @@ -312,9 +338,7 @@ def run_fn(input): out.sum().backward() # This should run without error - def test_checkpoint_non_tensor(self): - def run_fn(tensor1, tensor2): if tensor2 is None: return tensor1 @@ -349,7 +373,9 @@ def foo(t1, t2, scale, t3): res[1].sum().backward(retain_graph=True) res[4].sum().backward(retain_graph=True) res[6].sum().backward() - with self.assertRaisesRegex(RuntimeError, "Trying to backward through the graph a second time"): + with self.assertRaisesRegex( + RuntimeError, "Trying to backward through the graph a second time" + ): res[6].sum().backward() t1_grad = t1.grad t2_grad = t2.grad @@ -387,6 +413,7 @@ def test_checkpoint_partial_grad(self): def run_fn(tensor1, tensor2): # tensor 2 is used for other application logic return tensor1, tensor2 + input_var = torch.randn(1, 4, requires_grad=True) input_var2 = torch.randn(1, 4, requires_grad=False) out = checkpoint(run_fn, input_var, input_var2, use_reentrant=True) @@ -394,11 +421,12 @@ def run_fn(tensor1, tensor2): def run_fn2(tensor1, tensor2): return tensor1 + input_var = torch.randn(1, 4, requires_grad=False) input_var2 = torch.randn(1, 4, requires_grad=True) with self.assertRaisesRegex( RuntimeError, - r"none of output has requires_grad=True, this checkpoint\(\) is not necessary" + r"none of output has requires_grad=True, this checkpoint\(\) is not necessary", ): out = checkpoint(run_fn2, input_var, input_var2, use_reentrant=True) out.sum().backward() @@ -430,13 +458,13 @@ def hook(_unused): def test_fn(x): # The main property of this function is that it contains multiple # operations that save gradients in a chain. 
- x = x ** 2 + x = x**2 track(x, 2) - x = x ** 2 + x = x**2 track(x, 1) - x = x ** 2 + x = x**2 track(x, 0) - x = x ** 2 + x = x**2 return x.sum() fn(test_fn) @@ -450,20 +478,32 @@ def test_fn(x): non_retain_stats = _do_test(lambda fn: fn(x).backward(), True) # In a retain_grad backward, buffers get preserved - _unused_retain_stats = _do_test(lambda fn: fn(x).backward(retain_graph=True), False) + _unused_retain_stats = _do_test( + lambda fn: fn(x).backward(retain_graph=True), False + ) # In a regular backward with checkpoint, buffers get eagerly freed - checkpoint_non_retain_stats = _do_test(lambda fn: checkpoint(fn, x, use_reentrant=False).backward(), True) + checkpoint_non_retain_stats = _do_test( + lambda fn: checkpoint(fn, x, use_reentrant=False).backward(), True + ) # In a retain_grad backward with checkpoint, buffers get eagerly freed - checkpoint_retain_stats = _do_test(lambda fn: checkpoint(fn, x, use_reentrant=False).backward(retain_graph=True), True) + checkpoint_retain_stats = _do_test( + lambda fn: checkpoint(fn, x, use_reentrant=False).backward( + retain_graph=True + ), + True, + ) self.assertEqual(non_retain_stats, checkpoint_non_retain_stats) self.assertEqual(non_retain_stats, checkpoint_retain_stats) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") def test_get_device_states_recursive(self): - inp = {'foo' : torch.rand(10, device="cuda:0"), 'bar': [torch.rand(10, device="cuda:1")]} + inp = { + "foo": torch.rand(10, device="cuda:0"), + "bar": [torch.rand(10, device="cuda:1")], + } device_ids, device_states = get_device_states(inp) self.assertEqual(2, len(device_ids)) self.assertEqual(2, len(device_states)) @@ -473,7 +513,7 @@ def test_get_device_states_recursive(self): self.assertTrue(isinstance(device_states[1], torch.Tensor)) def test_infer_device_state_recursive_meta(self): - inp = {'foo' : torch.rand(10, device="meta")} + inp = {"foo": torch.rand(10, device="meta")} device_type = _infer_device_type(inp) self.assertEqual("meta", device_type) @@ -481,19 +521,28 @@ def test_infer_device_state_recursive_meta(self): def test_infer_device_state_recursive_multi_cuda(self): # Check that no warning is issued for either cuda:0, cuda:1 or # cuda:0, cuda:0 cases since they are both the same device type - inp = {'foo' : torch.rand(10, device="cuda:0"), 'bar': [torch.rand(10, device="cuda:1")]} + inp = { + "foo": torch.rand(10, device="cuda:0"), + "bar": [torch.rand(10, device="cuda:1")], + } with warnings.catch_warnings(): warnings.simplefilter("error") device_type = _infer_device_type(inp) self.assertEqual("cuda", device_type) - inp = {'foo' : torch.rand(10, device="cuda:0"), 'bar': [torch.rand(10, device="cuda:0")]} + inp = { + "foo": torch.rand(10, device="cuda:0"), + "bar": [torch.rand(10, device="cuda:0")], + } with warnings.catch_warnings(): warnings.simplefilter("error") device_type = _infer_device_type(inp) self.assertEqual("cuda", device_type) # Check that a warning is issued for cuda:0, meta and that it includes # device type information - inp = {'foo' : torch.rand(10, device="cuda:0"), 'bar': [torch.rand(10, device="meta")]} + inp = { + "foo": torch.rand(10, device="cuda:0"), + "bar": [torch.rand(10, device="meta")], + } with warnings.catch_warnings(record=True) as w: device_type = _infer_device_type(inp) self.assertEqual("cuda", device_type) @@ -503,7 +552,7 @@ def test_infer_device_state_recursive_multi_cuda(self): "Tensor arguments, excluding CPU tensors, are detected on at least two types of devices" in warning_msg ) - self.assertTrue("Device types: 
[\'cuda\', \'meta\']" in warning_msg) + self.assertTrue("Device types: ['cuda', 'meta']" in warning_msg) self.assertTrue("first device type: cuda" in warning_msg) @@ -517,11 +566,13 @@ def setUp(self): def test_random_seed(self): def run(): - dataloader = torch.utils.data.DataLoader(RandomDatasetMock(), - batch_size=2, - num_workers=4, - shuffle=True, - timeout=self.MAX_TIMEOUT_IN_SECOND) + dataloader = torch.utils.data.DataLoader( + RandomDatasetMock(), + batch_size=2, + num_workers=4, + shuffle=True, + timeout=self.MAX_TIMEOUT_IN_SECOND, + ) return next(iter(dataloader)) torch.manual_seed(2018) @@ -534,37 +585,47 @@ def test_single_keep(self): # self.dataset is a Tensor here; technically not a valid input because # not a Dataset subclass, but needs to stay working so add ignore's # for type checking with mypy - dataloader : DataLoader = DataLoader(self.dataset, # type: ignore[arg-type] - batch_size=self.batch_size, - num_workers=0, - drop_last=False) + dataloader: DataLoader = DataLoader( + self.dataset, # type: ignore[arg-type] + batch_size=self.batch_size, + num_workers=0, + drop_last=False, + ) dataiter = iter(dataloader) self.assertEqual(len(list(dataiter)), 2) def test_single_drop(self): - dataloader : DataLoader = DataLoader(self.dataset, # type: ignore[arg-type] - batch_size=self.batch_size, - num_workers=0, - drop_last=True) + dataloader: DataLoader = DataLoader( + self.dataset, # type: ignore[arg-type] + batch_size=self.batch_size, + num_workers=0, + drop_last=True, + ) dataiter = iter(dataloader) self.assertEqual(len(list(dataiter)), 1) - @unittest.skip("FIXME: Intermittent CUDA out-of-memory error on Windows and time-out under ASAN") + @unittest.skip( + "FIXME: Intermittent CUDA out-of-memory error on Windows and time-out under ASAN" + ) def test_multi_keep(self): - dataloader : DataLoader = DataLoader(self.dataset, # type: ignore[arg-type] - batch_size=self.batch_size, - num_workers=2, - drop_last=False, - timeout=self.MAX_TIMEOUT_IN_SECOND) + dataloader: DataLoader = DataLoader( + self.dataset, # type: ignore[arg-type] + batch_size=self.batch_size, + num_workers=2, + drop_last=False, + timeout=self.MAX_TIMEOUT_IN_SECOND, + ) dataiter = iter(dataloader) self.assertEqual(len(list(dataiter)), 2) def test_multi_drop(self): - dataloader : DataLoader = DataLoader(self.dataset, # type: ignore[arg-type] - batch_size=self.batch_size, - num_workers=2, - drop_last=True, - timeout=self.MAX_TIMEOUT_IN_SECOND) + dataloader: DataLoader = DataLoader( + self.dataset, # type: ignore[arg-type] + batch_size=self.batch_size, + num_workers=2, + drop_last=True, + timeout=self.MAX_TIMEOUT_IN_SECOND, + ) dataiter = iter(dataloader) self.assertEqual(len(list(dataiter)), 1) @@ -572,14 +633,20 @@ def test_multi_drop(self): test_dir = os.path.abspath(os.path.dirname(str(__file__))) -@unittest.skipIf('SKIP_TEST_BOTTLENECK' in os.environ.keys(), 'SKIP_TEST_BOTTLENECK is set') +@unittest.skipIf( + "SKIP_TEST_BOTTLENECK" in os.environ.keys(), "SKIP_TEST_BOTTLENECK is set" +) class TestBottleneck(TestCase): def _run(self, command, timeout=30): """Returns (return-code, stdout, stderr)""" import subprocess - p = subprocess.Popen(command, stdout=subprocess.PIPE, # noqa: P204 - stderr=subprocess.PIPE, shell=True) + p = subprocess.Popen( + command, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True, + ) try: output, err = p.communicate(timeout=timeout) except subprocess.TimeoutExpired: @@ -590,67 +657,108 @@ def _run(self, command, timeout=30): err_str = err.decode("ascii") return (rc, output_str, 
err_str) - def _run_bottleneck(self, test_file, scriptargs=''): + def _run_bottleneck(self, test_file, scriptargs=""): curdir = os.path.dirname(os.path.abspath(__file__)) - filepath = f'{curdir}/{test_file}' - if scriptargs != '': - scriptargs = f' {scriptargs}' + filepath = f"{curdir}/{test_file}" + if scriptargs != "": + scriptargs = f" {scriptargs}" rc, out, err = self._run( - f'{sys.executable} -m torch.utils.bottleneck {filepath}{scriptargs}') + f"{sys.executable} -m torch.utils.bottleneck {filepath}{scriptargs}" + ) return rc, out, err def _check_run_args(self): # Check that this fails due to missing args - rc, out, err = self._run_bottleneck('bottleneck_test/test_args.py') - self.assertEqual(rc, 2, atol=0, rtol=0, msg=self._fail_msg('Missing args should error', out + err)) + rc, out, err = self._run_bottleneck("bottleneck_test/test_args.py") + self.assertEqual( + rc, + 2, + atol=0, + rtol=0, + msg=self._fail_msg("Missing args should error", out + err), + ) # This should succeed - rc, out, err = self._run_bottleneck('bottleneck_test/test_args.py', '--foo foo --bar bar') - self.assertEqual(rc, 0, atol=0, rtol=0, msg=self._fail_msg('Should pass args to script', out + err)) + rc, out, err = self._run_bottleneck( + "bottleneck_test/test_args.py", "--foo foo --bar bar" + ) + self.assertEqual( + rc, + 0, + atol=0, + rtol=0, + msg=self._fail_msg("Should pass args to script", out + err), + ) def _fail_msg(self, msg, output): - return f'{msg}, output was:\n{output}' + return f"{msg}, output was:\n{output}" def _check_environment_summary(self, output): - results = re.search('Environment Summary', output) - self.assertIsNotNone(results, self._fail_msg('Should have Environment Summary', output)) + results = re.search("Environment Summary", output) + self.assertIsNotNone( + results, self._fail_msg("Should have Environment Summary", output) + ) # Up to five lines away from the heading, there should be the version number - results = re.search(r'Environment Summary.*(\n.*){,5}\nPyTorch \d+\.\d+', output) - self.assertIsNotNone(results, self._fail_msg('Should have PyTorch version', output)) + results = re.search( + r"Environment Summary.*(\n.*){,5}\nPyTorch \d+\.\d+", output + ) + self.assertIsNotNone( + results, self._fail_msg("Should have PyTorch version", output) + ) def _check_cprof_summary(self, output): - results = re.search('cProfile output', output) - self.assertIsNotNone(results, self._fail_msg('Should have cProfile output', output)) + results = re.search("cProfile output", output) + self.assertIsNotNone( + results, self._fail_msg("Should have cProfile output", output) + ) # This assumes that after the cProfile output section we have # the autograd profiler output - results = re.search(r'cProfile output.*(\n.*){6,50}\n.*autograd profiler output', output) - self.assertIsNotNone(results, self._fail_msg( - 'Distance between cProfile and autograd prof out not in [6, 50] lines', output)) + results = re.search( + r"cProfile output.*(\n.*){6,50}\n.*autograd profiler output", output + ) + self.assertIsNotNone( + results, + self._fail_msg( + "Distance between cProfile and autograd prof out not in [6, 50] lines", + output, + ), + ) def _check_autograd_summary(self, output): - results = re.search('autograd profiler output', output) - self.assertIsNotNone(results, self._fail_msg('Should have autograd profiler output', output)) + results = re.search("autograd profiler output", output) + self.assertIsNotNone( + results, self._fail_msg("Should have autograd profiler output", output) + ) # This assumes that 
after the autograd profiler output is the end of the # output. - results = re.search(r'autograd profiler output.*(\n.*){6,100}', output) - self.assertIsNotNone(results, self._fail_msg( - 'Distance between autograd prof output and end of output not in [6, 100] lines', output)) + results = re.search(r"autograd profiler output.*(\n.*){6,100}", output) + self.assertIsNotNone( + results, + self._fail_msg( + "Distance between autograd prof output and end of output not in [6, 100] lines", + output, + ), + ) def _check_cuda(self, output): if HAS_CUDA: - results = re.search('CUDA mode', output) - self.assertIsNotNone(results, self._fail_msg('Should tell users CUDA', output)) + results = re.search("CUDA mode", output) + self.assertIsNotNone( + results, self._fail_msg("Should tell users CUDA", output) + ) else: - results = re.search('CUDA mode', output) - self.assertIsNone(results, self._fail_msg('Should not tell users about CUDA', output)) + results = re.search("CUDA mode", output) + self.assertIsNone( + results, self._fail_msg("Should not tell users about CUDA", output) + ) - @unittest.skipIf(HAS_CUDA, 'CPU-only test') + @unittest.skipIf(HAS_CUDA, "CPU-only test") def test_bottleneck_cpu_only(self): - rc, out, err = self._run_bottleneck('bottleneck_test/test.py') - self.assertEqual(rc, 0, msg=f'Run failed with\n{err}') + rc, out, err = self._run_bottleneck("bottleneck_test/test.py") + self.assertEqual(rc, 0, msg=f"Run failed with\n{err}") self._check_run_args() self._check_environment_summary(out) @@ -658,10 +766,10 @@ def test_bottleneck_cpu_only(self): self._check_cprof_summary(out) self._check_cuda(out) - @unittest.skipIf(not HAS_CUDA, 'No CUDA') + @unittest.skipIf(not HAS_CUDA, "No CUDA") def test_bottleneck_cuda(self): - rc, out, err = self._run_bottleneck('bottleneck_test/test_cuda.py') - self.assertEqual(rc, 0, msg=f'Run failed with\n{err}') + rc, out, err = self._run_bottleneck("bottleneck_test/test_cuda.py") + self.assertEqual(rc, 0, msg=f"Run failed with\n{err}") self._check_run_args() self._check_environment_summary(out) @@ -677,7 +785,7 @@ def test_bottleneck_cuda(self): class TestCollectEnv(TestCase): def test_smoke(self): info_output = get_pretty_env_info() - self.assertTrue(info_output.count('\n') >= 17) + self.assertTrue(info_output.count("\n") >= 17) class TestONNXUtils(TestCase): @@ -688,7 +796,6 @@ def test_prepare_onnx_paddings(self): self.assertEqual(paddings, [0, 3, 1, 0, 4, 2]) def test_check_onnx_broadcast(self): - def try_check_onnx_broadcast(dims1, dims2, expect_broadcast, expect_fail): broadcast = True fail = False @@ -741,7 +848,6 @@ def try_check_onnx_broadcast(dims1, dims2, expect_broadcast, expect_fail): class TestHipify(TestCase): - def test_import_hipify(self): from torch.utils.hipify import hipify_python # noqa: F401 @@ -774,15 +880,19 @@ def test_quote_escape(self): self.assertEqual(self.trie.quote(orig_chars[i]), quoted_strs[i]) def test_export_trie_to_regex(self): - words_to_add = ["__CUDACC__", "CUDA_ERROR_CONTEXT_ALREADY_CURRENT", "CUDA_ERROR_ARRAY_IS_MAPPED", - "CUDA_ERROR_NOT_MAPPED", "CUDA_ERROR_INVALID_SOURCE"] + words_to_add = [ + "__CUDACC__", + "CUDA_ERROR_CONTEXT_ALREADY_CURRENT", + "CUDA_ERROR_ARRAY_IS_MAPPED", + "CUDA_ERROR_NOT_MAPPED", + "CUDA_ERROR_INVALID_SOURCE", + ] for word in words_to_add: self.trie.add(word) regex = self.trie.export_to_regex() expected_regex = r"(?:CUDA_ERROR_(?:ARRAY_IS_MAPPED|CONTEXT_ALREADY_CURRENT|INVALID_SOURCE|NOT_MAPPED)|__CUDACC__)" self.assertEqual(regex, expected_regex) - def 
test_prefix_words_export_trie_to_regex(self): # test case where some nodes have both children and are also leaf nodes. words_to_add = ["apple", "app", "ban", "banana"] @@ -800,7 +910,6 @@ def test_single_export_trie_to_regex(self): expected_regex = "cudaErrorInvalidMemcpyDirection" self.assertEqual(regex, expected_regex) - def test_char_export_trie_to_regex(self): self.trie.add("a") self.assertEqual(self.trie.export_to_regex(), "a") @@ -811,6 +920,7 @@ def test_special_char_export_trie_to_regex(self): self.trie.add(r"c*") self.assertEqual(self.trie.export_to_regex(), r"c\*") + class TestAssert(TestCase): def test_assert_true(self): # verify assertions work as expected @@ -845,14 +955,16 @@ def test_load_standalone(self): build_dir = tempfile.mkdtemp() try: src_path = os.path.join(build_dir, "main.cpp") - src = textwrap.dedent("""\ + src = textwrap.dedent( + """\ #include #include int main() { auto x = torch::eye(3); std::cout << x << std::endl; } - """) + """ + ) with open(src_path, "w") as f: f.write(src) @@ -866,8 +978,7 @@ def test_load_standalone(self): ext = ".exe" if IS_WINDOWS else "" self.assertEqual( - exec_path, - os.path.join(build_dir, f"standalone_load_test{ext}") + exec_path, os.path.join(build_dir, f"standalone_load_test{ext}") ) for shell in [True, False]: @@ -880,12 +991,14 @@ def test_load_standalone(self): self.assertEqual( # Windows prints "\r\n" for newlines. textwrap.dedent(r.stdout.decode("utf-8")).replace("\r\n", "\n"), - textwrap.dedent("""\ + textwrap.dedent( + """\ 1 0 0 0 1 0 0 0 1 [ CPUFloatType{3,3} ] - """) + """ + ), ) finally: @@ -930,30 +1043,30 @@ def tearDown(self): def test_external_module_register(self): # Built-in module with self.assertRaisesRegex(RuntimeError, "The runtime module of"): - torch._register_device_module('cuda', torch.cuda) + torch._register_device_module("cuda", torch.cuda) # Wrong device type with self.assertRaisesRegex(RuntimeError, "Expected one of cpu"): - torch._register_device_module('dummmy', DummyPrivateUse1Module) + torch._register_device_module("dummmy", DummyPrivateUse1Module) with self.assertRaises(AttributeError): torch.privateuseone.is_available() # type: ignore[attr-defined] - torch._register_device_module('privateuseone', DummyPrivateUse1Module) + torch._register_device_module("privateuseone", DummyPrivateUse1Module) torch.privateuseone.is_available() # type: ignore[attr-defined] # No supporting for override with self.assertRaisesRegex(RuntimeError, "The runtime module of"): - torch._register_device_module('privateuseone', DummyPrivateUse1Module) + torch._register_device_module("privateuseone", DummyPrivateUse1Module) def test_external_module_register_with_renamed_backend(self): - torch.utils.rename_privateuse1_backend('foo') + torch.utils.rename_privateuse1_backend("foo") with self.assertRaisesRegex(RuntimeError, "has already been set"): - torch.utils.rename_privateuse1_backend('dummmy') + torch.utils.rename_privateuse1_backend("dummmy") custom_backend_name = torch._C._get_privateuse1_backend_name() - self.assertEqual(custom_backend_name, 'foo') + self.assertEqual(custom_backend_name, "foo") with self.assertRaises(AttributeError): torch.foo.is_available() # type: ignore[attr-defined] @@ -961,65 +1074,69 @@ def test_external_module_register_with_renamed_backend(self): with self.assertRaisesRegex(AssertionError, "Tried to use AMP with the"): with torch.autocast(device_type=custom_backend_name): pass - torch._register_device_module('foo', DummyPrivateUse1Module) + torch._register_device_module("foo", 
DummyPrivateUse1Module) torch.foo.is_available() # type: ignore[attr-defined] with torch.autocast(device_type=custom_backend_name): pass - self.assertEqual(torch._utils._get_device_index('foo:1'), 1) + self.assertEqual(torch._utils._get_device_index("foo:1"), 1) self.assertEqual(torch._utils._get_device_index(torch.device("foo:2")), 2) + class TestRenderUtils(TestCase): def test_basic(self): self.assertExpectedInline( - torch._utils.render_call(torch.sum, [torch.randn(100)], {'dim': 0}), - '''torch.sum(tensor([...], size=(100,)), dim=0)''' + torch._utils.render_call(torch.sum, [torch.randn(100)], {"dim": 0}), + """torch.sum(tensor([...], size=(100,)), dim=0)""", ) self.assertExpectedInline( - torch._utils.render_call(torch.sum, [torch.randn(100, 100)], {'dim': 0}), - '''torch.sum(tensor([...], size=(100, 100)), dim=0)''' + torch._utils.render_call(torch.sum, [torch.randn(100, 100)], {"dim": 0}), + """torch.sum(tensor([...], size=(100, 100)), dim=0)""", ) + class TestDeviceUtils(TestCase): def test_basic(self): - with torch.device('meta') as dev: + with torch.device("meta") as dev: x = torch.empty(3, 3) - self.assertEqual(x.device.type, 'meta') - self.assertEqual(dev, torch.device('meta')) + self.assertEqual(x.device.type, "meta") + self.assertEqual(dev, torch.device("meta")) def test_decorator(self): - @set_device('meta') + @set_device("meta") def f(): return torch.empty(3, 3) - self.assertEqual(f().device.type, 'meta') + + self.assertEqual(f().device.type, "meta") def test_decorator_generator(self): - @set_device('meta') + @set_device("meta") def f(): yield torch.empty(3, 3) yield torch.empty(3, 3) + r1, r2 = list(f()) - self.assertEqual(r1.device.type, 'meta') - self.assertEqual(r2.device.type, 'meta') + self.assertEqual(r1.device.type, "meta") + self.assertEqual(r2.device.type, "meta") def test_nn_module(self): - with torch.device('meta'): + with torch.device("meta"): m = nn.Linear(40, 50) - self.assertEqual(m.weight.device.type, 'meta') + self.assertEqual(m.weight.device.type, "meta") def test_set_default_device(self): try: - torch.set_default_device('meta') + torch.set_default_device("meta") r = torch.empty(2, 2) finally: torch.set_default_device(None) - self.assertEqual(r.device.type, 'meta') + self.assertEqual(r.device.type, "meta") def test_get_default_device(self): - torch.set_default_device('meta') - self.assertEqual(torch.get_default_device().type, 'meta') + torch.set_default_device("meta") + self.assertEqual(torch.get_default_device().type, "meta") torch.set_default_device(None) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") @@ -1048,7 +1165,7 @@ def test_device_mode_ops(self, device, dtype, op): # very incomplete if tree_any( lambda x: isinstance(x, torch.Tensor), - (sample.input, sample.args, sample.kwargs) + (sample.input, sample.args, sample.kwargs), ): continue # Many OpInfos will explicitly pass in a device. DeviceContext @@ -1057,11 +1174,11 @@ def test_device_mode_ops(self, device, dtype, op): # NB: Can't pass None to sample_inputs, the function can't # handle it. 
kwargs = sample.kwargs.copy() - kwargs.pop('device', None) - with torch.device('meta'): + kwargs.pop("device", None) + with torch.device("meta"): r = func(sample.input, *sample.args, **kwargs) self.assertTrue( - tree_all_only(torch.Tensor, lambda x: x.device.type == 'meta', r) + tree_all_only(torch.Tensor, lambda x: x.device.type == "meta", r) ) @@ -1070,22 +1187,22 @@ def test_device_mode_ops(self, device, dtype, op): class TestCppExtensionUtils(TestCase): def test_cpp_compiler_is_ok(self): - self.assertTrue(torch.utils.cpp_extension.check_compiler_ok_for_platform('c++')) + self.assertTrue(torch.utils.cpp_extension.check_compiler_ok_for_platform("c++")) def test_cc_compiler_is_ok(self): - self.assertTrue(torch.utils.cpp_extension.check_compiler_ok_for_platform('cc')) + self.assertTrue(torch.utils.cpp_extension.check_compiler_ok_for_platform("cc")) class TestTraceback(TestCase): def test_basic(self): - source = '''\ + source = """\ def f(x): def g(x): raise RuntimeError # HEYA x = x * 3 return g(x) + 1 -''' +""" out: Dict[str, Any] = {} scope = {"__compile_source__": source} @@ -1095,29 +1212,36 @@ def g(x): with report_compile_source_on_error(): out["f"](1) except RuntimeError as e: - self.assertIn("HEYA", ''.join(traceback.format_tb(e.__traceback__))) + self.assertIn("HEYA", "".join(traceback.format_tb(e.__traceback__))) def test_format_traceback_short(self): try: raise RuntimeError except RuntimeError as e: - self.assertRegex(format_traceback_short(e.__traceback__), r'.*test_utils.py:\d+ in test_format_traceback_short') + self.assertRegex( + format_traceback_short(e.__traceback__), + r".*test_utils.py:\d+ in test_format_traceback_short", + ) def test_captured_traceback(self): - self.assertIn('test_captured_traceback', ''.join(CapturedTraceback.extract().format())) + self.assertIn( + "test_captured_traceback", "".join(CapturedTraceback.extract().format()) + ) def test_captured_traceback_format_all(self): - rs = CapturedTraceback.format_all([CapturedTraceback.extract(), CapturedTraceback.extract()]) + rs = CapturedTraceback.format_all( + [CapturedTraceback.extract(), CapturedTraceback.extract()] + ) self.assertEqual(len(rs), 2) - self.assertIn('test_captured_traceback_format_all', ''.join(rs[0])) + self.assertIn("test_captured_traceback_format_all", "".join(rs[0])) def test_captured_traceback_format_all_cached(self): tb = CapturedTraceback.extract() tb.format() # cached rs = CapturedTraceback.format_all([tb, CapturedTraceback.extract()]) self.assertEqual(len(rs), 2) - self.assertIn('test_captured_traceback_format_all', ''.join(rs[0])) + self.assertIn("test_captured_traceback_format_all", "".join(rs[0])) -if __name__ == '__main__': +if __name__ == "__main__": run_tests() diff --git a/third_party/miniz-2.1.0/miniz.c b/third_party/miniz-2.1.0/miniz.c index 4b5d53f817216..dc790d9e36b7c 100755 --- a/third_party/miniz-2.1.0/miniz.c +++ b/third_party/miniz-2.1.0/miniz.c @@ -6250,6 +6250,7 @@ mz_bool mz_zip_writer_add_mem_ex_v2(mz_zip_archive *pZip, const char *pArchive_n mz_uint32 extra_size = 0; mz_uint8 extra_data[MZ_ZIP64_MAX_CENTRAL_EXTRA_FIELD_SIZE]; mz_uint16 bit_flags = 0; + mz_bool write_metadata_only = buf_size && !pBuf; if ((int)level_and_flags < 0) level_and_flags = MZ_DEFAULT_LEVEL; @@ -6263,7 +6264,7 @@ mz_bool mz_zip_writer_add_mem_ex_v2(mz_zip_archive *pZip, const char *pArchive_n level = level_and_flags & 0xF; store_data_uncompressed = ((!level) || (level_and_flags & MZ_ZIP_FLAG_COMPRESSED_DATA)); - if ((!pZip) || (!pZip->m_pState) || (pZip->m_zip_mode != MZ_ZIP_MODE_WRITING) || 
((buf_size) && (!pBuf)) || (!pArchive_name) || ((comment_size) && (!pComment)) || (level > MZ_UBER_COMPRESSION)) + if ((!pZip) || (!pZip->m_pState) || (pZip->m_zip_mode != MZ_ZIP_MODE_WRITING) || (!pArchive_name) || ((comment_size) && (!pComment)) || (level > MZ_UBER_COMPRESSION)) return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER); pState = pZip->m_pState; @@ -6308,7 +6309,9 @@ mz_bool mz_zip_writer_add_mem_ex_v2(mz_zip_archive *pZip, const char *pArchive_n if (!(level_and_flags & MZ_ZIP_FLAG_COMPRESSED_DATA)) { - uncomp_crc32 = (mz_uint32)mz_crc32(MZ_CRC32_INIT, (const mz_uint8 *)pBuf, buf_size); + if (!write_metadata_only) { + uncomp_crc32 = (mz_uint32)mz_crc32(MZ_CRC32_INIT, (const mz_uint8 *)pBuf, buf_size); + } uncomp_size = buf_size; if (uncomp_size <= 3) { @@ -6330,8 +6333,8 @@ mz_bool mz_zip_writer_add_mem_ex_v2(mz_zip_archive *pZip, const char *pArchive_n if (!pState->m_zip64) { /* Bail early if the archive would obviously become too large */ - if ((pZip->m_archive_size + num_alignment_padding_bytes + MZ_ZIP_LOCAL_DIR_HEADER_SIZE + archive_name_size - + MZ_ZIP_CENTRAL_DIR_HEADER_SIZE + archive_name_size + comment_size + user_extra_data_len + + if ((pZip->m_archive_size + num_alignment_padding_bytes + MZ_ZIP_LOCAL_DIR_HEADER_SIZE + archive_name_size + + MZ_ZIP_CENTRAL_DIR_HEADER_SIZE + archive_name_size + comment_size + user_extra_data_len + pState->m_central_dir.m_size + MZ_ZIP_END_OF_CENTRAL_DIR_HEADER_SIZE + user_extra_data_central_len + MZ_ZIP_DATA_DESCRIPTER_SIZE32) > 0xFFFFFFFF) { diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index efc37aee123d8..776125e84a7f1 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -2733,6 +2733,7 @@ - name: miopen_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon) -> (Tensor, Tensor, Tensor) input, weight, bias: "grad.defined() ? (training ? miopen_batch_norm_backward(input, grad.contiguous(), weight, running_mean, running_var, result1, result2, epsilon) : native_batch_norm_backward(grad, input, weight, running_mean, running_var, result1, result2, training, epsilon, grad_input_mask)) : std::tuple()" + result0: batch_norm_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, running_mean, running_var, result1, result2, training, epsilon) - name: miopen_batch_norm_backward(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? 
save_var, float epsilon) -> (Tensor, Tensor, Tensor) save_mean: not_implemented("miopen_batch_norm_backward save_mean") diff --git a/tools/autograd/templates/Functions.h b/tools/autograd/templates/Functions.h index 437a5e8e89889..1780df8edaab7 100644 --- a/tools/autograd/templates/Functions.h +++ b/tools/autograd/templates/Functions.h @@ -22,7 +22,7 @@ using at::ArrayRef; using at::Type; using at::TensorGeometry; using at::ScalarType; -using c10::optional; +using std::optional; using c10::fmap; inline std::vector unpack_list(at::ArrayRef xs, std::shared_ptr saved_for = nullptr) { @@ -34,12 +34,12 @@ inline std::vector unpack_list(at::ArrayRef xs, std::shar }); } -inline c10::List> unpack_opt_list(at::ArrayRef xs, std::shared_ptr saved_for = nullptr) { - torch::List> result; +inline c10::List> unpack_opt_list(at::ArrayRef xs, std::shared_ptr saved_for = nullptr) { + torch::List> result; result.reserve(xs.size()); for (const SavedVariable& v : xs) { auto var = v.unpack(saved_for); - result.push_back(var.defined() ? c10::optional(var) : c10::nullopt); + result.push_back(var.defined() ? std::optional(var) : c10::nullopt); } return result; } diff --git a/tools/autograd/templates/VariableType.h b/tools/autograd/templates/VariableType.h index 065812694cfe4..08da173f94bf8 100644 --- a/tools/autograd/templates/VariableType.h +++ b/tools/autograd/templates/VariableType.h @@ -42,7 +42,7 @@ using at::Quantizer; // we'll remove them when we are actually exposing Quantizer class // to frontend using ConstQuantizerPtr = const c10::intrusive_ptr&; -using c10::optional; +using std::optional; namespace VariableType { TORCH_API std::vector allCUDATypes(); diff --git a/tools/autograd/templates/ViewFuncs.h b/tools/autograd/templates/ViewFuncs.h index faf5ab6881f18..1f69c062d344e 100644 --- a/tools/autograd/templates/ViewFuncs.h +++ b/tools/autograd/templates/ViewFuncs.h @@ -20,7 +20,7 @@ using at::IntArrayRef; using at::ArrayRef; using at::Type; using at::ScalarType; -using c10::optional; +using std::optional; using c10::fmap; ${view_func_declarations} diff --git a/tools/autograd/templates/python_variable_methods.cpp b/tools/autograd/templates/python_variable_methods.cpp index 437ea23d079bf..242adcd205336 100644 --- a/tools/autograd/templates/python_variable_methods.cpp +++ b/tools/autograd/templates/python_variable_methods.cpp @@ -397,7 +397,7 @@ static PyObject * THPVariable_invert(PyObject* self, PyObject* args) { END_HANDLE_TH_ERRORS } -static Tensor dispatch_to(const Tensor & self, Device device, bool non_blocking, bool copy, c10::optional optional_memory_format) { +static Tensor dispatch_to(const Tensor & self, Device device, bool non_blocking, bool copy, std::optional optional_memory_format) { pybind11::gil_scoped_release no_gil; // NOTE: this is where we record aten::to in the graph during tracing. 
However, the behavior of aten::to // is different with respect to TensorOptions fields that are not present: aten::to inherits fields that @@ -407,18 +407,18 @@ static Tensor dispatch_to(const Tensor & self, Device device, bool non_blocking, return self.to(self.options().device(device).memory_format(optional_memory_format), non_blocking, copy); } -static Tensor dispatch_to(const Tensor & self, bool non_blocking, bool copy, c10::optional optional_memory_format) { +static Tensor dispatch_to(const Tensor & self, bool non_blocking, bool copy, std::optional optional_memory_format) { pybind11::gil_scoped_release no_gil; return self.to(self.options().memory_format(optional_memory_format), non_blocking, copy); } -static Tensor dispatch_to(const Tensor & self, ScalarType dtype, bool non_blocking, bool copy, c10::optional optional_memory_format) { +static Tensor dispatch_to(const Tensor & self, ScalarType dtype, bool non_blocking, bool copy, std::optional optional_memory_format) { pybind11::gil_scoped_release no_gil; // TODO: Make this call the TensorOptions version, maybe? return self.to(dtype, non_blocking, copy, optional_memory_format); } -static Tensor dispatch_to(const Tensor & self, Device device, ScalarType dtype, bool non_blocking, bool copy, c10::optional optional_memory_format) { +static Tensor dispatch_to(const Tensor & self, Device device, ScalarType dtype, bool non_blocking, bool copy, std::optional optional_memory_format) { pybind11::gil_scoped_release no_gil; // TODO: Make this call the TensorOptions version, maybe? return self.to(device, dtype, non_blocking, copy, optional_memory_format); @@ -546,7 +546,7 @@ static PyObject * THPVariable_ipu(PyObject* self, PyObject* args, PyObject* kwar END_HANDLE_TH_ERRORS } -static PyObject * THPVariable_to_type(PyObject* self, ScalarType scalarType, c10::optional optional_memory_format) { +static PyObject * THPVariable_to_type(PyObject* self, ScalarType scalarType, std::optional optional_memory_format) { HANDLE_TH_ERRORS auto& self_ = THPVariable_Unpack(self); return THPVariable_Wrap(dispatch_to(self_, scalarType, false, false, optional_memory_format)); diff --git a/tools/testing/test_selections.py b/tools/testing/test_selections.py index 311eac59eb283..3e43edd502475 100644 --- a/tools/testing/test_selections.py +++ b/tools/testing/test_selections.py @@ -12,7 +12,7 @@ IS_MEM_LEAK_CHECK = os.getenv("PYTORCH_TEST_CUDA_MEM_LEAK_CHECK", "0") == "1" BUILD_ENVIRONMENT = os.getenv("BUILD_ENVIRONMENT", "") -USE_3_PROCS = "sm86" in BUILD_ENVIRONMENT +USE_3_PROCS = "sm86" in BUILD_ENVIRONMENT or "cuda" not in BUILD_ENVIRONMENT # NUM_PROCS_FOR_SHARDING_CALC must remain consistent across all shards of a job # to ensure that sharding is consistent, NUM_PROCS is the actual number of procs diff --git a/torch/_decomp/__init__.py b/torch/_decomp/__init__.py index 982544dfe6079..b277bb7eceb06 100644 --- a/torch/_decomp/__init__.py +++ b/torch/_decomp/__init__.py @@ -279,6 +279,7 @@ def core_aten_decompositions() -> Dict[torch._ops.OperatorBase, Callable]: aten.linalg_cross, aten.cudnn_batch_norm, aten.cudnn_batch_norm_backward, + aten.miopen_batch_norm_backward, aten.deg2rad, aten.deg2rad_, aten.detach, diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py index 6cfccbab0d04b..040fbc825becd 100644 --- a/torch/_decomp/decompositions.py +++ b/torch/_decomp/decompositions.py @@ -2319,6 +2319,32 @@ def native_batch_norm_backward_out( return grad_input +@register_decomposition(aten.miopen_batch_norm_backward) +@out_wrapper("out0", "out1", 
"out2") +def miopen_batch_norm_backward( + input: Tensor, + grad_output: Tensor, + weight: Tensor, + running_mean: Optional[Tensor], + running_var: Optional[Tensor], + save_mean: Optional[Tensor], + save_var: Optional[Tensor], + epsilon: float, +): + return aten.native_batch_norm_backward( + grad_output, + input, + weight, + running_mean, + running_var, + save_mean, + save_var, + True, + epsilon, + [True, True, True], + ) + + @register_decomposition(aten.cudnn_batch_norm_backward) @out_wrapper("out0", "out1", "out2") def cudnn_batch_norm_backward( diff --git a/torch/_decomp/decompositions_for_jvp.py b/torch/_decomp/decompositions_for_jvp.py index 81946c314638a..d430386ff3606 100644 --- a/torch/_decomp/decompositions_for_jvp.py +++ b/torch/_decomp/decompositions_for_jvp.py @@ -329,3 +329,4 @@ def batch_norm_backward( _register_jit_decomposition_for_jvp(torch.ops.aten.native_batch_norm_backward.default) _register_jit_decomposition_for_jvp(torch.ops.aten.cudnn_batch_norm_backward.default) _register_jit_decomposition_for_jvp(torch.ops.aten.batch_norm_backward.default) +_register_jit_decomposition_for_jvp(torch.ops.aten.miopen_batch_norm_backward.default) diff --git a/torch/_dynamo/__init__.py b/torch/_dynamo/__init__.py index b9c4fbfd7b6e4..98496b5fc5de5 100644 --- a/torch/_dynamo/__init__.py +++ b/torch/_dynamo/__init__.py @@ -62,7 +62,7 @@ # Wrap manual_seed with the disable decorator. # Can't do it at its implementation due to dependency issues. - torch.manual_seed = disable(torch.manual_seed) + torch.manual_seed = torch._disable_dynamo(torch.manual_seed) # Add the new manual_seed to the builtin registry. torch.jit._builtins._register_builtin(torch.manual_seed, "aten::manual_seed") diff --git a/torch/_dynamo/codegen.py b/torch/_dynamo/codegen.py index 0f4c0dad59bde..6dbd7f36b0b5d 100644 --- a/torch/_dynamo/codegen.py +++ b/torch/_dynamo/codegen.py @@ -136,6 +136,23 @@ def __call__(self, value, allow_cache=True): ) ) output.extend(create_call_function(2, True)) + elif ( + isinstance(value, SymNodeVariable) + and value.python_type() == float + and not self.tx.export + ): + # This is a little unusual; force the output convention to be a + # Tensor here. Don't do this for export because this is + # apparently load bearing for export tests (but I am a bit + # doubtful it actually works in the real world) + # NB: It works to add_graph_output on a computed expression + # as_tensor here, because we memoize as_tensor calls on + # SymNodeVariable! + graph_outputs_key = self.add_graph_output(value.as_tensor(self.tx)) + self.load_graph_output(graph_outputs[graph_outputs_key].index) + output.extend( + [self.create_load_attr("item")] + create_call_function(0, True) + ) elif isinstance( value, ( diff --git a/torch/_dynamo/config.py b/torch/_dynamo/config.py index f5a3978eb2ae8..498478a540991 100644 --- a/torch/_dynamo/config.py +++ b/torch/_dynamo/config.py @@ -54,6 +54,11 @@ def is_fbcode(): # to be dynamic, but accesses to ints should NOT get promoted into inputs. specialize_int = False +# Whether or not to specialize on float inputs. Dynamo will always promote +# float inputs into Tensor inputs, but at the moment, backends inconsistently +# support codegen on float (this is to be fixed). +specialize_float = True + # legacy config, does nothing now! dynamic_shapes = True @@ -232,7 +237,7 @@ def is_fbcode(): # false_fn produces code with identical guards. enforce_cond_guards_match = True -# Specify how to optimize a compiiled DDP module. 
The flag accepts a bollean +# Specify how to optimize a compiled DDP module. The flag accepts a boolean # value or a string. There are 4 modes. # 1. "ddp_optimizer" (or True): with "ddp_ptimizer", Dynamo will automatically # split model graph into pieces to match DDP bucket sizes to allow DDP diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py index 77447bc17dee1..38795341be216 100644 --- a/torch/_dynamo/convert_frame.py +++ b/torch/_dynamo/convert_frame.py @@ -1,10 +1,14 @@ +import base64 import collections +import cProfile import dis import functools import itertools import logging import os +import pstats import random +import subprocess import sys import threading import time @@ -12,8 +16,11 @@ import types import typing import weakref +from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Set +from torch._utils_internal import maybe_upload_prof_stats_to_manifold + from torch.fx._lazy_graph_module import ( # type: ignore[attr-defined] _use_lazy_graph_module, ) @@ -87,7 +94,6 @@ is_namedtuple, istype, LazyString, - maybe_cprofile, orig_code_map, record_compilation_metrics, reset_graph_break_dup_checker, @@ -286,6 +292,83 @@ def exception_handler(e, code, frame=None, export=False): FRAME_COMPILE_COUNTER: typing.Counter[int] = collections.Counter() +def maybe_cprofile(func): + if config.cprofile: + return cprofile_wrapper(func) + return func + + +def cprofile_wrapper(func): + @functools.wraps(func) + def profile_wrapper(*args, **kwargs): + trace_id = CompileContext.current_trace_id() + assert trace_id, "Trace id is None" + profile_path = Path( + f"/tmp/{func.__name__}_{str(trace_id).replace('/','_')}.profile" + ) + prof = cProfile.Profile() + prof.enable() + start_ts = time.time() + retval = prof.runcall(func, *args, **kwargs) + profile_latency = time.time() - start_ts + prof.disable() + log.info( + "### Cprofile for %s trace id [%s] took %.3f seconds ###", + func.__name__, + trace_id, + profile_latency, + ) + ps = pstats.Stats(prof) + try: + prof.dump_stats(profile_path) + except PermissionError: + log.info("Cannot write to %s", str(profile_path)) + svg_path = profile_path.with_suffix(".svg") + try: + gprof2dot_process = subprocess.Popen( + [ + "gprof2dot", + "-f", + "pstats", + "--node-label=total-time-percentage", + "--node-label=self-time-percentage", + "--node-label=total-time", + str(profile_path), + ], + stdout=subprocess.PIPE, + ) + subprocess.check_call( + ["dot", "-Tsvg", "-o", str(svg_path)], + stdin=gprof2dot_process.stdout, + ) + log.info("Generated SVG from profile at %s", str(svg_path)) + except FileNotFoundError: + log.info( + "Failed to generate SVG from profile -- dumping stats instead." 
+ "Try installing gprof2dot and dot for a better visualization" + ) + ps.sort_stats(pstats.SortKey.TIME).print_stats(20) + ps.sort_stats(pstats.SortKey.CUMULATIVE).print_stats(20) + + maybe_upload_prof_stats_to_manifold(str(profile_path)) # fb-only + + torch._logging.trace_structured( + "artifact", + lambda: { + "name": "dynamo_cprofile_prof", + "type": "prof", + "encoding": "base64", + }, + payload_fn=lambda: base64.encodebytes( + open(profile_path, "rb").read() + ).decode("ascii"), + ) + + return retval + + return profile_wrapper + + def convert_frame_assert( compiler_fn: CompilerFn, one_graph: bool = True, @@ -428,7 +511,6 @@ def register_bytecode_hook(hook: BytecodeHook) -> RemovableHandle: @compile_time_strobelight_meta(phase_name="_compile") @_use_lazy_graph_module(config.use_lazy_graph_module) -@maybe_cprofile def _compile( code: types.CodeType, globals: Dict[str, object], @@ -512,6 +594,7 @@ def transform(instructions, code_options): instructions[:] = remove_pointless_jumps(remove_dead_code(instructions)) @dynamo_timed(phase_name="entire_frame_compile") + @maybe_cprofile def compile_inner( code: types.CodeType, one_graph: bool, diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py index bb90d28421457..391bdfcf02020 100644 --- a/torch/_dynamo/eval_frame.py +++ b/torch/_dynamo/eval_frame.py @@ -150,7 +150,10 @@ def __init__(self, mod: torch.nn.Module, dynamo_ctx): def _initialize(self): # Do this stuff in constructor to lower overhead slightly - if isinstance(self._orig_mod.forward, types.MethodType) and trace_rules.check( + if isinstance(self.dynamo_ctx, DisableContext): + # No need to check trace rules + self.forward = self.dynamo_ctx(self._orig_mod.__call__) + elif isinstance(self._orig_mod.forward, types.MethodType) and trace_rules.check( self._orig_mod.forward ): # This may be a torch.nn.* instance in trace_rules.py which @@ -353,14 +356,9 @@ def get_compiler_config(): # User has wrapped the class with compile/disable decorator. Apply # disable to init/call method. cls_obj = fn - if isinstance(self, DisableContext): - # Disable on init is useful for reconstruction of bytecodes where we - # want to prevent Dynamo from tracing into the init function. Check - # test_reconstruction in test_model_output.py. - cls_obj.__init__ = self(cls_obj.__init__) cls_obj.__call__ = self(cls_obj.__call__) if issubclass(cls_obj, torch.nn.Module): - # NN module variable tracker directly inlines the _call_impl. Disable it. + # NN module variable tracker directly inlines the _call_impl. cls_obj._call_impl = self(cls_obj._call_impl) return cls_obj @@ -383,12 +381,8 @@ def get_compiler_config(): callback = self.callback - if isinstance(self, DisableContext): - is_jit_tracing = always_false - is_fx_tracing = always_false - else: - is_jit_tracing = torch._C._is_tracing - is_fx_tracing = torch.fx._symbolic_trace.is_fx_tracing + is_jit_tracing = torch._C._is_tracing + is_fx_tracing = torch.fx._symbolic_trace.is_fx_tracing @functools.wraps(fn) def _fn(*args, **kwargs): @@ -424,10 +418,7 @@ def _fn(*args, **kwargs): cleanup() # hooks to properly handle inlining - if isinstance(self, DisableContext): - _fn._torchdynamo_disable = True # type: ignore[attr-defined] - else: - _fn._torchdynamo_inline = fn # type: ignore[attr-defined] + _fn._torchdynamo_inline = fn # type: ignore[attr-defined] # Save the function pointer to find the original callable while nesting # of decorators. 
@@ -519,6 +510,53 @@ class DisableContext(_TorchDynamoContext): def __init__(self): super().__init__(callback=None) + def __call__(self, fn): + # Earlier this code was in the base class _TorchDynamoContext. But we + # moved it here to have better code organization. For disable, we just + # want the callback to be None. We don't have to check trace_rules or + # create any wrapper. + fn = innermost_fn(fn) + + if isinstance(fn, torch.nn.Module): + mod = fn + new_mod = OptimizedModule(mod, self) + new_mod._torchdynamo_orig_callable = mod.forward + return new_mod + + if inspect.isclass(fn): + # User has wrapped the class with compile/disable decorator. Apply + # disable to init/call method. + cls_obj = fn + # Disable on init is useful for reconstruction of bytecodes where we + # want to prevent Dynamo from tracing into the init function. Check + # test_reconstruction in test_model_output.py. + cls_obj.__init__ = self(cls_obj.__init__) + cls_obj.__call__ = self(cls_obj.__call__) + if issubclass(cls_obj, torch.nn.Module): + # NN module variable tracker directly inlines the _call_impl. Disable it. + cls_obj._call_impl = self(cls_obj._call_impl) + return cls_obj + + assert callable(fn) + + callback = self.callback + + @functools.wraps(fn) + def _fn(*args, **kwargs): + prior = set_eval_frame(callback) + try: + return fn(*args, **kwargs) + finally: + set_eval_frame(prior) + + _fn._torchdynamo_disable = True # type: ignore[attr-defined] + + # Save the function pointer to find the original callable while nesting + # of decorators. + _fn._torchdynamo_orig_callable = fn # type: ignore[attr-defined] + + return _fn + def _optimize_catch_errors( compile_fn, diff --git a/torch/_dynamo/guards.py b/torch/_dynamo/guards.py index 42353eca8bb23..0e714cb1a5428 100644 --- a/torch/_dynamo/guards.py +++ b/torch/_dynamo/guards.py @@ -259,6 +259,7 @@ def uninteresting_files(): "utils_device": torch.utils._device, "device": torch.device, "___from_numpy": from_numpy, + "___as_tensor": torch.as_tensor, "torch": torch, "inspect": inspect, } diff --git a/torch/_dynamo/output_graph.py b/torch/_dynamo/output_graph.py index 4606795bf677d..a1b63304fa897 100644 --- a/torch/_dynamo/output_graph.py +++ b/torch/_dynamo/output_graph.py @@ -1069,6 +1069,7 @@ def append_prefix_insts(): TensorWithTFOverrideVariable, ), ) + and not (isinstance(v, SymNodeVariable) and v.python_type() is float) for v in stack_values ) and all(isinstance(x, TensorVariable) for x in stack_values) @@ -1412,13 +1413,103 @@ def example_inputs(self) -> List[torch.Tensor]: return result def remove_unused_graphargs(self) -> None: + # NB: It's always OK to drop GraphArg for symbols that ended up being + # specialized. You don't even have to make a guard for it, because + # ShapeEnv produce_guards operates on tracked_fakes, which never gets + # pruned. That being said, you'll get marginally better generated + # guard code if you promote the guard into a Dynamo guard (since that + # allows for the guard to be done using C++ guards.) If we get + # ShapeEnv guards to go into C++ guards, this will stop being a thing + # though! 
+ assert self.should_exit + # Miniature DCE pass, but only for obviously trivial operations + def is_static_true(b_node: fx.node.Argument): + if b_node is True: + return True + if not isinstance(b_node, fx.Node): + return False + b = b_node.meta.get("example_value") + if b is None: + return False + if b is True: + return True + if ( + isinstance(b, torch.SymBool) + and (r := b.node.maybe_as_bool()) is not None + ): + return r + # TODO: We can also technically remove all cases when the input + # doesn't have unbacked inputs, since it's all in the ShapeEnv + return False + + def is_symnode_arg(a: fx.node.Argument): + from torch.fx.experimental.sym_node import SymTypes + + if isinstance(a, (int, float, bool)): + return True + if isinstance(a, fx.Node): + return isinstance(a.meta.get("example_value"), SymTypes) + return False + + # NB: We assume that you cannot do mutations on int/float/bool, + # because they are immutable types, and therefore is always safe to + # DCE. + def is_symnode_compute_node(node): + from torch.fx.experimental.sym_node import SymTypes + + if node.op != "call_function": + return False + # TODO: I don't think it's possible to have a bare int/float here? + if not isinstance(node.meta.get("example_value"), SymTypes): + return False + # TODO: This will bail here if you ever end up with a more complicated + # computation function, like sum(list_of_ints), even though it + # should be DCE'able + if not all(is_symnode_arg(a) for a in node.args): + return False + if not all(is_symnode_arg(a) for a in node.kwargs.values()): + return False + return True + + # NB: You could try to expand this to cover more cases by simply + # detecting whenever you have an int output, but this is a bit + # dangerous in case someone adds a function that returns an int but is + # mutating. So manually whitelist for now. 
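The helpers above, together with the accessor whitelist defined next, feed a reverse-order dead-code sweep over the Dynamo graph. A standalone sketch of the same pattern, written against plain torch.fx rather than OutputGraph (the traced function and the pure_targets set are illustrative only):

import operator
import torch
import torch.fx

def g(x):
    unused = x.size(0) + 1  # pure size computation whose result is never used
    return x * 2

gm = torch.fx.symbolic_trace(g)
pure_targets = {operator.add, operator.getitem}
for node in reversed(list(gm.graph.nodes)):
    # Walk backwards so that erasing a dead node can expose its inputs as dead too.
    if len(node.users) == 0 and (
        node.op == "get_attr"
        or (node.op == "call_method" and node.target == "size")
        or (node.op == "call_function" and node.target in pure_targets)
    ):
        gm.graph.erase_node(node)
gm.recompile()
print(gm.code)  # the unused size()/add nodes have been removed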
+ def is_accessor_node(node): + if ( + node.op == "call_method" + and isinstance(node.args[0].meta.get("example_value"), torch.Tensor) + and node.target in ["size", "stride", "storage_offset", "item"] + ): + return True + if node.op == "call_function" and node.target in [ + torch.ops.aten.sym_size, + torch.ops.aten.sym_size.default, + torch.ops.aten.sym_size.int, + torch.ops.aten.sym_stride, + torch.ops.aten.sym_stride.default, + torch.ops.aten.sym_stride.int, + torch.ops.aten.sym_storage_offset, + torch.ops.aten.sym_storage_offset.default, + ]: + return True + return False + for node in reversed(list(self.graph.nodes)): if len(list(node.users)) == 0: - if node.op == "get_attr": - self.remove_node(node) - elif node.op == "call_function" and node.target is operator.getitem: + if ( + node.op == "get_attr" + or (node.op == "call_function" and node.target is operator.getitem) + or ( + node.op == "call_function" + and node.target is torch._check + and is_static_true(node.args[0]) + ) + or is_symnode_compute_node(node) + or is_accessor_node(node) + ): self.remove_node(node) def placeholder_binds_symbol(node): diff --git a/torch/_dynamo/polyfill.py b/torch/_dynamo/polyfill.py index 18aaa067a3d28..6104da9311098 100644 --- a/torch/_dynamo/polyfill.py +++ b/torch/_dynamo/polyfill.py @@ -56,6 +56,13 @@ def list_cmp(op: Callable[[Any, Any], bool], left: Sequence[Any], right: Sequenc return op(len(left), len(right)) +def set_isdisjoint(set1, set2): + for x in set1: + if x in set2: + return False + return True + + def dropwhile(predicate, iterable): # dropwhile(lambda x: x<5, [1,4,6,4,1]) -> 6 4 1 iterable = iter(iterable) diff --git a/torch/_dynamo/source.py b/torch/_dynamo/source.py index cb42c7eb20344..33c464da5bd3f 100644 --- a/torch/_dynamo/source.py +++ b/torch/_dynamo/source.py @@ -560,6 +560,17 @@ def reconstruct(self, codegen): codegen.extend_output(create_call_function(1, True)) +# NB: We don't expect you to actually ever generate guards against this +# source, it is ephemeral +@dataclasses.dataclass(frozen=True) +class FloatTensorSource(ChainedSource): + def name(self) -> str: + return f"___as_tensor({self.base.name()})" + + def guard_source(self): + return self.base.guard_source() + + # This is a synthetic source that is associated with the singleton # shape env guard we always register for all frames. 
We get the actual # guard contents from the ambient ShapeEnv @@ -617,3 +628,7 @@ def is_from_defaults(source: Source): if isinstance(source, ChainedSource): return is_from_defaults(source.base) return False + + +def is_cell_contents(source: Source): + return isinstance(source, AttrSource) and source.member == "cell_contents" diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py index ff9438085c529..9c050d84a5eee 100644 --- a/torch/_dynamo/utils.py +++ b/torch/_dynamo/utils.py @@ -2,7 +2,6 @@ import collections import contextlib import copy -import cProfile import dataclasses import datetime import dis @@ -16,9 +15,7 @@ import math import operator import os -import pstats import re -import subprocess import sys import textwrap import threading @@ -28,7 +25,6 @@ import weakref from contextlib import contextmanager from functools import lru_cache, wraps -from pathlib import Path from types import MethodWrapperType from typing import ( Any, @@ -50,8 +46,6 @@ ValuesView, ) -from torch._utils_internal import maybe_upload_prof_stats_to_manifold - from ..utils.hooks import RemovableHandle try: @@ -135,63 +129,6 @@ def tabulate(rows, headers): ) -def maybe_cprofile(func): - if config.cprofile: - return cprofile_wrapper(func) - return func - - -def cprofile_wrapper(func): - @wraps(func) - def profile_wrapper(*args, **kwargs): - global timer_counter - profile_cnt = next(timer_counter) - profile_path = Path("/tmp/" + func.__name__ + f"{profile_cnt}.profile") - prof = cProfile.Profile() - prof.enable() - start_ts = time.time() - retval = prof.runcall(func, *args, **kwargs) - profile_latency = time.time() - start_ts - prof.disable() - print( - f"### Cprofile for {func.__name__} iter {profile_cnt} took {profile_latency:.3f} seconds ###" - ) - ps = pstats.Stats(prof) - prof.dump_stats(profile_path) - svg_path = profile_path.with_suffix(".svg") - try: - gprof2dot_process = subprocess.Popen( - [ - "gprof2dot", - "-f", - "pstats", - "--node-label=total-time-percentage", - "--node-label=self-time-percentage", - "--node-label=total-time", - str(profile_path), - ], - stdout=subprocess.PIPE, - ) - subprocess.check_call( - ["dot", "-Tsvg", "-o", str(svg_path)], - stdin=gprof2dot_process.stdout, - ) - print(f"Generated SVG from profile at {str(svg_path)}") - except FileNotFoundError: - print( - "Failed to generate SVG from profile -- dumping stats instead." 
- "Try installing gprof2dot and dot for a better visualization" - ) - ps.sort_stats(pstats.SortKey.TIME).print_stats(20) - ps.sort_stats(pstats.SortKey.CUMULATIVE).print_stats(20) - - maybe_upload_prof_stats_to_manifold(str(profile_path)) # fb-only - - return retval - - return profile_wrapper - - curr_frame = 0 diff --git a/torch/_dynamo/variables/builder.py b/torch/_dynamo/variables/builder.py index 575ccfa53f8d2..8f9ab01088a70 100644 --- a/torch/_dynamo/variables/builder.py +++ b/torch/_dynamo/variables/builder.py @@ -9,6 +9,7 @@ import inspect import itertools import logging +import math import operator import re import sys @@ -54,8 +55,10 @@ ConstantSource, ConstDictKeySource, ConvertIntSource, + FloatTensorSource, GetItemSource, GradSource, + is_cell_contents, is_constant_source, is_from_defaults, is_from_optimizer_source, @@ -1152,8 +1155,7 @@ def wrap_module(self, value: torch.nn.Module): ) def wrap_literal(self, value): - unspec = not config.specialize_int - if unspec and type(value) is int: + if not config.specialize_int and type(value) is int: # unspecializing int by default, but still # specialize for the following conditions if not TracingContext.get().force_unspec_int_unbacked_size_like and ( @@ -1165,11 +1167,14 @@ def wrap_literal(self, value): # NN modules on the fly) or self.source.guard_source().is_nn_module() or is_from_defaults(self.source) + or is_cell_contents(self.source) ): self.install_guards(GuardBuilder.CONSTANT_MATCH) return ConstantVariable.create(value=value, source=self.source) else: return self.wrap_symint(value) + elif not config.specialize_float and type(value) is float: + return self.wrap_symfloat(value) else: self.install_guards(GuardBuilder.CONSTANT_MATCH) return ConstantVariable.create(value=value) @@ -1498,6 +1503,140 @@ def wrap_symint(self, value): return unspec_var + def wrap_symfloat(self, value): + # SymFloat wrapping is special. We first wrap it in the same way we + # do an unspecialized primitive, and then we item() it into a + # SymFloat. Removal of the item() call is left to a later FX pass, + # mostly because that pass is more easily done after we have lowered + # to ATen ops. (Dynamo doesn't do decomposition right now). + + if self.name in self.tx.output.unspec_variable_map: + return self.tx.output.unspec_variable_map[self.name] + + # NB: we specialize on nan input, because our guard modeling in + # ShapeEnv cannot deal with nan + if ( + torch._dynamo.config.specialize_float + or is_constant_source(self.get_source()) + or math.isnan(value) + ): + self.install_guards(GuardBuilder.CONSTANT_MATCH) + return ConstantVariable.create(value=value, source=self.source) + + # NB: At the point we've gotten here, we don't assume static by + # default. Since we have a guard mechanism, there isn't really any + # downside to trying to be dynamic for float all the time. Unlike + # ints, this won't make codegen perf worse. Modest cost to compile + # time. + + wrapped_value = torch.tensor(value) + # TODO: Switch RandomValueSource over to use this, this is more + # accurate + assert not isinstance(self.get_source(), RandomValueSource) + install_guard(self.get_source().make_guard(GuardBuilder.TYPE_MATCH)) + + # The FloatTensorSource here is just for pedantic correctness: if you + # guard against an UnspecializedPythonVariable, you need to guard + # against the tensor-ified version of the local, otherwise it's not a + # Tensor. 
However, we never let the UnspecializedPythonVariable escape + # here, so there should never actually be any guards against this + # source. + options = {"source": FloatTensorSource(self.get_source()), "raw_value": value} + + # TODO: Maybe the tensor-ification should be built into the source, + # rather than by special pattern match + proxy = self.tx.output.root_tracer.create_graph_input( + re.sub(r"[^a-zA-Z0-9]+", "_", self.name), + type(wrapped_value), + source=self.get_source(), + ) + + unspec_var = wrap_fx_proxy_cls( + UnspecializedPythonVariable, + tx=self.tx, + proxy=proxy, + example_value=wrapped_value, + **options, + ) + assert isinstance(unspec_var, UnspecializedPythonVariable) + self.tx.output.unspec_variable_map[self.name] = unspec_var + + if self.tx.export and not isinstance(self.get_source(), LocalSource): + raise AssertionError( + f"Dynamo attempts to add additional input during export: value={wrapped_value}, source={self.get_source()}" + ) + fake_tensor_value = None + example_value = unspec_var.proxy.node.meta["example_value"] + assert is_fake(example_value) + + fake_tensor_value = example_value + assert fake_tensor_value.fake_mode is self.tx.fake_mode, ( + f"fake mode ({fake_tensor_value.fake_mode}) from fake tensor metadata doesn't match mode" + "({self.tx.fake_mode}) from InstructionTranslator" + ) + + # There's something a bit incoherent about pass_arg_as_tensor, + # specifically regarding sources. + # + # Specifically, suppose we have "x: float" local argument. We + # eventually end up with an UnspecializedPythonVariable denoting + # torch.as_tensor(x)... but it's source is still L['x'] (which if you + # accessed it directly is a float!) So you gotta be careful when + # setting up your guards, because it's still going to be a float at + # this point, the conversion happens only precisely at the point we're + # actually calling the FX graph. This happens to be what we want for + # shape guard generation, but it's kind of unintuitive. + proxy.node.meta["grapharg"] = GraphArg( + self.get_source(), + wrapped_value, + pass_arg_as_tensor=True, + fake_tensor=fake_tensor_value, + is_tensor=False, + example_strong_ref=wrapped_value, + ) + + # OK, now the crazy sauce. We want to generate a SymNodeVariable to + # do the rest of our tracing, doing the equivalent of an item() call. + # But we don't /actually/ want to do an item() call, because that will + # give us an unbacked SymFloat, but this is really a backed SymFloat. + + item_proxy = self.tx.output.create_proxy( + "call_method", + "item", + (proxy,), + {}, + ) + # Do NOT do conventional fake tensor prop + + shape_env = self.tx.output.shape_env + item_symbol = shape_env.create_unspecified_symbol( + value, + # Interesting! Normally if you do compute on a Variable (the + # compute in this case being an item() call), you end up with a + # new variable that doesn't have source, but in this case, we can + # still put a source on it. + source=self.source, + # If we put in a Tensor input, definitely dynamic (if you wanted + # it to be static, gotta bail out earlier) + dynamic_dim=DimDynamic.DYNAMIC, + ) + item_example_value = shape_env.create_symfloatnode( + item_symbol, hint=value, source=self.source + ) + set_example_value(item_proxy.node, item_example_value) + + self.tx.output.tracked_fakes.append( + TrackedFake(item_example_value, self.source, None) + ) + + item_unspec_var = SymNodeVariable( + item_proxy, + item_example_value, + source=self.get_source(), # Interesting as above! 
+ ) + + return item_unspec_var + def wrap_unspecialized_primitive(self, value): if self.name in self.tx.output.unspec_variable_map: return self.tx.output.unspec_variable_map[self.name] diff --git a/torch/_dynamo/variables/builtin.py b/torch/_dynamo/variables/builtin.py index 91a1da13db895..791d19ffb4c1a 100644 --- a/torch/_dynamo/variables/builtin.py +++ b/torch/_dynamo/variables/builtin.py @@ -1187,8 +1187,10 @@ def _call_iter_tuple_list(self, tx, obj=None, *args, **kwargs): obj.source.make_guard(GuardBuilder.TUPLE_ITERATOR_LEN) ) else: - if getattr(obj, "source", False) and isinstance( - obj, ConstDictVariable + if ( + getattr(obj, "source", False) + and isinstance(obj, ConstDictVariable) + and not istype(obj, SetVariable) ): tx.output.guard_on_key_order.add(obj.source.name()) diff --git a/torch/_dynamo/variables/constant.py b/torch/_dynamo/variables/constant.py index 29a3a72a6f86f..c4502cca6bbe3 100644 --- a/torch/_dynamo/variables/constant.py +++ b/torch/_dynamo/variables/constant.py @@ -39,7 +39,7 @@ def create(value, **kwargs) -> VariableTracker: assert not isinstance(value, disallowed_type), reason # Routing for list and tuple literals. - if is_literal and isinstance(value, (list, tuple)): + if is_literal and isinstance(value, (list, tuple, set, frozenset)): items = [] for i, x in enumerate(value): item_source = GetItemSource(source, i) if source else None @@ -51,7 +51,11 @@ def create(value, **kwargs) -> VariableTracker: source=item_source, ) ) - return variables.BaseListVariable.cls_for(type(value))(items, **kwargs) + if isinstance(value, (list, tuple)): + return variables.BaseListVariable.cls_for(type(value))(items, **kwargs) + else: + assert isinstance(value, (set, frozenset)), type(value) + return variables.SetVariable(items) return ConstantVariable(value, **kwargs) diff --git a/torch/_dynamo/variables/dicts.py b/torch/_dynamo/variables/dicts.py index 60fda2146432f..77da855b69aec 100644 --- a/torch/_dynamo/variables/dicts.py +++ b/torch/_dynamo/variables/dicts.py @@ -9,7 +9,7 @@ from torch._subclasses.fake_tensor import is_fake -from .. import variables +from .. 
import polyfill, variables from ..bytecode_transformation import ( create_call_function, create_call_method, @@ -17,7 +17,6 @@ create_load_method, ) from ..eval_frame import skip_code - from ..exc import unimplemented from ..guards import GuardBuilder, install_guard from ..source import AttrSource, GetItemSource @@ -401,6 +400,12 @@ def call_method( result = self.set_items.pop().vt super().call_method(tx, name, (result,), kwargs) return result + elif name == "isdisjoint": + assert not kwargs + assert len(args) == 1 + return variables.UserFunctionVariable( + polyfill.set_isdisjoint + ).call_function(tx, [self, args[0]], {}) return super().call_method(tx, name, args, kwargs) def getitem_const(self, arg: VariableTracker): diff --git a/torch/_dynamo/variables/lists.py b/torch/_dynamo/variables/lists.py index d51b4daff3471..7802ddbb3390b 100644 --- a/torch/_dynamo/variables/lists.py +++ b/torch/_dynamo/variables/lists.py @@ -529,6 +529,9 @@ def get_item_dyn(self, tx, arg: VariableTracker): assert isinstance(index, (int, torch.SymInt)) return self.items[index] + def call_hasattr(self, tx, name: str) -> "VariableTracker": + return variables.ConstantVariable.create(hasattr(torch.Size, name)) + class NamedTupleVariable(TupleVariable): _nonvar_fields = { diff --git a/torch/_dynamo/variables/tensor.py b/torch/_dynamo/variables/tensor.py index e928a9e0ea6ed..e1d5cee368dac 100644 --- a/torch/_dynamo/variables/tensor.py +++ b/torch/_dynamo/variables/tensor.py @@ -958,7 +958,9 @@ def set_name_hint(self, name: str): class SymNodeVariable(VariableTracker): """ - Represents a symbolic size, e.g., as returned by tensor.size(0) + Represents a symbolic scalar, either int, float or bool. This is most commonly used to + handle symbolic size computation, e.g., tensor.size(0), but it is also used to + handle logic like float_tensor.item() or unspecialized float inputs. """ _nonvar_fields = { @@ -986,6 +988,7 @@ def __init__(self, proxy, sym_num, **kwargs): self.proxy = proxy # TODO: Should we allow non SymTypes here? 
Today it is allowed self.sym_num = sym_num + self._tensor_var = None def python_type(self): if isinstance(self.sym_num, SymTypes): @@ -996,6 +999,15 @@ def python_type(self): def as_proxy(self): return self.proxy + def as_tensor(self, tx): + if self._tensor_var is None: + from .builder import SourcelessBuilder + + self._tensor_var = SourcelessBuilder.create( + tx, torch.scalar_tensor + ).call_function(tx, [self], {}) + return self._tensor_var + def evaluate_expr(self, output_graph=None): try: return guard_scalar(self.sym_num) diff --git a/torch/_dynamo/variables/torch.py b/torch/_dynamo/variables/torch.py index 47705cdc07e1f..8e7089f080595 100644 --- a/torch/_dynamo/variables/torch.py +++ b/torch/_dynamo/variables/torch.py @@ -83,6 +83,7 @@ torch._assert, torch._utils._get_device_index, torch._C._get_cublas_allow_tf32, + torch._C._is_any_autocast_enabled, torch.cuda.get_device_properties, torch.cuda.is_available, torch.distributed.is_available, diff --git a/torch/_export/passes/_node_metadata_hook.py b/torch/_export/passes/_node_metadata_hook.py index 529690ed934f2..9ca554349fab3 100644 --- a/torch/_export/passes/_node_metadata_hook.py +++ b/torch/_export/passes/_node_metadata_hook.py @@ -37,7 +37,7 @@ def _node_metadata_hook(node: torch.fx.Node, stack_trace: str) -> None: node.meta["val"] = fake_res node.meta["stack_trace"] = stack_trace - node.meta["nn_module_stack"] = arg_meta["nn_module_stack"] + node.meta["nn_module_stack"] = arg_meta.get("nn_module_stack", {}) node.meta["torch_fn"] = ( f"{node.target.__name__}_0", f"{node.target.__class__.__name__}.{node.target.__name__}", diff --git a/torch/_export/passes/replace_set_grad_with_hop_pass.py b/torch/_export/passes/replace_set_grad_with_hop_pass.py index e362ee3547715..91104c17c38d7 100644 --- a/torch/_export/passes/replace_set_grad_with_hop_pass.py +++ b/torch/_export/passes/replace_set_grad_with_hop_pass.py @@ -60,6 +60,12 @@ def _replace_with_hop(node: torch.fx.Node): set_grad_node.meta.get("nn_module_stack", {}) ) output_node = next(iter(reversed(sub_gm.graph.nodes)), None) + # Split_module pass intentially doesn't add output node + # if the graph doesn't return anything. + # TODO (tmanlaibaatar) Figure out if this is right behaviour + # for split_module + if isinstance(output_node, torch.fx.Node) and output_node.op != "output": + output_node = None if output_node is not None: assert len(output_node.args) == 1 output_args = output_node.args[0] @@ -106,9 +112,7 @@ def _replace_with_hop(node: torch.fx.Node): f"repalce_set_grad_with_hop_pass doesnt' support output type {type(output_args)}" ) else: - raise NotImplementedError( - "Cannot replace a call_module with a hop if it has no output. This module will gets DCEed." 
- ) + node.graph.erase_node(node) sub_graph.erase_node(set_grad_node) @@ -164,6 +168,7 @@ def _maybe_inline_or_replace_with_hop(node: torch.fx.Node): else node ), ) + new_gm.recompile() return new_gm return gm diff --git a/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py b/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py index f0d39fd1e858e..320a899e6b646 100644 --- a/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py +++ b/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py @@ -88,13 +88,18 @@ def _compute_output_meta_with_inductor_strides(fw_module, fwd_output_strides): # will only be set for inductor if not fwd_output_strides: return out - with TracingContext.get().fake_mode.shape_env.suppress_guards(): - for i in range(len(out)): - if not isinstance(out[i], Tensor): - continue - if all(s1 == s2 for s1, s2 in zip(out[i].stride(), fwd_output_strides[i])): - continue - out[i] = out[i].as_strided(out[i].shape, fwd_output_strides[i]) + + from torch.fx.experimental.symbolic_shapes import statically_known_true + + for i in range(len(out)): + if not isinstance(out[i], Tensor): + continue + if all( + statically_known_true(s1 == s2) + for s1, s2 in zip(out[i].stride(), fwd_output_strides[i]) + ): + continue + out[i] = out[i].as_strided(out[i].shape, fwd_output_strides[i]) return out @@ -141,7 +146,6 @@ def aot_dispatch_base( ( fw_module, updated_flat_args, - aot_config, fw_metadata, ) = fakified_out_wrapper.pre_compile( fw_module, updated_flat_args, aot_config, fw_metadata=fw_metadata @@ -150,7 +154,6 @@ def aot_dispatch_base( ( fw_module, updated_flat_args, - aot_config, fw_metadata, ) = functionalized_rng_wrapper.pre_compile( fw_module, updated_flat_args, aot_config, fw_metadata=fw_metadata @@ -187,12 +190,12 @@ def aot_dispatch_base( # Create a wrapper to set up the rng functionalize and fakified out bits compiled_fw = functionalized_rng_wrapper.post_compile( - compiled_fw, aot_config, fw_metadata=fw_metadata + compiled_fw, aot_config, runtime_metadata=fw_metadata ) compiled_fw = fakified_out_wrapper.post_compile( compiled_fw, aot_config, - fw_metadata=fw_metadata, + runtime_metadata=fw_metadata, ) # Why do we need to pass in num_fw_outs_saved_for_bw? 
# See Note: [Partitioner handling for Subclasses, Part 2] @@ -205,7 +208,7 @@ def aot_dispatch_base( ).post_compile( compiled_fw, aot_config, # not used - fw_metadata=fw_metadata, + runtime_metadata=fw_metadata, ) if not hasattr(compiled_fw_func, "_boxed_call"): @@ -218,7 +221,7 @@ def aot_dispatch_base( ).post_compile( compiled_fw_func, aot_config, - fw_metadata=fw_metadata, + runtime_metadata=fw_metadata, ) return compiled_fn @@ -420,7 +423,6 @@ def aot_dispatch_autograd( ( fw_module, adjusted_flat_args, - aot_config, fw_metadata, ) = fakified_out_wrapper.pre_compile( fw_module, adjusted_flat_args, aot_config, fw_metadata=fw_metadata @@ -432,7 +434,6 @@ def aot_dispatch_autograd( ( fw_module, adjusted_flat_args, - aot_config, fw_metadata, ) = functionalized_rng_wrapper.pre_compile( fw_module, adjusted_flat_args, aot_config, fw_metadata=fw_metadata @@ -457,16 +458,16 @@ def aot_dispatch_autograd( ).post_compile( compiled_fw_func, aot_config, # not used - fw_metadata=fw_metadata, + runtime_metadata=fw_metadata, ) compiled_fw_func = functionalized_rng_wrapper.post_compile( - compiled_fw_func, aot_config, fw_metadata=fw_metadata + compiled_fw_func, aot_config, runtime_metadata=fw_metadata ) compiled_fw_func = fakified_out_wrapper.post_compile( compiled_fw_func, aot_config, - fw_metadata=fw_metadata, + runtime_metadata=fw_metadata, ) # NB: It's important to compile backwards ahead of time, as this may @@ -1032,7 +1033,7 @@ def backward(ctx, *args): ).post_compile( CompiledFunction.apply, aot_config, - fw_metadata=fw_metadata, + runtime_metadata=fw_metadata, ) if not config.debug_assert: diff --git a/torch/_functorch/_aot_autograd/runtime_wrappers.py b/torch/_functorch/_aot_autograd/runtime_wrappers.py index 934b783f6fc8d..a1fb2980ed1d4 100644 --- a/torch/_functorch/_aot_autograd/runtime_wrappers.py +++ b/torch/_functorch/_aot_autograd/runtime_wrappers.py @@ -73,7 +73,7 @@ def pre_compile( aot_config: AOTConfig, *, fw_metadata: ViewAndMutationMeta, - ): + ) -> Tuple[Callable, List[Tensor], ViewAndMutationMeta]: """ Process the inputs to the compiler_fn. You can pass in extra metadata via kwargs. Args: @@ -82,15 +82,15 @@ def pre_compile( aot_config: AOTConfig passed in at compile time fw_metadata: ViewAndMutationMeta generated from flat_fn and flat_args """ - return flat_fn, flat_args, aot_config, fw_metadata + return flat_fn, flat_args, fw_metadata - def post_compile(self, compiled_fn, aot_config, *, fw_metadata): + def post_compile(self, compiled_fn, aot_config, *, runtime_metadata) -> Callable: """ Given an output of the compiler, wrap it with information received from prologue. Args: compiled_fn: Callable after calling compiler_fn aot_config: AOTConfig after calling prologue - fw_metadata: ViewAndMutationMeta after calling prologue + runtime_metadata: ViewAndMutationMeta after calling all wrappers's pre_compile steps. 
Example: def wrapped_compiled_fn(args): @@ -101,28 +101,6 @@ def wrapped_compiled_fn(args): """ return compiled_fn - def create( - self, - flat_fn, - flat_args: List[Tensor], - aot_config: AOTConfig, - *, - fw_metadata: ViewAndMutationMeta, - compiler_fn, - ): - ( - wrapped_flat_fn, - new_flat_args, - new_aot_config, - new_fw_metadata, - ) = self.pre_compile(flat_fn, flat_args, aot_config, fw_metadata=fw_metadata) - compiled_fn = compiler_fn( - wrapped_flat_fn, new_flat_args, new_aot_config, fw_metadata=new_fw_metadata - ) - return self.post_compile( - compiled_fn, new_aot_config, fw_metadata=new_fw_metadata - ) - # The wrapper created by this function handles all of the runtime aliasing and mutation "epilogue" logic # that needs to run after the compiled function. @@ -143,11 +121,11 @@ def post_compile( compiled_fn, aot_config: AOTConfig, *, - fw_metadata: ViewAndMutationMeta, + runtime_metadata: ViewAndMutationMeta, ): return _create_runtime_wrapper( compiled_fn, - runtime_metadata=fw_metadata, + runtime_metadata=runtime_metadata, indices_of_inps_to_detach=self.indices_of_inps_to_detach, trace_joint=self.trace_joint, keep_input_mutations=aot_config.keep_inference_input_mutations, @@ -421,7 +399,7 @@ def pre_compile( aot_config, *, fw_metadata, - ): + ) -> Tuple[Callable, List[Tensor], ViewAndMutationMeta]: if config.functionalize_rng_ops: # Update example inputs for the fw_compiler fake_mode = detect_fake_mode() @@ -430,27 +408,27 @@ def pre_compile( # We are not clearing flat_args here because # 1) There is a check in the debug compiler at the end # 2) It does not matter as these are fake tensors - return flat_fn, flat_args, aot_config, fw_metadata + return flat_fn, flat_args, fw_metadata def post_compile( self, compiled_fn, aot_config: AOTConfig, *, - fw_metadata: ViewAndMutationMeta, + runtime_metadata: ViewAndMutationMeta, ): @wraps(compiled_fn) def wrapper(runtime_args: List[Any]): - if fw_metadata.is_rng_op_functionalized: + if runtime_metadata.is_rng_op_functionalized: # Add the seed and offset to args seed, offset = CUDARngStateHelper.get_torch_state_as_tuple() runtime_args.extend([seed, offset]) out = compiled_fn(runtime_args) out = self._functionalized_rng_runtime_epilogue( - fw_metadata, + runtime_metadata, out, # TODO: this won't be right for the backward when we convert the call_compiled_backward to use the wrapper - fw_metadata.num_forward_returns, + runtime_metadata.num_forward_returns, ) return out return compiled_fn(runtime_args) @@ -493,7 +471,7 @@ def pre_compile( aot_config, *, fw_metadata, - ): + ) -> Tuple[Callable, List[Tensor], ViewAndMutationMeta]: tracing_context = torch._guards.TracingContext.try_get() if tracing_context and tracing_context.fakify_first_call: self.out_metas = [ @@ -501,7 +479,7 @@ def pre_compile( ] else: self.needs_post_compile = False - return fw_module, flat_args, aot_config, fw_metadata + return fw_module, flat_args, fw_metadata def _compute_output_meta_with_inductor_strides(self): out = self.out_metas @@ -528,7 +506,7 @@ def post_compile( compiled_fn, aot_config: AOTConfig, *, - fw_metadata: ViewAndMutationMeta, + runtime_metadata: ViewAndMutationMeta, ): if self.needs_post_compile: assert self.fwd_output_strides is not None @@ -575,19 +553,19 @@ def pre_compile( fw_only=self.fw_only, # type: ignore[arg-type] ) self.maybe_subclass_meta = subclass_meta - return new_flat_fn, new_flat_args, aot_config, fw_metadata + return new_flat_fn, new_flat_args, fw_metadata def post_compile( self, compiled_fn, _aot_config: AOTConfig, *, - fw_metadata: 
ViewAndMutationMeta, + runtime_metadata: ViewAndMutationMeta, ): if self.maybe_subclass_meta is None: return compiled_fn - subclass_metas = fw_metadata.subclass_fw_graph_out_meta + subclass_metas = runtime_metadata.subclass_fw_graph_out_meta @wraps(compiled_fn) def inner_fn(args: List[Any]): @@ -713,7 +691,7 @@ def pre_compile( aot_config: AOTConfig, *, fw_metadata: ViewAndMutationMeta, - ): + ) -> Tuple[Callable, List[Tensor], ViewAndMutationMeta]: # Use information about whether or not flat_fn mutates its arguments # or not to handle dupe args @@ -740,7 +718,7 @@ def pre_compile( if ok: self.needs_post_compile = False - return flat_fn, leaf_flat_args, aot_config, fw_metadata + return flat_fn, leaf_flat_args, fw_metadata if requires_subclass_dispatch(leaf_flat_args, fw_metadata): raise RuntimeError( @@ -865,14 +843,14 @@ def wrapped_flat_fn(*args): ref_fw_metadata == updated_fw_metadata ), f"ref_metadata={str(ref_fw_metadata)}, actual_metadata={str(updated_fw_metadata)}" - return wrapped_flat_fn, deduped_flat_args, aot_config, updated_fw_metadata + return wrapped_flat_fn, deduped_flat_args, updated_fw_metadata def post_compile( self, compiled_fn, aot_config: AOTConfig, *, - fw_metadata: ViewAndMutationMeta, + runtime_metadata: ViewAndMutationMeta, ): if not self.needs_post_compile: return compiled_fn @@ -932,6 +910,8 @@ def debugged_compiled_fn(args): # would cause us to hit that path more frequently). @dataclass class AOTSyntheticBaseWrapper(CompilerWrapper): + # Currently, the only reason we need to plumb this bool is because + # the synthetic base code prohibits more cases in the autograd case than the inference case. trace_joint: bool # TODO: refactor trace_joint needs_post_compile: bool = True aliased_arg_idx_with_metadata_mutations: List[int] = field(default_factory=list) @@ -943,7 +923,7 @@ def pre_compile( aot_config: AOTConfig, *, fw_metadata: ViewAndMutationMeta, - ): + ) -> Tuple[Callable, List[Tensor], ViewAndMutationMeta]: is_inference = not self.trace_joint flat_args_with_synthetic_bases, synthetic_base_info = merge_view_inputs( flat_args, @@ -954,7 +934,7 @@ def pre_compile( # Happy path: we don't need synthetic bases if synthetic_base_info is None: self.needs_post_compile = False - return flat_fn, flat_args, aot_config, fw_metadata + return flat_fn, flat_args, fw_metadata # export path: ban synthetic bases for now, add later if requested. 
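The export-path check the comment above refers to continues just below; first, an aside on the contract these runtime_wrappers.py hunks migrate to: CompilerWrapper.pre_compile now returns a (flat_fn, flat_args, fw_metadata) triple with aot_config threaded through unchanged, and post_compile receives the finalized metadata as runtime_metadata. A torch-free toy sketch of how such a chain composes (LoggingWrapper and toy_compiler are illustrative stand-ins, not the real AOTDedupeWrapper, AOTSyntheticBaseWrapper, or compiler_fn):

class LoggingWrapper:
    """Illustrative stand-in for a CompilerWrapper; not a real torch class."""
    def __init__(self, tag):
        self.tag = tag

    def pre_compile(self, fn, args, metadata):
        # May rewrite any of the three; here we just record that we ran.
        return fn, args, {**metadata, self.tag: "pre_compile ran"}

    def post_compile(self, compiled_fn, *, runtime_metadata):
        def wrapped(*args):
            print(f"[{self.tag}] runtime epilogue, metadata:", runtime_metadata)
            return compiled_fn(*args)
        return wrapped

def toy_compiler(fn, args, metadata):
    return fn  # stand-in for the real compiler_fn

wrappers = [LoggingWrapper("dedupe"), LoggingWrapper("synthetic_base")]
fn, args, metadata = (lambda x: x + 1), (41,), {}
for w in wrappers:
    fn, args, metadata = w.pre_compile(fn, args, metadata)
runtime_metadata = metadata  # fixed once every pre_compile has run
compiled = toy_compiler(fn, args, runtime_metadata)
for w in reversed(wrappers):
    compiled = w.post_compile(compiled, runtime_metadata=runtime_metadata)
print(compiled(*args))  # "dedupe" epilogue prints outermost, then 42

pre_compile steps run in declaration order and may rewrite fn, args, and metadata; post_compile runs in reverse, so the first wrapper's runtime epilogue ends up outermost, mirroring the loop added in aot_autograd.py below.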
if requires_subclass_dispatch(flat_args, fw_metadata): @@ -1050,7 +1030,6 @@ def wrapped_flat_fn(*args): return ( wrapped_flat_fn, flat_args_with_synthetic_bases, - aot_config, fw_metadata_updated, ) @@ -1059,7 +1038,7 @@ def post_compile( compiled_fn, aot_config: AOTConfig, *, - fw_metadata: ViewAndMutationMeta, + runtime_metadata: ViewAndMutationMeta, ): if not self.needs_post_compile: return compiled_fn diff --git a/torch/_functorch/aot_autograd.py b/torch/_functorch/aot_autograd.py index f1ba67794bc70..379518fb958c3 100644 --- a/torch/_functorch/aot_autograd.py +++ b/torch/_functorch/aot_autograd.py @@ -670,23 +670,26 @@ def convert(idx, x): aot_dispatch_base_graph if aot_config.is_export else aot_dispatch_base ) - wrappers = [ + # Wrappers that edit fw_metadata + fw_metadata_wrappers = [ AOTDedupeWrapper(), AOTSyntheticBaseWrapper(trace_joint=needs_autograd), # Add more passes here ] - for wrapper in wrappers: - flat_fn, fake_flat_args, aot_config, fw_metadata = wrapper.pre_compile( + for wrapper in fw_metadata_wrappers: + flat_fn, fake_flat_args, fw_metadata = wrapper.pre_compile( flat_fn, fake_flat_args, aot_config, fw_metadata=fw_metadata ) + # Once all fw_metadata_wrappers have run, runtime_metadata is fixed + runtime_metadata = fw_metadata compiled_fn = compiler_fn( - flat_fn, fake_flat_args, aot_config, fw_metadata=fw_metadata + flat_fn, fake_flat_args, aot_config, fw_metadata=runtime_metadata ) - for wrapper in reversed(wrappers): + for wrapper in reversed(fw_metadata_wrappers): compiled_fn = wrapper.post_compile( - compiled_fn, aot_config, fw_metadata=fw_metadata + compiled_fn, aot_config, runtime_metadata=runtime_metadata ) if aot_config.is_export: diff --git a/torch/_higher_order_ops/associative_scan.py b/torch/_higher_order_ops/associative_scan.py index 287e59ea00932..8b406f39a64d7 100644 --- a/torch/_higher_order_ops/associative_scan.py +++ b/torch/_higher_order_ops/associative_scan.py @@ -110,16 +110,12 @@ def add(x: torch.Tensor, y: torch.Tensor): def trace_associative_scan( proxy_mode, func_overload, combine_fn: Callable, input: List[torch.Tensor], dim: int ): - pre_dispatch = getattr(proxy_mode, "pre_dispatch", False) - with disable_proxy_modes_tracing(): sample_inputs = [ torch.full((), False, dtype=x.dtype, device=x.device) for x in itertools.chain(input, input) ] - combine_graph = reenter_make_fx(combine_fn, pre_dispatch=pre_dispatch)( - *sample_inputs - ) + combine_graph = reenter_make_fx(combine_fn)(*sample_inputs) outputs = None for node in combine_graph.graph.nodes: diff --git a/torch/_higher_order_ops/cond.py b/torch/_higher_order_ops/cond.py index 40aee90affccd..359feb192ae5c 100644 --- a/torch/_higher_order_ops/cond.py +++ b/torch/_higher_order_ops/cond.py @@ -29,7 +29,6 @@ from torch._subclasses.fake_tensor import FakeTensorMode from torch.fx.experimental.proxy_tensor import ( _temp_remove_pre_dispatch_torch_function_mode, - disable_proxy_modes_tracing, ProxyTorchDispatchMode, track_tensor_tree, ) @@ -159,11 +158,8 @@ def trace_cond(proxy_mode, func_overload, pred, true_fn, false_fn, operands): isinstance(o, torch.Tensor) for o in operands ), "Cond operands must be a list of tensors" - pre_dispatch = getattr(proxy_mode, "pre_dispatch", False) - - with disable_proxy_modes_tracing(): - true_graph = reenter_make_fx(true_fn, pre_dispatch)(*operands) - false_graph = reenter_make_fx(false_fn, pre_dispatch)(*operands) + true_graph = reenter_make_fx(true_fn)(*operands) + false_graph = reenter_make_fx(false_fn)(*operands) true_outs = [] false_outs = [] diff --git 
a/torch/_higher_order_ops/flex_attention.py b/torch/_higher_order_ops/flex_attention.py index 664bfe1c4dd0a..b5e1385da346b 100644 --- a/torch/_higher_order_ops/flex_attention.py +++ b/torch/_higher_order_ops/flex_attention.py @@ -6,6 +6,7 @@ from torch._higher_order_ops.utils import ( _has_potential_branch_input_mutation, autograd_not_implemented, + reenter_make_fx, UnsupportedAliasMutationException, ) from torch._ops import HigherOrderOperator @@ -178,7 +179,7 @@ def trace_flex_attention( torch.zeros((), dtype=query.dtype, requires_grad=query.requires_grad) ] + [torch.zeros((), dtype=torch.int) for _ in range(4)] with TransformGetItemToIndex(): - score_graph = make_fx(score_mod)(*example_vals, *other_buffers) + score_graph = reenter_make_fx(score_mod)(*example_vals, *other_buffers) qualname = proxy_mode.tracer.get_fresh_qualname("sdpa_score") proxy_mode.tracer.root.register_module(qualname, score_graph) node_args = (query, key, value, score_graph, *other_buffers) diff --git a/torch/_higher_order_ops/map.py b/torch/_higher_order_ops/map.py index 6bef897dfa511..2bf88ea19565f 100644 --- a/torch/_higher_order_ops/map.py +++ b/torch/_higher_order_ops/map.py @@ -230,8 +230,7 @@ def trace_map(proxy_mode, func_overload, f, xs, pos_args): example_input = _unstack_pytree(xs)[0] body_graph = f - pre_dispatch = getattr(proxy_mode, "pre_dispatch", False) - body_graph = reenter_make_fx(body_graph, pre_dispatch)(*example_input, *pos_args) + body_graph = reenter_make_fx(body_graph)(*example_input, *pos_args) next_name = proxy_mode.tracer.get_fresh_qualname("body_graph_") diff --git a/torch/_higher_order_ops/utils.py b/torch/_higher_order_ops/utils.py index 32bb465041ce5..0fcf22bcc3388 100644 --- a/torch/_higher_order_ops/utils.py +++ b/torch/_higher_order_ops/utils.py @@ -1,3 +1,4 @@ +import functools from contextlib import contextmanager from dataclasses import dataclass from typing import Any, Callable @@ -76,16 +77,19 @@ def graph_with_interpreter(*args): return maybe_interpreted_fn -# We'll use the current decomposition table to make sure operators in subgraphs are -# decomposed properly. 
-# We also need to maybe run with interpreter for propagating stack_trace -def reenter_make_fx(fn, pre_dispatch=False): - decomp_table = torch.fx.experimental.proxy_tensor.CURRENT_DECOMPOSITION_TABLE - return make_fx( - _maybe_run_with_interpreter(fn), - decomposition_table=decomp_table, - pre_dispatch=pre_dispatch, - ) +def reenter_make_fx(fn): + from torch.fx.experimental.proxy_tensor import _CURRENT_MAKE_FX_TRACER + + @functools.wraps(fn) + def wrapped(*args): + assert ( + _CURRENT_MAKE_FX_TRACER is not None + ), "Cannot reenter make_fx when we're not under a make_fx tracing session" + return _CURRENT_MAKE_FX_TRACER.trace_subgraph( + _maybe_run_with_interpreter(fn), *args + ) + + return wrapped @contextmanager diff --git a/torch/_higher_order_ops/while_loop.py b/torch/_higher_order_ops/while_loop.py index 15bacb4bc1942..b0ab00bdfac45 100644 --- a/torch/_higher_order_ops/while_loop.py +++ b/torch/_higher_order_ops/while_loop.py @@ -15,11 +15,7 @@ ) from torch._ops import HigherOrderOperator from torch._subclasses.fake_tensor import FakeTensorMode -from torch.fx.experimental.proxy_tensor import ( - disable_proxy_modes_tracing, - ProxyTorchDispatchMode, - track_tensor_tree, -) +from torch.fx.experimental.proxy_tensor import ProxyTorchDispatchMode, track_tensor_tree class WhileLoopOp(HigherOrderOperator): @@ -189,14 +185,8 @@ def while_loop_tracing(mode, cond_fn, body_fn, carried_inputs, additional_inputs def _trace_while_loop( proxy_mode, while_loop_op, cond_fn, body_fn, carried_inputs, additional_inputs ): - pre_dispatch = getattr(proxy_mode, "pre_dispatch", False) - with disable_proxy_modes_tracing(): - cond_graph = reenter_make_fx(cond_fn, pre_dispatch)( - *carried_inputs, *additional_inputs - ) - body_graph = reenter_make_fx(body_fn, pre_dispatch)( - *carried_inputs, *additional_inputs - ) + cond_graph = reenter_make_fx(cond_fn)(*carried_inputs, *additional_inputs) + body_graph = reenter_make_fx(body_fn)(*carried_inputs, *additional_inputs) next_name = None i = 0 diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py index 13a61e10689b2..5ac418b847f85 100644 --- a/torch/_inductor/codecache.py +++ b/torch/_inductor/codecache.py @@ -564,23 +564,27 @@ def get_str(obj) -> str: @functools.lru_cache(None) -def get_inductor_code_hash() -> bytes: +def torch_key(): """ - Compute a hash of all inductor code modules. Used by the FxGraph cache - so any inductor code changes would result in new cache keys. 
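A rough sketch of the "reenter the ambient tracer" pattern that the new reenter_make_fx relies on: a module-level reference points at the currently active tracer, and nested helpers assert it exists and delegate subgraph tracing to it. The tracer class below is a toy stand-in for _CURRENT_MAKE_FX_TRACER, not the real implementation.

import functools

_CURRENT_TRACER = None  # toy stand-in for _CURRENT_MAKE_FX_TRACER

class ToyTracer:
    def __init__(self):
        self.subgraphs = []

    def trace_subgraph(self, fn, *args):
        # The real tracer builds an FX subgraph; this one only records calls.
        self.subgraphs.append(fn.__name__)
        return fn(*args)

def reenter_trace(fn):
    @functools.wraps(fn)
    def wrapped(*args):
        assert _CURRENT_TRACER is not None, "not inside a tracing session"
        return _CURRENT_TRACER.trace_subgraph(fn, *args)
    return wrapped

def body(x):
    return x * 2

_CURRENT_TRACER = ToyTracer()
print(reenter_trace(body)(3))      # 6
print(_CURRENT_TRACER.subgraphs)   # ['body']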
+ Compute a key that contains relevant information about torch source files """ - inductor_root = os.path.dirname(__file__) + if not config.is_fbcode(): + inductor_root = os.path.dirname(__file__) - contents: Dict[str, bytes] = {} - for lib in pkgutil.iter_modules([inductor_root]): - spec = lib.module_finder.find_spec(lib.name, None) - assert spec is not None - module = spec.origin - assert module is not None - with open(module, "rb") as f: - contents[module] = f.read() + contents: Dict[str, bytes] = {torch.__version__: b""} + for lib in pkgutil.iter_modules([inductor_root]): + spec = lib.module_finder.find_spec(lib.name, None) + assert spec is not None + module = spec.origin + assert module is not None + with open(module, "rb") as f: + contents[module] = f.read() + + return hashlib.sha256(pickle.dumps(contents)).digest() + + from libfb.py import parutil - return hashlib.sha256(pickle.dumps(contents)).digest() + return parutil.get_file_contents("torch/src_hash.txt").rstrip() @dataclasses.dataclass @@ -645,11 +649,9 @@ def __init__( ) # Also hash on various system info (including the triton compiler version). - self.torch_version = torch.__version__ + self.torch_version = torch_key() self.system_info = CacheBase.get_system() - # And the inductor configuration and code. - self.inductor_code_hash = get_inductor_code_hash() try: self.inductor_config = config.save_config() except (TypeError, AttributeError) as e: diff --git a/torch/_inductor/codegen/common.py b/torch/_inductor/codegen/common.py index 2b7d6c65704e7..0d90e474d04b5 100644 --- a/torch/_inductor/codegen/common.py +++ b/torch/_inductor/codegen/common.py @@ -1717,7 +1717,14 @@ def rename_indexing(self, index) -> sympy.Expr: replacements = { x: self.args.size(x) for x in sorted_symbols - if symbol_is_type(x, (SymT.UNBACKED_INT, SymT.SIZE, SymT.PRECOMPUTED_SIZE)) + if symbol_is_type( + x, + ( + SymT.UNBACKED_INT, + SymT.SIZE, + SymT.PRECOMPUTED_SIZE, + ), + ) } return sympy_subs(index, replacements) diff --git a/torch/_inductor/codegen/cpp_micro_gemm.py b/torch/_inductor/codegen/cpp_micro_gemm.py index 7d54bd8605ec4..353562923c91c 100644 --- a/torch/_inductor/codegen/cpp_micro_gemm.py +++ b/torch/_inductor/codegen/cpp_micro_gemm.py @@ -344,7 +344,7 @@ def create_from_config(cls, config: CppMicroGemmConfig): assert isinstance(n, int) or n.is_number, n assert isinstance(k, int) or k.is_number, k - m = V.graph.sizevars.size_hint(m) if isinstance(m, sympy.Expr) else m + m = V.graph.sizevars.size_hint(m, fallback=1) if isinstance(m, sympy.Expr) else m assert isinstance(m, int), m if output_dtype is None: output_dtype = input_dtype diff --git a/torch/_inductor/codegen/cpp_prefix.h b/torch/_inductor/codegen/cpp_prefix.h index c034522b83332..45f874fc4d269 100644 --- a/torch/_inductor/codegen/cpp_prefix.h +++ b/torch/_inductor/codegen/cpp_prefix.h @@ -152,6 +152,28 @@ inline at::vec::Vectorized vec_shuffle_down(at::vec::Vectorized x, } #endif +#ifdef CPU_CAPABILITY_AVX512 +inline at::vec::Vectorized vec_shuffle_down(at::vec::Vectorized x, size_t n) { + using vec_t = at::vec::Vectorized; +#define SHUFFLE_MASK(z, y, x, w) ((z << 6) | (y << 4) | (x << 2) | w) + switch (n) { + case 1: + return vec_t(_mm512_permute_ps(x, SHUFFLE_MASK(1, 1, 3, 3))); + case 2: + return vec_t(_mm512_permute_ps(x, SHUFFLE_MASK(2, 2, 2, 2))); + case 4: + return vec_t(_mm512_permutexvar_ps( + _mm512_set_epi32( + 12, 12, 12, 12, 12, 12, 12, 12, 4, 4, 4, 4, 4, 4, 4, 4), + x)); + case 8: + return vec_t(_mm512_permutexvar_ps( + _mm512_set_epi32(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 
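The new torch_key() in the codecache hunk above folds torch.__version__ plus the contents of every module under torch/_inductor into a single digest, so any source change invalidates FxGraph cache entries. The same idea applied to an arbitrary package root (the usage at the bottom is just an example and assumes torch is importable):

import hashlib
import os
import pickle
import pkgutil
from typing import Dict

def package_source_key(pkg_root: str, version: str) -> bytes:
    """Digest of all module sources under pkg_root, plus a version string."""
    contents: Dict[str, bytes] = {version: b""}
    for lib in pkgutil.iter_modules([pkg_root]):
        spec = lib.module_finder.find_spec(lib.name, None)
        if spec is None or spec.origin is None:
            continue
        with open(spec.origin, "rb") as f:
            contents[spec.origin] = f.read()
    return hashlib.sha256(pickle.dumps(contents)).digest()

import torch, torch._inductor
print(package_source_key(os.path.dirname(torch._inductor.__file__),
                         torch.__version__).hex()[:16])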
8, 8, 8, 8, 8), x)); + } + TORCH_CHECK(false, "Unhandled vec_shuffle_down value ", n); +} +#endif + template Welford welford_vec_reduce_all(Welford> acc) { using Vec = at::vec::Vectorized; diff --git a/torch/_inductor/codegen/cpp_utils.py b/torch/_inductor/codegen/cpp_utils.py index 5e27f99f181d7..a3b4fd3206b6b 100644 --- a/torch/_inductor/codegen/cpp_utils.py +++ b/torch/_inductor/codegen/cpp_utils.py @@ -107,16 +107,19 @@ def _print_Pow(self, expr): if exp == 0.5 or exp == -0.5: return f"std::sqrt({base})" if exp == 0.5 else f"1.0/std::sqrt({base})" - assert exp.is_integer - exp = int(exp) - if exp > 0: - r = "*".join([self.paren(base)] * exp) - elif exp < 0: - r = "1.0/" + self.paren("*".join([self.paren(base)] * abs(exp))) - else: # exp == 0 - r = "1.0" - - return f"static_cast<{INDEX_TYPE}>({r})" if expr.is_integer else r + if exp.is_integer: + exp = int(exp) + if exp > 0: + r = "*".join([self.paren(base)] * exp) + elif exp < 0: + r = "1.0/" + self.paren("*".join([self.paren(base)] * abs(exp))) + else: # exp == 0 + r = "1.0" + + return f"static_cast<{INDEX_TYPE}>({r})" if expr.is_integer else r + else: + # TODO: float vs double + return f"std::pow({base}, {float(exp)})" def _print_Rational(self, expr): # Uses float constants to perform FP div diff --git a/torch/_inductor/codegen/cpp_wrapper_cpu.py b/torch/_inductor/codegen/cpp_wrapper_cpu.py index fafd176dc26a5..6ce230714632a 100644 --- a/torch/_inductor/codegen/cpp_wrapper_cpu.py +++ b/torch/_inductor/codegen/cpp_wrapper_cpu.py @@ -1158,7 +1158,9 @@ def generate_c_shim_extern_kernel_call(self, kernel, args): # so just avoid wrapping integers. # Name matching is to find tensor is hacky, but fixing all the # ArrayRefTensor issues is not a priority for now. - if isinstance(piece, str) and piece.startswith(("buf", "arg")): + if isinstance(piece, str) and piece.startswith( + ("buf", "arg", "wrap_with_raii_handle_if_needed") + ): piece = f"convert_arrayref_tensor_to_tensor({piece})" wrapped_args.append(piece) diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py index 5ad5f791a9023..ac6699675af1f 100644 --- a/torch/_inductor/codegen/triton.py +++ b/torch/_inductor/codegen/triton.py @@ -1681,7 +1681,15 @@ def indexing( cse_var = self.cse.varname_map[var.name] mask_vars.update(cse_var.mask_vars) elif symbol_is_type( - var, (SymT.UNBACKED_INT, SymT.SIZE, SymT.PRECOMPUTED_SIZE, SymT.INDEX) + var, + ( + SymT.UNBACKED_INT, + SymT.SIZE, + SymT.PRECOMPUTED_SIZE, + SymT.INDEX, + SymT.FLOAT, + SymT.UNBACKED_FLOAT, + ), ): pass else: @@ -2755,6 +2763,7 @@ def inductor_meta_common(): "autotune_local_cache": config.autotune_local_cache, "autotune_pointwise": config.triton.autotune_pointwise, "autotune_remote_cache": config.autotune_remote_cache, + "force_disable_caches": config.force_disable_caches, "dynamic_scale_rblock": config.dynamic_scale_rblock, "max_autotune": config.max_autotune, "max_autotune_pointwise": config.max_autotune_pointwise, diff --git a/torch/_inductor/compile_fx.py b/torch/_inductor/compile_fx.py index cf37b7b9dbe32..bdbfef2eee28f 100644 --- a/torch/_inductor/compile_fx.py +++ b/torch/_inductor/compile_fx.py @@ -39,6 +39,7 @@ from torch._inductor.utils import ( BoxedBool, count_tangents, + fresh_inductor_cache, should_assume_input_aligned, tensor_is_aligned, ) @@ -414,6 +415,15 @@ def get_patched_config_dict(config_patches=None) -> Dict[str, Any]: return config.get_config_copy() +@functools.wraps +def with_fresh_cache_if_config(f): + if config.force_disable_caches: + with fresh_inductor_cache(): + 
return f + else: + return f + + @DebugContext.wrap @torch.utils._python_dispatch._disable_current_modes() @time_and_log(attr="compilation time (in seconds)") @@ -422,6 +432,7 @@ def get_patched_config_dict(config_patches=None) -> Dict[str, Any]: # compile_fx return and we may want to use the _LazyGraphModule for compiling # the backward graph as well. @_use_lazy_graph_module(dynamo_config.use_lazy_graph_module) +@with_fresh_cache_if_config @dynamo_utils.dynamo_timed(phase_name="inductor_compile") def compile_fx_inner( gm: torch.fx.GraphModule, @@ -494,7 +505,11 @@ def compile_fx_inner( start = time.time() fx_graph_remote_cache = should_use_remote_fx_graph_cache() - if (config.fx_graph_cache or fx_graph_remote_cache) and not aot_mode: + if ( + not config.force_disable_caches + and (config.fx_graph_cache or fx_graph_remote_cache) + and not aot_mode + ): compiled_graph = FxGraphCache.load( fx_codegen_and_compile, gm, @@ -1413,7 +1428,6 @@ def partition_fn(graph, joint_inputs, **kwargs): @compile_time_strobelight_meta(phase_name="bw_compiler") @dynamo_utils.dynamo_timed - @dynamo_utils.maybe_cprofile def bw_compiler(model: torch.fx.GraphModule, example_inputs: List[torch.Tensor]): user_visible_outputs = {} diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py index 9968ce460cc02..79af641514bd6 100644 --- a/torch/_inductor/config.py +++ b/torch/_inductor/config.py @@ -33,6 +33,9 @@ def is_fbcode(): # enable autotune remote cache autotune_remote_cache = os.environ.get("TORCHINDUCTOR_AUTOTUNE_REMOTE_CACHE") == "1" +# Force disabled all inductor level caching -- This will override any other caching flag +force_disable_caches = os.environ.get("TORCHINDUCTOR_FORCE_DISABLE_CACHES") == "1" + # use cpp wrapper instead of python wrapper cpp_wrapper = os.environ.get("TORCHINDUCTOR_CPP_WRAPPER", "0") == "1" @@ -315,15 +318,13 @@ def is_fbcode(): benchmark_fusion = os.environ.get("TORCHINDUCTOR_BENCHMARK_FUSION") == "1" enabled_metric_tables = os.environ.get("TORCHINDUCTOR_ENABLED_METRIC_TABLES", "") -benchmark_multi_templates = ( - os.environ.get( - "TORCHINDUCTOR_BENCHMARK_MULTI_TEMPLATES", "0" if is_fbcode() else "1" - ) - == "1" +# For Triton Templates, select fastest of best template + epilogue vs best template + separate epilogue kernel +benchmark_epilogue_fusion = ( + os.environ.get("TORCHINDUCTOR_BENCHMARK_EPILOGUE_FUSION", "1") == "1" ) # Take how many of the top triton kernels to benchmark epilogue -max_epilogue_benchmarked_choices = 3 +max_epilogue_benchmarked_choices = 1 # how many nodes to allow into a single fusion max_fusion_size = 64 @@ -456,6 +457,9 @@ def decide_compile_threads(): # For user visible outputs, inductor will make sure the stride matches with eager. bw_outputs_user_visible = True +# Whether to always use shape padding if it is enabled and possible +force_shape_pad: bool = False + # Fx-based linear/matmul/bmm + permute/transpose vertical fusion permute_fusion = os.environ.get("TORCHINDUCTOR_PERMUTE_FUSION", "0") == "1" diff --git a/torch/_inductor/decomposition.py b/torch/_inductor/decomposition.py index a4fd1a9191c1c..01803af152608 100644 --- a/torch/_inductor/decomposition.py +++ b/torch/_inductor/decomposition.py @@ -28,7 +28,11 @@ ) from . 
import config, inductor_prims -from .utils import needs_fallback_due_to_atomic_add_limitations, use_scatter_fallback +from .utils import ( + is_gpu, + needs_fallback_due_to_atomic_add_limitations, + use_scatter_fallback, +) log = logging.getLogger(__name__) aten = torch.ops.aten @@ -167,7 +171,7 @@ def convolution_backward( groups, output_mask, ): - if not output_mask[2] or grad_output.device.type != "cuda": + if not output_mask[2] or not is_gpu(grad_output.device.type): return NotImplemented grad_bias = aten.sum(grad_output, [0] + list(range(2, grad_output.dim()))) grad_inp, grad_weight, _ = aten.convolution_backward( @@ -593,7 +597,7 @@ def select_decomp_table(): @register_decomposition(aten.masked_scatter) def masked_scatter(self, mask, source): - if self.device.type == "cuda": + if is_gpu(self.device.type): # This two-step algorithm is the same as eager CUDA, for eager CPU we # use a 1-shot serial iteration. self, mask = aten.broadcast_tensors([self, mask]) diff --git a/torch/_inductor/fx_passes/joint_graph.py b/torch/_inductor/fx_passes/joint_graph.py index 3713583e69eee..3302dfd632921 100644 --- a/torch/_inductor/fx_passes/joint_graph.py +++ b/torch/_inductor/fx_passes/joint_graph.py @@ -1,11 +1,14 @@ +import itertools import logging import typing from collections import Counter -from typing import Dict, List, Set +from typing import Dict, List, Set, Union import torch import torch._guards from torch._inductor.constant_folding import ConstantFolder +from torch._inductor.virtualized import V +from torch.fx.experimental.symbolic_shapes import statically_known_true from torch.multiprocessing.reductions import StorageWeakRef from .. import config @@ -14,6 +17,7 @@ init_once_fakemode, KeywordArg, Match, + MULTIPLE, PatternMatcherPass, register_graph_pattern, stable_topological_sort, @@ -22,6 +26,13 @@ log = logging.getLogger(__name__) patterns = PatternMatcherPass() +aten = torch.ops.aten +prims = torch.ops.prims + +pass_patterns = [ + patterns, + PatternMatcherPass(), +] @init_once_fakemode @@ -40,7 +51,6 @@ def remove_no_ops( gm: torch.fx.GraphModule, zeros: Set[torch.fx.Node], ones: Set[torch.fx.Node] ): "Removes no-ops: (+ 0, - 0, * 1, / 1)" - aten = torch.ops.aten graph = gm.graph def fake_tensors_eq(t1, t2, fields=("shape", "dtype", "device")): @@ -308,7 +318,8 @@ def joint_graph_passes(graph: torch.fx.GraphModule): constant_fold_uniform_value(graph) if config.pattern_matcher: - count += patterns.apply(graph.graph) # type: ignore[arg-type] + for patterns in pass_patterns: + count += patterns.apply(graph.graph) # type: ignore[arg-type] if not config.fallback_random: count += replace_random_passes(graph) @@ -362,3 +373,131 @@ def pointless_view(match: Match, arg, size): if size == arg_size: node.replace_all_uses_with(node.args[0]) match.erase_nodes(graph) + + +# When softmax is used with temperature or other scaling, we get the pattern +# +# scale(x) - scale(x).amax(dim, keepdim=True) +# +# which is expected to be at most zero, but we may end up with numerical +# discrepancies # between the recomputed values of scale(x) inside and out +# of the reduction, # depending on compiler optimizations, e.g. use of fma +# instructions. +# +# Here we replace it with the mathematically equivalent, +# +# scale(x - x.amax(dim, keepdim=True)) +# +# which is more stable as we only compute the scaling once. +# +# NOTE: This pattern must come after fused attention matching! 
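The intent of the comment above, written out directly: scaling after the max subtraction means the scaled values are computed only once, so recompute effects (e.g. fma) cannot push the difference above zero, while the two forms agree mathematically. The sketch below simplifies to a positive scalar scale; the actual pattern also multiplies by the sign of the scale so that the argmax is unchanged for negative scales.

import torch

def scaled_logits_unstable(x, scale, dim=-1):
    # scale(x) - scale(x).amax(...): scale(x) appears twice and may be
    # recomputed slightly differently inside and outside the reduction.
    s = x * scale
    return s - s.amax(dim=dim, keepdim=True)

def scaled_logits_stable(x, scale, dim=-1):
    # scale(x - x.amax(...)): the scaling is applied exactly once.
    return (x - x.amax(dim=dim, keepdim=True)) * scale

x = torch.randn(2, 5)
print(torch.allclose(scaled_logits_unstable(x, 0.5),
                     scaled_logits_stable(x, 0.5), atol=1e-6))  # True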
+ + +def _partial_softmax_pattern(linear_func, reverse=False, to_dtype=False): + # Allow matching inp * other and other * input + if reverse: + scaled = CallFunction( + linear_func, KeywordArg("other"), KeywordArg("inp"), _users=MULTIPLE + ) + else: + scaled = CallFunction( + linear_func, KeywordArg("inp"), KeywordArg("other"), _users=MULTIPLE + ) + if to_dtype: + scaled = CallFunction( + prims.convert_element_type, scaled, KeywordArg("dtype"), _users=MULTIPLE + ) + amax = CallFunction( + aten.amax.default, scaled, KeywordArg("dim"), KeywordArg("keepdim") + ) + return CallFunction(aten.sub.Tensor, scaled, amax) + + +def _other_is_broadcasted_in_dim(match): + # Check that the scaling factor is constant across the reduction dim, + # so scaling doesn't change which index corresponds to the maximum value + other = match.kwargs["other"] + if isinstance(other, (int, float)): + return True + + inp = match.kwargs["inp"] + if not all(isinstance(x, torch.fx.Node) for x in (inp, other)): + return False + + inp_example = inp.meta["val"] + other_example = other.meta["val"] + if isinstance(other_example, (torch.SymInt, torch.SymFloat)): + return True + + if not all(isinstance(x, torch.Tensor) for x in (inp_example, other_example)): + return False + + inp_ndim = inp_example.ndim + other_shape = other_example.shape + if inp_ndim < len(other_shape): + return False + + # Pad other_shape to the same ndim as inp + other_shape = [1] * (inp_ndim - len(other_shape)) + list(other_shape) + + dim = match.kwargs["dim"] + if isinstance(dim, int): + dim = (dim,) + + return all(statically_known_true(other_shape[d] == 1) for d in dim) + + +def mul_softmax_pattern(match: Match, *, inp, other, dim, keepdim, dtype=None): + def repl(inp, other): + if dtype is not None: + inp = inp.to(dtype) + + sign: Union[int, float, torch.Tensor] + if isinstance(other, (int, float)): + sign = 1 if other >= 0 else -1 + else: + one = torch.scalar_tensor(1, dtype=inp.dtype, device=inp.device) + sign = torch.where(other >= 0, one, -one) + + inp = inp * sign + max_ = torch.amax(inp, dim=dim, keepdim=keepdim) + return (inp - max_) * (sign * other) + + with V.fake_mode: + match.replace_by_example(repl, [inp, other]) + + +for reverse, to_dtype in itertools.product((False, True), repeat=2): + register_graph_pattern( + _partial_softmax_pattern(aten.mul.Tensor, reverse=reverse, to_dtype=to_dtype), + pass_dict=pass_patterns[1], + extra_check=_other_is_broadcasted_in_dim, + )(mul_softmax_pattern) + + +def div_softmax_pattern(match: Match, *, inp, other, dim, keepdim, dtype=None): + def repl(inp, other): + if dtype is not None: + inp = inp.to(dtype) + + sign: Union[int, float, torch.Tensor] + if isinstance(other, (int, float)): + sign = 1 if other >= 0 else -1 + else: + one = torch.scalar_tensor(1, dtype=inp.dtype, device=inp.device) + sign = torch.where(other >= 0, one, -one) + + inp = inp * sign + max_ = torch.amax(inp, dim=dim, keepdim=keepdim) + return (inp - max_) / (sign * other) + + with V.fake_mode: + match.replace_by_example(repl, [inp, other]) + + +for to_dtype in (False, True): + register_graph_pattern( + _partial_softmax_pattern(aten.div.Tensor, to_dtype=to_dtype), + pass_dict=pass_patterns[1], + extra_check=_other_is_broadcasted_in_dim, + )(div_softmax_pattern) diff --git a/torch/_inductor/fx_passes/pad_mm.py b/torch/_inductor/fx_passes/pad_mm.py index e351d38d96ec0..df282629e2ce7 100644 --- a/torch/_inductor/fx_passes/pad_mm.py +++ b/torch/_inductor/fx_passes/pad_mm.py @@ -1,4 +1,5 @@ import functools +import operator from typing import 
List, Optional, Union import torch @@ -7,7 +8,7 @@ from torch._inductor import utils from torch._subclasses.fake_tensor import FakeTensor from torch.utils._mode_utils import no_dispatch -from torch.utils._triton import has_triton +from ...utils._triton import has_triton from ..pattern_matcher import fwd_only, gen_register_replacement, joint_fwd_bwd, Match @@ -111,32 +112,10 @@ def addmm_pattern( def should_pad_addmm(match: Match) -> bool: mat1, mat2, input = fetch_fake_tensors(match, ("mat1", "mat2", "input")) return should_pad_common(mat1, mat2, input) and should_pad_bench( - mat1, mat2, torch.ops.aten.addmm, input=input + match, mat1, mat2, torch.ops.aten.addmm, input=input ) -def addmm_replace( - input: Optional[Tensor], mat1: Tensor, mat2: Tensor, beta=1.0, alpha=1.0 -) -> Tensor: - m_padded_length = get_padded_length(mat1.shape[0], get_alignment_size(mat1)) - k_padded_length = get_padded_length(mat1.shape[1], get_alignment_size(mat1)) - n_padded_length = get_padded_length(mat2.shape[1], get_alignment_size(mat2)) - - if m_padded_length != 0 or k_padded_length != 0 or n_padded_length != 0: - return pad_addmm( - input, - mat1, - mat2, - m_padded_length, - k_padded_length, - n_padded_length, - beta, - alpha, - ) - - return aten.addmm(input, mat1, mat2, beta=beta, alpha=alpha) - - def pad_addmm( input: Optional[Tensor], mat1: Tensor, @@ -146,36 +125,55 @@ def pad_addmm( n_padded_length: int, beta=1.0, alpha=1.0, + mat1_pre_padded: bool = False, + mat2_pre_padded: bool = False, ): - # addmm decomp with padding will go through pad_addmm multiple times if multiple dimensions are needed to be padded - if k_padded_length != 0: - mat1 = pad_dim(mat1, k_padded_length, 1) - mat2 = pad_dim(mat2, k_padded_length, 0) - elif n_padded_length != 0: - mat2 = pad_dim(mat2, n_padded_length, 1) - elif m_padded_length != 0: - mat1 = pad_dim(mat1, m_padded_length, 0) + # for paddings, dim order is reversed for some reasons + # and for every dim, we need to specify left and right padding + if not mat1_pre_padded: + mat1 = pad_mat1( + mat1, m_padded_length=m_padded_length, k_padded_length=k_padded_length + ) + if not mat2_pre_padded: + mat2 = pad_mat2( + mat2, k_padded_length=k_padded_length, n_padded_length=n_padded_length + ) # the add broadcasts, so we only pad if the dimension != 1 - if input is not None and k_padded_length == 0: + if input is not None: if n_padded_length != 0: if input.dim() == 2 and input.shape[1] != 1: input = pad_dim(input, n_padded_length, 1) elif input.dim() == 1 and input.shape[0] != 1: input = pad_dim(input, n_padded_length, 0) - elif m_padded_length != 0 and input.dim() == 2 and input.shape[0] != 1: + if m_padded_length != 0 and input.dim() == 2 and input.shape[0] != 1: input = pad_dim(input, m_padded_length, 0) - if k_padded_length != 0: - return addmm_replace(input, mat1, mat2, beta=beta, alpha=alpha) - elif n_padded_length != 0: - return addmm_replace(input, mat1, mat2, beta=beta, alpha=alpha)[ - :, :-n_padded_length - ] - else: - return addmm_replace(input, mat1, mat2, beta=beta, alpha=alpha)[ - :-m_padded_length, : - ] + res = aten.addmm(input, mat1, mat2, beta=beta, alpha=alpha) + + if m_padded_length != 0: + res = res[:-m_padded_length, :] + if n_padded_length != 0: + res = res[:, :-n_padded_length] + return res + + +def addmm_replace( + input: Optional[Tensor], mat1: Tensor, mat2: Tensor, beta=1.0, alpha=1.0 +) -> Tensor: + k_padded_length = get_padded_length(mat1.shape[1], get_alignment_size(mat1)) + n_padded_length = get_padded_length(mat2.shape[1], 
get_alignment_size(mat2)) + m_padded_length = get_padded_length(mat1.shape[0], get_alignment_size(mat1)) + return pad_addmm( + input, + mat1, + mat2, + m_padded_length, + k_padded_length, + n_padded_length, + beta, + alpha, + ) def is_mm_compute_bound(M: int, K: int, N: int, dtype: torch.dtype) -> bool: @@ -216,16 +214,29 @@ def get_pad_cache(): return torch._inductor.codecache.LocalCache() -def get_cached_should_pad(key): +def get_cached_should_pad(key: str) -> bool: return get_pad_cache().lookup(key) -def set_cached_should_pad(key, value): +def set_cached_should_pad(key: str, value: bool): + return get_pad_cache().set_value(key, value=value) + + +def get_cached_base_mm_benchmark_time(key: str) -> float: + return get_pad_cache().lookup(key) + + +def set_cached_base_mm_benchmark_time(key: str, value: float): return get_pad_cache().set_value(key, value=value) def should_pad_bench_key( - mat1: Tensor, mat2: Tensor, op, input: Optional[Tensor] = None + match, + mat1: Tensor, + mat2: Tensor, + op, + input: Optional[Tensor] = None, + is_base_time_key=False, ) -> str: def tensor_key(t): return (t.shape, t.stride(), t.dtype) @@ -233,44 +244,80 @@ def tensor_key(t): tf32_key = ( None if mat1.dtype != torch.float32 else torch.backends.cuda.matmul.allow_tf32 ) + + def fmt_pad(name): + if is_base_time_key: + return None + return f"exclude_pad:{should_exclude_padding_time(match, name)}" + key = ( tensor_key(mat1), tensor_key(mat2), + fmt_pad("mat1"), + fmt_pad("mat2"), op, input if input is None else tensor_key(input), tf32_key, ) - return str(key) + key = str(key) + if is_base_time_key: + key = f"base mm time: {key}" + return key -def should_pad_bench( - mat1: Tensor, mat2: Tensor, op, input: Optional[Tensor] = None -) -> bool: - if not has_triton(): +def get_non_view_def(node): + if node.op == operator.getitem: + return get_non_view_def(node.args[0]) + + if ( + node.op == "call_function" + and isinstance(node.target, torch._ops.OpOverload) + and utils.is_view(node.target) + ): + return get_non_view_def(node.all_input_nodes[0]) + + return node + + +def should_exclude_padding_time(match, arg_name): + node_def = get_non_view_def(match.kwargs[arg_name]) + + # constant padding converts tensors to contiguous so even if the input tensor + # can be planned layout transform is not free. TODO - way to pad and preserve layout ? 
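A simplified version of the padding scheme used by pad_mm/pad_addmm/pad_bmm above: pad K on both operands and M/N on one operand each (the pad spec lists left/right pairs starting from the last dim), run the matmul once, then slice the padding back off. get_padded_length and get_alignment_size are the real helpers; the fixed align=8 below is only for illustration.

import torch
import torch.nn.functional as F

def padded_len(n: int, align: int = 8) -> int:
    # Illustrative stand-in for get_padded_length(n, get_alignment_size(...)).
    return 0 if n % align == 0 else align - n % align

def pad_mm_sketch(mat1: torch.Tensor, mat2: torch.Tensor) -> torch.Tensor:
    m_pad = padded_len(mat1.shape[0])
    k_pad = padded_len(mat1.shape[1])
    n_pad = padded_len(mat2.shape[1])
    # Pad spec is (last-dim-left, last-dim-right, second-to-last-left, ...).
    mat1 = F.pad(mat1, [0, k_pad, 0, m_pad])
    mat2 = F.pad(mat2, [0, n_pad, 0, k_pad])
    res = torch.mm(mat1, mat2)
    if m_pad:
        res = res[:-m_pad, :]
    if n_pad:
        res = res[:, :-n_pad]
    return res

a, b = torch.randn(13, 30), torch.randn(30, 17)
print(torch.allclose(pad_mm_sketch(a, b), a @ b, atol=1e-5))  # True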
+ if not fetch_fake_tensors(match, (arg_name,))[0].is_contiguous(): return False + # optimistically assume we should be able to memory plan away + # all non inputs + return node_def.op != "placeholder" + + +def should_pad_bench( + match, mat1: Tensor, mat2: Tensor, op, input: Optional[Tensor] = None +) -> bool: do_bench = functools.partial( torch._inductor.runtime.runtime_utils.do_bench_gpu, warmup=5, ) - + m_padded_length = 0 + n_padded_length = 0 + batchsize = 1 with no_dispatch(): if op is torch.ops.aten.mm or op is torch.ops.aten.addmm: m = mat1.shape[0] k = mat1.shape[1] n = mat2.shape[1] - - m_padded_length = get_padded_length(m, get_alignment_size(mat1)) k_padded_length = get_padded_length(k, get_alignment_size(mat1)) n_padded_length = get_padded_length(n, get_alignment_size(mat2)) + m_padded_length = get_padded_length(m, get_alignment_size(mat1)) elif op is torch.ops.aten.bmm: + batchsize = mat1.shape[0] m = mat1.shape[1] k = mat1.shape[2] n = mat2.shape[2] - - m_padded_length = get_padded_length(m, get_alignment_size(mat1)) k_padded_length = get_padded_length(k, get_alignment_size(mat1)) + m_padded_length = get_padded_length(m, get_alignment_size(mat1)) n_padded_length = get_padded_length(n, get_alignment_size(mat2)) else: return False @@ -278,12 +325,18 @@ def should_pad_bench( if m_padded_length == k_padded_length == n_padded_length == 0: return False + if torch._inductor.config.force_shape_pad: + return True + + if not has_triton(): + return False + if not is_mm_compute_bound(m, k, n, mat1.dtype): return False # We don't want to look up the cache for cases that are trivially false # since it does file io - key = should_pad_bench_key(mat1, mat2, op, input) + key = should_pad_bench_key(match, mat1, mat2, op, input) cached_pad = get_cached_should_pad(key) if cached_pad is not None: @@ -306,19 +359,48 @@ def realize_tensor(t): mat1 = realize_tensor(mat1) mat2 = realize_tensor(mat2) - if op is torch.ops.aten.bmm or op is torch.ops.aten.mm: - ori_time = do_bench( - lambda: op(mat1, mat2), - ) - else: - if input is not None: - input = realize_tensor(input) - ori_time = do_bench( - lambda: op(input, mat1, mat2), + + # since we key on whether or not the inputs can be memory planned, set cache for the + # original time which is unaffected by whether or not the input can be planned + ori_time_key = should_pad_bench_key( + match, mat1, mat2, op, input, is_base_time_key=True + ) + ori_time = get_cached_base_mm_benchmark_time(ori_time_key) + if ori_time is None: + if op is torch.ops.aten.bmm or op is torch.ops.aten.mm: + ori_time = do_bench( + lambda: op(mat1, mat2), + ) + else: + if input is not None: + # realize bias for addmm + input = realize_tensor(input) + ori_time = do_bench( + lambda: op(input, mat1, mat2), + ) + set_cached_base_mm_benchmark_time(ori_time_key, ori_time) + + mat1_pad = mat1 + mat2_pad = mat2 + + is_bmm = op is torch.ops.aten.bmm + mat1_pre_padded = should_exclude_padding_time(match, "mat1") + if mat1_pre_padded: + mat1_pad = pad_mat1( + mat1_pad, + m_padded_length=m_padded_length, + k_padded_length=k_padded_length, + is_bmm=is_bmm, ) - mat1_pad = torch.randn_like(mat1) - mat2_pad = torch.randn_like(mat2) + mat2_pre_padded = should_exclude_padding_time(match, "mat2") + if mat2_pre_padded: + mat2_pad = pad_mat2( + mat2_pad, + k_padded_length=k_padded_length, + n_padded_length=n_padded_length, + is_bmm=is_bmm, + ) if op is torch.ops.aten.addmm: input_pad = None @@ -332,6 +414,8 @@ def realize_tensor(t): m_padded_length, k_padded_length, n_padded_length, + 
mat1_pre_padded=mat1_pre_padded, + mat2_pre_padded=mat2_pre_padded, ), ) elif op is torch.ops.aten.mm: @@ -342,6 +426,8 @@ def realize_tensor(t): m_padded_length, k_padded_length, n_padded_length, + mat1_pre_padded=mat1_pre_padded, + mat2_pre_padded=mat2_pre_padded, ), ) else: @@ -352,6 +438,8 @@ def realize_tensor(t): m_padded_length, k_padded_length, n_padded_length, + mat1_pre_padded=mat1_pre_padded, + mat2_pre_padded=mat2_pre_padded, ), ) @@ -371,16 +459,29 @@ def mm_pattern(mat1: Tensor, mat2: Tensor) -> Tensor: def should_pad_mm(match: Match) -> bool: mat1, mat2 = fetch_fake_tensors(match, ("mat1", "mat2")) return should_pad_common(mat1, mat2) and should_pad_bench( - mat1, mat2, torch.ops.aten.mm + match, mat1, mat2, torch.ops.aten.mm ) -def mm_replace(mat1: Tensor, mat2: Tensor) -> Tensor: - m_padded_length = get_padded_length(mat1.shape[0], get_alignment_size(mat1)) - k_padded_length = get_padded_length(mat1.shape[1], get_alignment_size(mat1)) - n_padded_length = get_padded_length(mat2.shape[1], get_alignment_size(mat2)) +def pad_mat1(mat1, *, m_padded_length, k_padded_length, is_bmm=False): + if k_padded_length != 0 or m_padded_length != 0: + # dim order is reversed for constant_pad_nd, for every dim we specify right and left padding + pad_arg = [0, k_padded_length, 0, m_padded_length] + if is_bmm: + pad_arg.extend((0, 0)) + return aten.constant_pad_nd(mat1, pad_arg) + return mat1 - return pad_mm(mat1, mat2, m_padded_length, k_padded_length, n_padded_length) + +def pad_mat2(mat2, *, k_padded_length, n_padded_length, is_bmm=False): + if k_padded_length != 0 or n_padded_length != 0: + # dim order is reversed for constant_pad_nd, for every dim we specify right and left padding + pad_arg = [0, n_padded_length, 0, k_padded_length] + if is_bmm: + pad_arg.extend((0, 0)) + return aten.constant_pad_nd(mat2, pad_arg) + else: + return mat2 def pad_mm( @@ -389,18 +490,36 @@ def pad_mm( m_padded_length: int, k_padded_length: int, n_padded_length: int, + mat1_pre_padded: bool = False, + mat2_pre_padded: bool = False, ) -> Tensor: - # mm_replace will go through pad_mm multiple times if multiple dimensions are needed to be padded - if k_padded_length != 0: - mat1 = pad_dim(mat1, k_padded_length, 1) - mat2 = pad_dim(mat2, k_padded_length, 0) - return torch.ops.aten.mm(mat1, mat2) - elif n_padded_length != 0: - mat2 = pad_dim(mat2, n_padded_length, 1) - return torch.ops.aten.mm(mat1, mat2)[:, :-n_padded_length] - else: - mat1 = pad_dim(mat1, m_padded_length, 0) - return torch.ops.aten.mm(mat1, mat2)[:-m_padded_length, :] + if not mat1_pre_padded: + mat1 = pad_mat1( + mat1, m_padded_length=m_padded_length, k_padded_length=k_padded_length + ) + if not mat2_pre_padded: + mat2 = pad_mat2( + mat2, k_padded_length=k_padded_length, n_padded_length=n_padded_length + ) + res = aten.mm(mat1, mat2) + if m_padded_length != 0: + res = res[:-m_padded_length, :] + if n_padded_length != 0: + res = res[:, :-n_padded_length] + return res + + +def mm_replace(mat1: Tensor, mat2: Tensor) -> Tensor: + k_padded_length = get_padded_length(mat1.shape[1], get_alignment_size(mat1)) + m_padded_length = get_padded_length(mat1.shape[0], get_alignment_size(mat1)) + n_padded_length = get_padded_length(mat2.shape[1], get_alignment_size(mat2)) + return pad_mm( + mat1, + mat2, + m_padded_length, + k_padded_length, + n_padded_length, + ) def bmm_pattern(mat1: Tensor, mat2: Tensor) -> Tensor: @@ -410,40 +529,52 @@ def bmm_pattern(mat1: Tensor, mat2: Tensor) -> Tensor: def should_pad_bmm(match: Match) -> bool: mat1, mat2 = 
fetch_fake_tensors(match, ("mat1", "mat2")) return should_pad_common(mat1, mat2) and should_pad_bench( - mat1, mat2, torch.ops.aten.bmm + match, mat1, mat2, torch.ops.aten.bmm ) -def bmm_replace(mat1: Tensor, mat2: Tensor) -> Tensor: - m_padded_length = get_padded_length(mat1.shape[1], get_alignment_size(mat1)) - k_padded_length = get_padded_length(mat1.shape[2], get_alignment_size(mat1)) - n_padded_length = get_padded_length(mat2.shape[2], get_alignment_size(mat2)) - - if m_padded_length != 0 or k_padded_length != 0 or n_padded_length != 0: - return pad_bmm(mat1, mat2, m_padded_length, k_padded_length, n_padded_length) - - return aten.bmm(mat1, mat2) - - def pad_bmm( mat1: Tensor, mat2: Tensor, m_padded_length: int, k_padded_length: int, n_padded_length: int, + mat1_pre_padded: bool = False, + mat2_pre_padded: bool = False, ) -> Tensor: - # bmm_replace will go through pad_bmm multiple times if multiple dimensions are needed to be padded - if k_padded_length != 0: - mat1 = pad_dim(mat1, k_padded_length, 2) - mat2 = pad_dim(mat2, k_padded_length, 1) - - return aten.bmm(mat1, mat2) - elif n_padded_length != 0: - mat2 = pad_dim(mat2, n_padded_length, 2) - return aten.bmm(mat1, mat2)[:, :, :-n_padded_length].contiguous() - else: - mat1 = pad_dim(mat1, m_padded_length, 1) - return aten.bmm(mat1, mat2)[:, :-m_padded_length, :].contiguous() + if not mat1_pre_padded: + mat1 = pad_mat1( + mat1, + m_padded_length=m_padded_length, + k_padded_length=k_padded_length, + is_bmm=True, + ) + if not mat2_pre_padded: + mat2 = pad_mat2( + mat2, + k_padded_length=k_padded_length, + n_padded_length=n_padded_length, + is_bmm=True, + ) + res = aten.bmm(mat1, mat2) + if m_padded_length != 0: + res = res[:, :-m_padded_length, :] + if n_padded_length != 0: + res = res[:, :, :-n_padded_length] + return res + + +def bmm_replace(mat1: Tensor, mat2: Tensor) -> Tensor: + k_padded_length = get_padded_length(mat1.shape[2], get_alignment_size(mat1)) + n_padded_length = get_padded_length(mat2.shape[2], get_alignment_size(mat2)) + m_padded_length = get_padded_length(mat1.shape[1], get_alignment_size(mat1)) + return pad_bmm( + mat1, + mat2, + m_padded_length, + k_padded_length, + n_padded_length, + ) @functools.lru_cache(None) diff --git a/torch/_inductor/index_propagation.py b/torch/_inductor/index_propagation.py index 6bc5def57d650..ea22955edbb6a 100644 --- a/torch/_inductor/index_propagation.py +++ b/torch/_inductor/index_propagation.py @@ -82,6 +82,10 @@ def to_dtype( ) -> TypedExpr: return TypedExpr(value.expr, dtype) + @staticmethod + def abs(x: TypedExpr) -> TypedExpr: + return TypedExpr(abs(x.expr), x.dtype) # type: ignore[arg-type] + @staticmethod def square(x: TypedExpr) -> TypedExpr: return TypedExpr(x.expr * x.expr, x.dtype) diff --git a/torch/_inductor/kernel/bmm.py b/torch/_inductor/kernel/bmm.py index 143c616fcb84e..a8650cd32c3f0 100644 --- a/torch/_inductor/kernel/bmm.py +++ b/torch/_inductor/kernel/bmm.py @@ -59,8 +59,15 @@ def bmm_grid(b, m, n, meta): rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) - rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + if (stride_am == 1 and stride_ak == M) or (stride_am == K and stride_ak == 1): + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + else: + ram = rm % M + if (stride_bk == 1 and stride_bn == K) or (stride_bk == N and stride_bn == 1): + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + else: + rbn = rn % 
N + rk = tl.arange(0, BLOCK_K) idx_q = tl.program_id(1) # batch dimension for BMM diff --git a/torch/_inductor/kernel/conv.py b/torch/_inductor/kernel/conv.py index 37a760a90e1e0..44aef074457a2 100644 --- a/torch/_inductor/kernel/conv.py +++ b/torch/_inductor/kernel/conv.py @@ -5,6 +5,7 @@ from typing import cast, List, Optional, Sequence, Tuple, TYPE_CHECKING, TypedDict import torch + from .. import config, ir from ..lowering import ( @@ -245,11 +246,11 @@ def conv_layout( ir.ir_node_to_tensor(x, guard_shape=True), ir.ir_node_to_tensor(weight, guard_shape=True), ir.ir_node_to_tensor(bias, guard_shape=True), - stride, - tuple(V.graph.sizevars.size_hint(p) for p in padding), # type: ignore[arg-type] + V.graph.sizevars.size_hints(stride), # type: ignore[arg-type] + V.graph.sizevars.size_hints(padding), # type: ignore[arg-type] dilation, transposed, - tuple(V.graph.sizevars.size_hint(p) for p in output_padding), # type: ignore[arg-type] + V.graph.sizevars.size_hints(output_padding), # type: ignore[arg-type] groups, ) sizes = ir.convert_shape_to_inductor(output.size()) diff --git a/torch/_inductor/kernel/flex_attention.py b/torch/_inductor/kernel/flex_attention.py index 15a99faa7b37f..a780d3709cb0c 100644 --- a/torch/_inductor/kernel/flex_attention.py +++ b/torch/_inductor/kernel/flex_attention.py @@ -162,7 +162,7 @@ def sdpa_grid(batch_size, num_heads, num_queries, d_model, meta): # TODO generalize and add proper mask support mask = (idx_m != -1) & (idx_d != -1) - {{store_output(("idx_z", "idx_h", "idx_m", "idx_d"), "acc")}} + {{store_output(("idx_z", "idx_h", "idx_m", "idx_d"), "acc", "mask")}} # TODO dont want to write this if we dont require grad if OUTPUT_LOGSUMEXP: diff --git a/torch/_inductor/kernel/mm.py b/torch/_inductor/kernel/mm.py index be1177393df93..fa14b4406de69 100644 --- a/torch/_inductor/kernel/mm.py +++ b/torch/_inductor/kernel/mm.py @@ -65,8 +65,14 @@ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) - rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + if (stride_am == 1 and stride_ak == M) or (stride_am == K and stride_ak == 1): + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + else: + ram = rm % M + if (stride_bk == 1 and stride_bn == K) or (stride_bk == N and stride_bn == 1): + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + else: + rbn = rn % N rk = tl.arange(0, BLOCK_K) A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) diff --git a/torch/_inductor/kernel/mm_common.py b/torch/_inductor/kernel/mm_common.py index 5a7f60e59102f..26d08183b0e55 100644 --- a/torch/_inductor/kernel/mm_common.py +++ b/torch/_inductor/kernel/mm_common.py @@ -178,14 +178,14 @@ def filtered_configs( if config["cond"] ) -# On ROCm convert num_stages to 1 as pipelining provides no benefit +# On ROCm convert num_stages to 0 to enable software pipelining if torch.version.hip: mm_platform_configs = tuple( - (config[0], config[1], config[2], 1, config[4]) + (config[0], config[1], config[2], 0, config[4]) for config in mm_platform_configs ) int8_platform_configs = tuple( - (config[0], config[1], config[2], 1, config[4]) + (config[0], config[1], config[2], 0, config[4]) for config in mm_platform_configs ) diff --git a/torch/_inductor/kernel/mm_plus_mm.py b/torch/_inductor/kernel/mm_plus_mm.py index 95ef6f043dfce..931aa592556bd 100644 --- 
a/torch/_inductor/kernel/mm_plus_mm.py +++ b/torch/_inductor/kernel/mm_plus_mm.py @@ -54,8 +54,19 @@ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) - rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + + if (((stride_am == 1 and stride_ak == M) or (stride_am == K1 and stride_ak == 1)) + and ((stride_cm == 1 and stride_ck == M) or (stride_cm == K1 and stride_ck == 1))): + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + else: + ram = rm % M + + if (((stride_bk == 1 and stride_bn == K1) or (stride_bk == N and stride_bn == 1)) + and ((stride_dk == 1 and stride_dn == K1) or (stride_dk == N and stride_dn == 1))): + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + else: + rbn = rn % N + rk = tl.arange(0, BLOCK_K) A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py index e4001aa3b27b8..389ff16e39025 100644 --- a/torch/_inductor/lowering.py +++ b/torch/_inductor/lowering.py @@ -1342,21 +1342,6 @@ def unwrap_tensor(x: Union[TensorBox, ir.StorageBox]) -> ir.IRNode: return x - def should_lower_cat_input(x) -> bool: - # Unrealized inputs will not be storage and layouts, and we dont want to realize - # them in case we want to fuse - if ir.is_storage_and_layout(x): - storage, _ = ir.as_storage_and_layout(x, freeze=False) - return not ir.ConcatKernel.can_realize_into_without_copy(storage) - - if isinstance(x, (TensorBox, ir.StorageBox)): - return should_lower_cat_input(unwrap_tensor(x)) - - if isinstance(x, ir.Pointwise): - return True - - return False - def is_reduction(t): return isinstance(t, ir.ComputedBuffer) and isinstance(t.data, ir.Reduction) @@ -1375,9 +1360,24 @@ def can_fuse_reduction(t): # fusing reducutions into computed concat buffer can cause regressions. fusable_reduction = any(can_fuse_reduction(t) for t in inputs) + def should_lower_cat_input(x) -> bool: + # Unrealized inputs will not be storage and layouts, and we dont want to realize + # them in case we want to fuse + if ir.is_storage_and_layout(x): + storage, _ = ir.as_storage_and_layout(x, freeze=False) + return not ir.ConcatKernel.can_realize_into_without_copy(storage) + + if isinstance(x, (TensorBox, ir.StorageBox)): + return should_lower_cat_input(unwrap_tensor(x)) + + if isinstance(x, ir.Pointwise): + return True + + return False + # TODO: We observed negative performance impact of pointwise_cat optimization on CPU so disabled it. # We will revisit this later after enabling vectorization on index_expr. - if cpu_device or fusable_reduction: + if cpu_device: return TensorBox(ir.ConcatKernel.create(inputs, dim)) def op_count(x): @@ -1406,10 +1406,18 @@ def op_count(x): and all(op_count(t) <= MAX_SIMPLE_OP_COUNT for t in inputs) ): pointwise_uses = all(is_pointwise_use(use) for use in V.current_node.users) - all_pointwise_inputs = all(should_lower_cat_input(inp) for inp in inputs) - any_pointwise_inputs = any(should_lower_cat_input(inp) for inp in inputs) + # fuse in case we will be used in a pointwise node, and there are any inputs we + # we can prevent materialization of. + fuse_pointwise_use = ( + any(should_lower_cat_input(inp) for inp in inputs) and pointwise_uses + ) - if all_pointwise_inputs or (any_pointwise_inputs and pointwise_uses): + # horizontal fuse in case all inputs will require a copy kernel anyway. 
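The stride guards added to the mm, bmm and mm_plus_mm Triton templates above only emit the tl.max_contiguous / tl.multiple_of hints when an operand is laid out row- or column-major; otherwise they fall back to a plain modulo. Expressed outside Triton, the condition for a single (rows, cols) operand is simply:

def is_row_or_col_major_2d(stride_row, stride_col, n_rows, n_cols) -> bool:
    # Mirrors e.g. the guard for A of shape (M, K):
    # (stride_am == 1 and stride_ak == M) or (stride_am == K and stride_ak == 1)
    return (stride_row == 1 and stride_col == n_rows) or (
        stride_row == n_cols and stride_col == 1
    )

# A contiguous (8, 16) tensor has strides (16, 1):
print(is_row_or_col_major_2d(16, 1, n_rows=8, n_cols=16))  # True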
+ # only horizontally fuse pointwise kernels + horizontal_fuse_cat = all( + should_lower_cat_input(inp) for inp in inputs + ) and not any(can_fuse_reduction(t) for t in inputs) + if fuse_pointwise_use or (horizontal_fuse_cat and not fusable_reduction): return pointwise_cat(inputs, dim) return TensorBox(ir.ConcatKernel.create(inputs, dim)) diff --git a/torch/_inductor/runtime/triton_heuristics.py b/torch/_inductor/runtime/triton_heuristics.py index 53b2790df20c2..d7fb163cd589f 100644 --- a/torch/_inductor/runtime/triton_heuristics.py +++ b/torch/_inductor/runtime/triton_heuristics.py @@ -1086,17 +1086,18 @@ def cached_autotune( ) best_config = None - if cache_filename is not None and os.path.exists(cache_filename): - with open(cache_filename) as fd: - best_config = json.loads(fd.read()) - elif remote_cache is not None and remote_cache_key is not None: - best_config = remote_cache.get(remote_cache_key) - - best_config = load_cached_autotuning( - best_config, configs_hash, configs, inductor_meta - ) - if best_config: - configs = [best_config] + if not inductor_meta.get("force_disable_caches", False): + if cache_filename is not None and os.path.exists(cache_filename): + with open(cache_filename) as fd: + best_config = json.loads(fd.read()) + elif remote_cache is not None and remote_cache_key is not None: + best_config = remote_cache.get(remote_cache_key) + + best_config = load_cached_autotuning( + best_config, configs_hash, configs, inductor_meta + ) + if best_config: + configs = [best_config] def save_cache_hook(cfg, time_taken_ns, found_by_coordesc=False): data = { diff --git a/torch/_inductor/select_algorithm.py b/torch/_inductor/select_algorithm.py index 0bcc166982a18..d1550529bb8ee 100644 --- a/torch/_inductor/select_algorithm.py +++ b/torch/_inductor/select_algorithm.py @@ -1499,7 +1499,7 @@ def autotune_select_algorithm(*args, **kwargs): if "return_multi_template" not in kwargs: kwargs[ "return_multi_template" - ] = torch._inductor.config.benchmark_multi_templates + ] = torch._inductor.config.benchmark_epilogue_fusion return _ALGORITHM_SELECTOR_CACHE(*args, **kwargs) diff --git a/torch/_inductor/utils.py b/torch/_inductor/utils.py index b770d51d67cc1..917dbfc3dd193 100644 --- a/torch/_inductor/utils.py +++ b/torch/_inductor/utils.py @@ -8,6 +8,7 @@ import inspect import io import itertools +import json import logging import math import operator @@ -21,6 +22,7 @@ import unittest from datetime import datetime from io import StringIO +from pathlib import Path from typing import ( Any, Callable, @@ -32,6 +34,7 @@ Optional, Protocol, Set, + Tuple, TypeVar, Union, ValuesView, @@ -42,6 +45,8 @@ from typing_extensions import Concatenate, ParamSpec import torch +import torch._export +import torch.utils._pytree as pytree from torch._dynamo.device_interface import get_interface_for_device from torch._dynamo.utils import detect_fake_mode from torch.autograd import DeviceType @@ -51,7 +56,7 @@ from torch.utils._sympy.symbol import make_symbol, SymT from torch.utils._sympy.value_ranges import bound_sympy, ValueRanges from . import config -from .runtime.runtime_utils import ceildiv as runtime_ceildiv +from .runtime.runtime_utils import cache_dir, ceildiv as runtime_ceildiv log = logging.getLogger(__name__) @@ -1524,7 +1529,7 @@ def should_assume_input_aligned(example_input: torch.Tensor): # See Note: [Input Alignment handling in Inductor] # right now, we only care about alignment for cuda tensors. 
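The cache sites touched above (FxGraphCache.load in compile_fx, the best-config lookup in cached_autotune, and the fresh-cache decorator) all key off the same switch, TORCHINDUCTOR_FORCE_DISABLE_CACHES. A minimal sketch of that gating pattern; the cache file and lookup below are illustrative only, not the real autotune cache:

import json
import os

FORCE_DISABLE_CACHES = os.environ.get("TORCHINDUCTOR_FORCE_DISABLE_CACHES") == "1"

def load_best_config(cache_filename):
    if FORCE_DISABLE_CACHES:
        return None  # skip every cache lookup and force re-autotuning
    if cache_filename is not None and os.path.exists(cache_filename):
        with open(cache_filename) as fd:
            return json.loads(fd.read())
    return None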
- if example_input.device.type != "cuda": + if not is_gpu(example_input.device.type): return False return config.assume_aligned_inputs or tensor_is_aligned(example_input) @@ -1544,3 +1549,140 @@ def maybe_get_suppress_shape_guards_ctx(): return contextlib.nullcontext() return shape_env.suppress_guards() + + +def aoti_eager_cache_dir(namespace: str, device: str): + return Path(cache_dir()) / "aoti_eager" / namespace / device + + +def aoti_eager_op_conf_lock(op_func_name_with_overload: str): + from filelock import FileLock + + # Avoid circular import + from torch._inductor.codecache import get_lock_dir, LOCK_TIMEOUT + + op_conf_lock_file = f"{op_func_name_with_overload}.lock" + lock_dir = get_lock_dir() + return FileLock(os.path.join(lock_dir, op_conf_lock_file), timeout=LOCK_TIMEOUT) + + +def load_aoti_eager_cache(ns: str, op_func_name_with_overload: str, device_type: str): + device_kernel_cache = aoti_eager_cache_dir(ns, device_type) + op_conf = device_kernel_cache / f"{op_func_name_with_overload}.json" + if not op_conf.exists(): + return [] + + with aoti_eager_op_conf_lock(op_func_name_with_overload): + with open(op_conf) as f: + json_data = json.load(f) + for item in json_data: + # Get absolution path for kernel library + kernel_lib_abs_path = device_kernel_cache / item["kernel_path"] + item["kernel_path"] = kernel_lib_abs_path.as_posix() + + # Check if the kernel library exists + if not kernel_lib_abs_path.exists(): + return [] + + for metadata in item["meta_info"]: + assert not metadata[ + "is_dynamic" + ], "Only support static shape for now" + if metadata["device_type"] == "cpu": + metadata["device_index"] = -1 + metadata["dtype"] = getattr(torch, metadata["dtype"].split(".")[-1]) + + return json_data + + +def aoti_compile_with_persistent_cache( + ns: str, + op_func_name_with_overload: str, + device_type: str, + dynamic: bool, + f: Callable[..., Any], + args: Tuple[Any], + kwargs: Dict[str, Any], + *, + dynamic_shapes: Optional[Dict[str, Any]] = None, + options: Optional[Dict[str, Any]] = None, + remove_runtime_assertions: bool = False, + disable_constraint_solver: bool = False, +): + """ + Compile the given function with persistent cache for AOTI eager mode. 
+ """ + flattened_inputs = pytree.arg_tree_leaves(*args, **kwargs) + assert all( + isinstance(input, torch.Tensor) for input in flattened_inputs + ), "Only support tensor for now" + assert not dynamic, "Only support static shape for now" + + persistent_cache = aoti_eager_cache_dir(ns, device_type) + persistent_cache.mkdir(parents=True, exist_ok=True) + persistent_cache_lib = persistent_cache / "lib" + persistent_cache_lib.mkdir(parents=True, exist_ok=True) + + with mock.patch.dict( + os.environ, + {"TORCHINDUCTOR_CACHE_DIR": persistent_cache_lib.absolute().as_posix()}, + ): + try: + kernel_lib_path = torch._export.aot_compile( + f, + args, + kwargs, + dynamic_shapes=dynamic_shapes, + options=options, + remove_runtime_assertions=remove_runtime_assertions, + disable_constraint_solver=disable_constraint_solver, + ) + + kernel_metadata_items = [] + for input_tensor in flattened_inputs: + # TODO(Eikan): To add dynamic support + metadata: Dict[str, Any] = {} + metadata["is_dynamic"] = dynamic + metadata["device_type"] = f"{input_tensor.device.type}" + if is_cpu_device([input_tensor]): + metadata["device_index"] = -1 + else: + metadata["device_index"] = input_tensor.device.index + metadata["dtype"] = f"{input_tensor.dtype}" + metadata["sizes"] = list(input_tensor.size()) + metadata["strides"] = list(input_tensor.stride()) + kernel_metadata_items.append(metadata) + + kernel_meta_info: Dict[str, Any] = {} + kernel_meta_info["meta_info"] = kernel_metadata_items + kernel_meta_info["kernel_path"] = ( + Path(kernel_lib_path).relative_to(persistent_cache).as_posix() + ) + + json_data = [] + update_json = True + op_conf = persistent_cache / f"{op_func_name_with_overload}.json" + mode = "r" if op_conf.exists() else "w" + with aoti_eager_op_conf_lock(op_func_name_with_overload): + with open(op_conf, mode) as op_conf_file: + try: + json_data = json.load(op_conf_file) + except Exception as e: + json_data = [] + + assert isinstance(json_data, list) + for item in json_data: + assert isinstance(item, dict) + # Same kernel meta info already exists in the json file + if item["meta_info"] == kernel_metadata_items: + update_json = False + break + + if update_json: + json_data.append(kernel_meta_info) + with open(op_conf, "w") as op_conf_file: + json.dump(json_data, op_conf_file, indent=4) + + return kernel_lib_path + except Exception as e: + return "" diff --git a/torch/_ops.py b/torch/_ops.py index f5d7313591dbd..0b19c75a51aa6 100644 --- a/torch/_ops.py +++ b/torch/_ops.py @@ -412,14 +412,22 @@ def key_extractor(tensors, key_mask): # Mode stack for PreDispatchKey -# it should always have two keys with +# it should always have three keys with # priority given to FunctionalTensorMode and # then ProxyTorchDispatchMode. It means that # slot 0 belongs to ProxyTorchDispatchMode and # slot 1 belongs to FunctionalTensorMode. +# +# SchemaCheckMode is separate from the other 2, +# and is only valid when the stack is empty. +# SchemaCheckMode is for testing purposes, and +# is meant to run in eager mode on concrete inputs, +# checking for incorrect schemas in regards to +# aliasing or mutating ops. 
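Stepping back to the aoti_compile_with_persistent_cache / load_aoti_eager_cache routines above: each compiled kernel library gets a JSON entry recording the input tensors it was compiled for, and duplicate entries are skipped. A reduced sketch of that metadata record; the paths are hypothetical and the per-op file lock (filelock.FileLock via get_lock_dir) is omitted for brevity:

import json
from pathlib import Path
from typing import Any, Dict, List

import torch

def tensor_metadata(t: torch.Tensor) -> Dict[str, Any]:
    # Mirrors the per-input record written by aoti_compile_with_persistent_cache.
    return {
        "is_dynamic": False,
        "device_type": t.device.type,
        "device_index": -1 if t.device.type == "cpu" else t.device.index,
        "dtype": str(t.dtype),
        "sizes": list(t.size()),
        "strides": list(t.stride()),
    }

def register_kernel(op_conf: Path, kernel_path: str, inputs: List[torch.Tensor]):
    entry = {"meta_info": [tensor_metadata(t) for t in inputs],
             "kernel_path": kernel_path}
    op_conf.parent.mkdir(parents=True, exist_ok=True)
    data = json.loads(op_conf.read_text()) if op_conf.exists() else []
    if entry["meta_info"] not in [item["meta_info"] for item in data]:
        data.append(entry)
        op_conf.write_text(json.dumps(data, indent=4))

# Example usage with a hypothetical cache location:
register_kernel(Path("/tmp/aoti_eager/aten/cpu/add.Tensor.json"),
                "lib/add.so", [torch.randn(4), torch.randn(4)])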
class _ModeStackStateForPreDispatch: def __init__(self): self.__infra_modes = [None, None] + self._schema_check_mode = None def set(self, index, mode): assert index < len(self.__infra_modes) @@ -430,28 +438,36 @@ def get(self, index): return self.__infra_modes[index] def count(self): - return len([i for i in self.__infra_modes if i is not None]) + return len([i for i in self.__infra_modes if i is not None]) + int( + self._schema_check_mode is not None + ) _mode_stack_state_for_pre_dispatch = _ModeStackStateForPreDispatch() -def unset_mode_pre_dispatch(mode_key): +def unset_mode_pre_dispatch(mode_key, schema_check=False): current_mode_stack_pre_dispatch = mode_stack_state_for_pre_dispatch() - assert mode_key in ( + assert mode_key is None or mode_key in ( torch._C._TorchDispatchModeKey.PROXY, torch._C._TorchDispatchModeKey.FUNCTIONAL, ) + if schema_check: + assert mode_key is None def _unset_mode(): if mode_key == torch._C._TorchDispatchModeKey.PROXY: current_mode = current_mode_stack_pre_dispatch.get(0) mode_stack_state_for_pre_dispatch().set(0, None) return current_mode - else: + elif mode_key == torch._C._TorchDispatchModeKey.FUNCTIONAL: current_mode = current_mode_stack_pre_dispatch.get(1) mode_stack_state_for_pre_dispatch().set(1, None) return current_mode + else: + current_mode = mode_stack_state_for_pre_dispatch()._schema_check_mode + mode_stack_state_for_pre_dispatch()._schema_check_mode = None + return current_mode current_mode = _unset_mode() @@ -470,12 +486,27 @@ def _unset_mode(): def _set_mode_pre_dispatch(mode): from torch._subclasses.functional_tensor import FunctionalTensorMode + from torch._subclasses.schema_check_mode import SchemaCheckMode from torch.fx.experimental.proxy_tensor import ProxyTorchDispatchMode - assert isinstance(mode, (FunctionalTensorMode, ProxyTorchDispatchMode)) + assert isinstance( + mode, + ( + FunctionalTensorMode, + ProxyTorchDispatchMode, + SchemaCheckMode, + ), + ) previous_mode_stack_len = _len_torch_dispatch_stack_pre_dispatch() - if isinstance(mode, FunctionalTensorMode): + if isinstance(mode, SchemaCheckMode): + current_mode = mode_stack_state_for_pre_dispatch()._schema_check_mode + if previous_mode_stack_len > 0: + raise AssertionError( + "SchemaCheckMode for pre-dispatch must be used exclusively, found other modes on the stack" + ) + mode_stack_state_for_pre_dispatch()._schema_check_mode = mode + elif isinstance(mode, FunctionalTensorMode): current_mode = mode_stack_state_for_pre_dispatch().get(1) assert current_mode is None mode_stack_state_for_pre_dispatch().set(1, mode) @@ -501,9 +532,10 @@ def _pop_mode_from_pre_dispatch(): if pre_dispatch_len == 0: raise AssertionError("Trying to pop empty mode stack") + if mode_stack._schema_check_mode is not None: + return unset_mode_pre_dispatch(None, schema_check=True) if mode_stack.get(1) is not None: return unset_mode_pre_dispatch(torch._C._TorchDispatchModeKey.FUNCTIONAL) - if mode_stack.get(0) is not None: return unset_mode_pre_dispatch(torch._C._TorchDispatchModeKey.PROXY) @@ -519,19 +551,23 @@ def _get_dispatch_mode_pre_dispatch(mode_key): ) if mode_key == torch._C._TorchDispatchModeKey.PROXY: return mode_stack_state_for_pre_dispatch().get(0) - return mode_stack_state_for_pre_dispatch().get(1) + else: + return mode_stack_state_for_pre_dispatch().get(1) def _get_current_dispatch_mode_pre_dispatch(): - stack_len = mode_stack_state_for_pre_dispatch().count() - if stack_len == 2: - return mode_stack_state_for_pre_dispatch().get(1) - if stack_len == 1: - return ( - 
mode_stack_state_for_pre_dispatch().get(1) - if mode_stack_state_for_pre_dispatch().get(1) is not None - else mode_stack_state_for_pre_dispatch().get(0) - ) + if mode_stack_state_for_pre_dispatch()._schema_check_mode is not None: + return mode_stack_state_for_pre_dispatch()._schema_check_mode + else: + stack_len = mode_stack_state_for_pre_dispatch().count() + if stack_len == 2: + return mode_stack_state_for_pre_dispatch().get(1) + if stack_len == 1: + return ( + mode_stack_state_for_pre_dispatch().get(1) + if mode_stack_state_for_pre_dispatch().get(1) is not None + else mode_stack_state_for_pre_dispatch().get(0) + ) return None diff --git a/torch/_streambase.py b/torch/_streambase.py index 5a0df2c22ba95..b06946523fa3b 100644 --- a/torch/_streambase.py +++ b/torch/_streambase.py @@ -5,27 +5,27 @@ class _StreamBase(ABC): r"""Base stream class abstraction for multi backends Stream to herit from""" @abstractmethod - def wait_event(self, event): + def wait_event(self, event) -> None: raise NotImplementedError @abstractmethod - def wait_stream(self, stream): + def wait_stream(self, stream) -> None: raise NotImplementedError @abstractmethod - def record_event(self, event=None): + def record_event(self, event=None) -> None: raise NotImplementedError @abstractmethod - def query(self): + def query(self) -> bool: raise NotImplementedError @abstractmethod - def synchronize(self): + def synchronize(self) -> None: raise NotImplementedError @abstractmethod - def __eq__(self, stream): + def __eq__(self, stream) -> bool: raise NotImplementedError @@ -33,13 +33,13 @@ class _EventBase(ABC): r"""Base Event class abstraction for multi backends Event to herit from""" @abstractmethod - def wait(self, stream=None): + def wait(self, stream=None) -> None: raise NotImplementedError @abstractmethod - def query(self): + def query(self) -> bool: raise NotImplementedError @abstractmethod - def synchronize(self): + def synchronize(self) -> None: raise NotImplementedError diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py index 3fdc4fc01e6b9..79c8e951edfcc 100644 --- a/torch/_subclasses/fake_tensor.py +++ b/torch/_subclasses/fake_tensor.py @@ -215,8 +215,8 @@ def tensor_memo(self): meta_converter: MetaConverter constant_storage_mapping: Dict[StorageWeakRef, List[ReferenceType]] - def __init__(self): - self.meta_converter = MetaConverter() + def __init__(self, *, copy_data=False): + self.meta_converter = MetaConverter(copy_data=copy_data) # map from to storage to corresponding constant tensors self.constant_storage_mapping = {} @@ -294,8 +294,6 @@ def from_real_tensor( assert not make_constant def mk_fake_tensor(make_meta_t): - from torch._dynamo.utils import clone_input - # NB: don't use in_kernel_invocation_manager. to # ensure FakeTensor can internally do constant computation # as necessary. Invocation manager is "more correct" as @@ -311,18 +309,6 @@ def mk_fake_tensor(make_meta_t): # TODO: callback might be used in recursive contexts, in # which case using t is wrong! BUG! constant=t if make_constant else None, - # TODO: This won't preserve aliasing relationships, so if - # there is mutation you won't see it reflect elsewhere. 
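For the torch/_ops.py changes above: the pre-dispatch stack keeps two fixed infra slots (0 = proxy, 1 = functional) plus a separate schema-check slot that must be used exclusively, and count() has to include all three. A toy model of that invariant, not the real class:

class ToyPreDispatchStack:
    def __init__(self):
        self.infra = [None, None]   # slot 0: proxy, slot 1: functional
        self.schema_check = None

    def count(self):
        return sum(m is not None for m in self.infra) + (
            self.schema_check is not None
        )

    def push_schema_check(self, mode):
        assert self.count() == 0, "schema-check mode must be used exclusively"
        self.schema_check = mode

stack = ToyPreDispatchStack()
stack.push_schema_check("SchemaCheckMode")
print(stack.count())  # 1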
- # This is fine because propagate_real_tensors isn't - # intended to give you exact results and some inaccuracy - # is OK, although if its use case expands we would want to - # do something similar to meta converter, but poking in - # real tensors at the storage cloning phase - real_tensor=( - (t if make_constant else clone_input(t)) - if fake_mode.propagate_real_tensors - else None - ), ) out = self.meta_converter( @@ -870,23 +856,26 @@ def __init__( ): log.debug("create_mode 0x%x", id(self)) self.allow_fallback_kernels = allow_fallback_kernels - self.fake_tensor_converter = FakeTensorConverter() + + import torch._dynamo.config + import torch._functorch.config + + self.propagate_real_tensors = ( + torch._functorch.config.fake_tensor_propagate_real_tensors + ) + self.fake_tensor_converter = FakeTensorConverter( + copy_data=self.propagate_real_tensors + ) + if static_shapes is not None: self.static_shapes = static_shapes else: self.static_shapes = shape_env is None - import torch._dynamo.config - import torch._functorch.config - # This is temporarily patched to True in Dynamo to grandfather in some # places where we unconditionally allow scalar outputs, TO BE REMOVED self.allow_scalar_outputs = False - self.propagate_real_tensors = ( - torch._functorch.config.fake_tensor_propagate_real_tensors - ) - self._allow_unsafe_data_ptr_access = ( torch._functorch.config.fake_tensor_allow_unsafe_data_ptr_access ) @@ -1552,7 +1541,7 @@ def maybe_to_real_tensor(t): func, flat_arg_fake_tensors, flat_args, - self.shape_env.unbacked_var_to_val, + self.shape_env.unbacked_var_to_val if self.shape_env else None, ) def maybe_propagate_real_tensors(fake_out): diff --git a/torch/_subclasses/functional_tensor.py b/torch/_subclasses/functional_tensor.py index 1762059eedf22..dfef5951ab26f 100644 --- a/torch/_subclasses/functional_tensor.py +++ b/torch/_subclasses/functional_tensor.py @@ -17,6 +17,27 @@ not_implemented_log = torch._logging.getArtifactLogger(__name__, "not_implemented") +# NOTE Some special handling for tensor conversion during export is needed. +# Normally, when tracing through the model with tensor.to(), the maybe-aliasing +# relationship between input and output tensors will be baked into the graph. +# For example, if we got a tensor with device cpu and call tensor.to("cpu"), +# it will become a no-op in the graph. For a whole graph capture, this is not +# sound so we need to do something different. Instead, in export we will try to +# preserve the tensor conversion by forcing a non-semantic-breaking aten::_to_copy +# operator to be traced in the graph, and subsequently banning mutations on all +# such converted tensors. +# In addition to patching .to() method call in functionalization, we will have to +# patch other similar methods like float() and cpu(), because they intentionally +# don't fall back to .to() methods, but have the same behavior as .to() according to +# pytorch document. https://pytorch.org/docs/stable/generated/torch.Tensor.float.html +# thus we simply force them to go through .to() call. 
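Aside (not part of this patch): under the scheme described in the note above, a conversion that is a no-op in eager mode should still surface as an explicit copy in an exported graph. A minimal sketch, assuming torch.export is available and the graph is inspected by eye:

import torch

class Convert(torch.nn.Module):
    def forward(self, x):
        # x is already float32 on CPU, so eagerly both calls are no-ops;
        # with the forced .to(..., copy=True) path they are expected to be
        # recorded as aten._to_copy nodes instead of vanishing from the graph.
        return x.float().cpu()

ep = torch.export.export(Convert(), (torch.randn(2),))
print(ep.graph)  # expect _to_copy nodes for the two conversions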
+def _conversion_method_template(**extra_kwargs): + def _(self, *args, **kwargs): + return self.to(*args, **{**kwargs, **extra_kwargs}) + + return _ + + class FunctionalTensor(torch.Tensor): """ Functional tensors represent tensors that will remove mutations @@ -225,6 +246,24 @@ def to(self, *args, **kwargs): return super().to(*args, **{**kwargs, "copy": True}) return super().to(*args, **kwargs) + def cuda(self, device=None, *args, **kwargs): + device = device or torch.cuda.current_device() + if len(args) > 0: + return self.to(device, *args, **kwargs) + else: + return self.to(device=device, **kwargs) + + char = _conversion_method_template(dtype=torch.int8) + cpu = _conversion_method_template(device=torch.device("cpu")) + bfloat16 = _conversion_method_template(dtype=torch.bfloat16) + byte = _conversion_method_template(dtype=torch.uint8) + double = _conversion_method_template(dtype=torch.float64) + float = _conversion_method_template(dtype=torch.float32) + bool = _conversion_method_template(dtype=torch.bool) + half = _conversion_method_template(dtype=torch.float16) + int = _conversion_method_template(dtype=torch.int32) + long = _conversion_method_template(dtype=torch.int64) + class FunctionalTensorMode(TorchDispatchMode): def __init__(self, pre_dispatch=False, export=False, _allow_token_discovery=False): diff --git a/torch/_subclasses/meta_utils.py b/torch/_subclasses/meta_utils.py index c674120a22ff6..780ec54888da9 100644 --- a/torch/_subclasses/meta_utils.py +++ b/torch/_subclasses/meta_utils.py @@ -34,6 +34,7 @@ maybe_get_level, peek_interpreter_stack, ) +from torch.utils._mode_utils import no_dispatch from torch.utils._python_dispatch import is_traceable_wrapper_subclass from torch.utils.weak import WeakIdKeyDictionary @@ -153,13 +154,14 @@ class MetaTensorDescriber: the same ID when we see the same tensor/storage. """ - def __init__(self): + def __init__(self, *, copy_data=False): self.next_tensor_id: MetaTensorId = 0 self.next_storage_id: MetaStorageId = 0 # Tensor -> int self.lookup_tensor = WeakIdKeyDictionary() # Storage -> int self.lookup_storage = WeakIdKeyDictionary() + self.copy_data = copy_data def get_tensor_id(self, t: torch.Tensor): if t not in self.lookup_tensor: @@ -180,6 +182,9 @@ def describe_storage(self, s: torch.UntypedStorage): return MetaStorageDesc( id=self.get_storage_id(s), size=s.size(), + # NB: We don't do the copy yet; copy happens when we start + # creating the new storages + data=s if self.copy_data else None, ) def describe_tensor(self, t: torch.Tensor, recurse: bool = True): @@ -354,6 +359,7 @@ def describe_tensor(self, t: torch.Tensor, recurse: bool = True): functorch_stack=maybe_functorch_stack, autograd_meta_from=autograd_meta_from, current_level=current_level, + data=t if self.copy_data else None, ) @@ -361,6 +367,9 @@ def describe_tensor(self, t: torch.Tensor, recurse: bool = True): class MetaStorageDesc: id: MetaStorageId size: int + # NB: this is only populated with copy_data True, it is not directly + # serializable in JSON, you want to do something special here anyway + data: Optional[torch.UntypedStorage] @dataclass(frozen=True) @@ -388,7 +397,9 @@ class MetaTensorDesc: # NB: Sometimes, size, stride and storage_offset contain SymInt, in which # case this is NOT serializable. That only happens when you're # re-fakeifying a fake tensor with an existing ShapeEnv... maybe we - # can get rid of this use case entirely + # can get rid of this use case entirely. 
Notably, even if we are + # fakeifying a real tensor into a fake tensor with symbolic shapes, the + # size here is NOT dynamic # NB: size could potentially be None as you can override it and make it # throw an error, but we don't currently have any subclasses that do this # except C++ nested tensor but we're going to have nested int to make this @@ -434,6 +445,11 @@ class MetaTensorDesc: functorch_stack: Optional[List[CInterpreter]] = None autograd_meta_from: Optional[torch.Tensor] = None + # This is only populated on copy_data, and typically is not used at all, + # except for some of our meta-ification paths that don't properly use + # storage (pro-tip: you should use storage) + data: Optional[torch.Tensor] = None + # Faithfully serializing functorch tensors will not be too difficult. # We only need to consider grad/vmap interpreters, and their internal # state is only bools (mostly what the grad enabled/disabled state @@ -457,7 +473,7 @@ def shape(self): # meta storages. This class will hold weak references to cached tenosrs # and tensor storages. class MetaConverter: - def __init__(self): + def __init__(self, *, copy_data: bool = False): # Maps MetaStorageId to UntypedStorage self.storage_memo: weakref.WeakValueDictionary = weakref.WeakValueDictionary() # Maps MetaTensorId to torch.Tensor (typically a meta tensor or @@ -467,7 +483,12 @@ def __init__(self): self.miss = 0 self.del_hook = None self.arg_cnt = 0 - self.describer = MetaTensorDescriber() + # Ensures real_storage/real_tensor are populated on the resulting + # metaified storage/tensor. The naming of this attribute is load + # bearing: FakeTensor relies on real tensor being set to exactly this + # value + self.copy_data = copy_data + self.describer = MetaTensorDescriber(copy_data=copy_data) def successful(self): return self.hit > 0 and self.miss == 0 @@ -489,8 +510,12 @@ def meta_storage(self, s: MetaStorageDesc, callback): # Need to make sure to resize the meta storage too. if self.get_storage_memo(s) is None: r_s = callback( - lambda: torch.empty(s.size, dtype=torch.uint8, device="meta") + lambda: torch.empty(s.size, dtype=torch.uint8, device="meta"), ).untyped_storage() + if self.copy_data: + with torch.no_grad(), no_dispatch(): + assert s.data is not None + r_s.real_storage = s.data.clone() self.set_storage_memo(s, r_s) return r_s else: @@ -640,8 +665,8 @@ def empty_create_subclass( outer_size = outer_size if outer_size is not None else t.size outer_stride = outer_stride if outer_stride is not None else t.stride - transformed_tensors_dict = { - attr: callback( + def transform(attr, inner_t): + r = callback( lambda: empty_create( inner_t, AttrSource(source, attr), @@ -652,7 +677,29 @@ def empty_create_subclass( ), ) ) - for attr, inner_t in t.attrs.items() + # Note [Inaccessible data is not copied] + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # A more faithful reproduction would do a copy on the entire + # storage, but this needs to be done carefully because the + # underlying storage could have larger extent than is implied + # by size/stride. The real fix is to properly call + # meta_storage recursively here. 
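Aside (not part of this patch), illustrating the caveat in the note above: copying through empty_strided + copy_ only reproduces the elements reachable via size/stride, so any extra extent in the source storage is not carried over.

import torch

base = torch.arange(10.0)     # storage holds 10 floats (40 bytes)
view = base[:4]               # only 4 elements reachable via size/stride
copied = torch.empty_strided(view.size(), view.stride(), dtype=view.dtype)
copied.copy_(view)
# the view shares the full 40-byte storage; the copy's storage is only 16 bytes
print(view.untyped_storage().nbytes(), copied.untyped_storage().nbytes())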
+ if self.copy_data: + with torch.no_grad(), no_dispatch(): + r.real_tensor = torch.empty_strided( + inner_t.size, + inner_t.stride, + dtype=inner_t.dtype, + device=inner_t.device, + ) + assert inner_t.data is not None + r.real_tensor.copy_( + inner_t.data + ) # Note [Inaccessible data is not copied] + return r + + transformed_tensors_dict = { + attr: transform(attr, inner_t) for attr, inner_t in t.attrs.items() } sub = t.type.__tensor_unflatten__( @@ -892,6 +939,11 @@ def tensor_visitor_fn( device="meta", ) ) + if self.copy_data: + # Pray that sparse clone doesn't lose information + assert t.data is not None + with torch.no_grad(), no_dispatch(): + r.real_tensor = t.data.clone() assert safe_is_leaf(r), "the callback you passed in doesn't detach" # Note [is_coalesced is dispatched] # Strangely enough, is_coalesced() is a dispatched operator, @@ -939,6 +991,11 @@ def tensor_visitor_fn( device="meta", ) ) + if self.copy_data: + # Pray sparse clone doesn't lose information + assert t.data is not None + with torch.no_grad(), no_dispatch(): + r.real_tensor = t.data.clone() assert safe_is_leaf(r), "the callback you passed in doesn't detach" if t.requires_grad: r.requires_grad = True @@ -961,11 +1018,24 @@ def tensor_visitor_fn( sizes, strides, _storage_offset = sym_sizes_strides_storage_offset( t, source ) + # TODO: This doesn't seem right, where's the MKLDNN'ness + # lol r = callback( lambda: torch.empty_strided( sizes, strides, dtype=t.dtype, device="meta" ) ) + if self.copy_data: + with torch.no_grad(), no_dispatch(): + assert t.size is not None + assert t.stride is not None + r.real_tensor = torch.empty_strided( + t.size, t.stride, dtype=t.dtype, device=t.device + ) + assert t.data is not None + r.real_tensor.copy_( + t.data + ) # Note [Inaccessible data is not copied] assert safe_is_leaf(r), "the callback you passed in doesn't detach" if t.requires_grad: r.requires_grad = True @@ -1056,6 +1126,19 @@ def _to_fake_tensor(t: MetaTensorDesc): device="meta", ) ) + if self.copy_data: + with torch.no_grad(), no_dispatch(): + r.real_tensor = torch.empty_strided( # type: ignore[attr-defined] + t.size, + t.stride, + dtype=t.dtype, + device=t.device, + ) + assert t.data is not None + # Note [Inaccessible data is not copied] + r.real_tensor.copy_( # type: ignore[attr-defined] + t.data + ) return r r = _to_fake_tensor(t) @@ -1211,6 +1294,13 @@ def is_c_of_r(complex_dtype, real_dtype): device="meta", ) ) + if self.copy_data: + with torch.no_grad(), no_dispatch(): + assert t.size is not None + assert t.stride is not None + r.real_tensor = torch.empty_strided( + t.size, t.stride, dtype=t.dtype, device=t.device + ) assert safe_is_leaf(r), "the callback you passed in doesn't detach" if t.requires_grad: @@ -1248,6 +1338,12 @@ def is_c_of_r(complex_dtype, real_dtype): ): # You're normal and happy, install the fresh storage into the memo self.set_storage_memo(s, r.untyped_storage()) + if self.copy_data: + with torch.no_grad(), no_dispatch(): + r.real_tensor.untyped_storage().copy_(s.data) + r.untyped_storage().real_storage = ( + r.real_tensor.untyped_storage() + ) else: # You're in crazy town; somehow you gave us a tensor # that wasn't a view, but had nonzero storage offset, @@ -1286,8 +1382,17 @@ def is_c_of_r(complex_dtype, real_dtype): mb_fake_mode = maybe_get_fake_mode(r) if mb_fake_mode is not None: maybe_fake_mgr = in_kernel_invocation_manager(mb_fake_mode) - with maybe_fake_mgr, torch.no_grad(), maybe_suppress(): - r.set_(r_s, storage_offset, sizes, strides) + with torch.no_grad(), maybe_suppress(): + 
with maybe_fake_mgr: + r.set_(r_s, storage_offset, sizes, strides) + if self.copy_data: + with torch.no_grad(), no_dispatch(): + r.real_tensor.set_( + r_s.real_storage, + t.storage_offset, + t.size, + t.stride, + ) if t.grad is not None: from torch._dynamo.source import AttrSource diff --git a/torch/_subclasses/schema_check_mode.py b/torch/_subclasses/schema_check_mode.py index 72a2082a162df..3ddb611e5e412 100644 --- a/torch/_subclasses/schema_check_mode.py +++ b/torch/_subclasses/schema_check_mode.py @@ -6,7 +6,6 @@ import torch from torch.fx.operator_schemas import normalize_function -from torch.testing._internal.jit_utils import clone_inputs from torch.utils import _pytree as pytree from torch.utils._python_dispatch import TorchDispatchMode from torch.utils._pytree import tree_map @@ -27,6 +26,38 @@ # - Checks for aliasing on all inputs +# move these 2 functions here to avoid numpy dependency in testing/_internal/common_utils.py + + +def is_iterable_of_tensors(iterable): + # Tensor itself is iterable so we check this first + if isinstance(iterable, torch.Tensor): + return False + try: + if len(iterable) == 0: + return False + for t in iter(iterable): + if not isinstance(t, torch.Tensor): + return False + except TypeError as te: + return False + return True + + +def clone_inputs(args): + inputs = [] + + for arg in args: + if isinstance(arg, torch.Tensor): + inputs.append(arg.detach().clone()) + elif is_iterable_of_tensors(arg): + inputs.append([t.detach().clone() for t in arg]) + else: + inputs.append(arg) + + return inputs + + class SchemaCheckMode(TorchDispatchMode): def __init__(self): # Information recorded for testing purposes. For example: diff --git a/torch/ao/quantization/fx/_decomposed.py b/torch/ao/quantization/fx/_decomposed.py index 8feafafea2fd4..f2e774590be3f 100644 --- a/torch/ao/quantization/fx/_decomposed.py +++ b/torch/ao/quantization/fx/_decomposed.py @@ -768,7 +768,7 @@ def quantize_per_token( _quant_min_max_bounds_check(quant_min, quant_max, dtype) _per_token_quant_qparam_dim_check(input, scales, zero_points) input = ( - torch.round(input / scales + zero_points).clamp(quant_min, quant_max).to(dtype) + input.mul(1.0 / scales).add(zero_points).round().clamp(quant_min, quant_max).to(dtype) ) return input @@ -875,7 +875,7 @@ def quantize_per_channel_group( zero_points = zero_points.reshape(-1, 1) input_int8 = ( - to_quant.div(scales) + to_quant.mul(1.0 / scales) .add(zero_points) .round() .clamp_(quant_min, quant_max) diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index e053d89d79835..5da75b608a82a 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -195,7 +195,7 @@ def __init__( self, enabled=True, *, - use_cuda=False, + use_cuda=False, # Deprecated use_device=None, record_shapes=False, with_flops=False, diff --git a/torch/cpu/__init__.py b/torch/cpu/__init__.py index 14794627d752b..2f2561b69c1c1 100644 --- a/torch/cpu/__init__.py +++ b/torch/cpu/__init__.py @@ -11,6 +11,7 @@ from .. import device as _device from . import amp + __all__ = [ "is_available", "synchronize", @@ -49,7 +50,6 @@ def synchronize(device: _device_t = None) -> None: N.B. This function only exists to facilitate device-agnostic code. """ - pass class Stream: @@ -57,7 +57,7 @@ class Stream: N.B. 
This class only exists to facilitate device-agnostic code """ - def __init__(self, priority: int = -1): + def __init__(self, priority: int = -1) -> None: pass def wait_stream(self, stream) -> None: @@ -68,13 +68,13 @@ class Event: def query(self) -> bool: return True - def record(self, stream=None): + def record(self, stream=None) -> None: pass - def synchronize(self): + def synchronize(self) -> None: pass - def wait(self, stream=None): + def wait(self, stream=None) -> None: pass @@ -100,6 +100,7 @@ class StreamContext(AbstractContextManager): N.B. This class only exists to facilitate device-agnostic code """ + cur_stream: Optional[Stream] def __init__(self, stream): @@ -115,7 +116,7 @@ def __enter__(self): self.prev_stream = _current_stream _current_stream = cur_stream - def __exit__(self, type: Any, value: Any, traceback: Any): + def __exit__(self, type: Any, value: Any, traceback: Any) -> None: cur_stream = self.stream if cur_stream is None: return @@ -146,7 +147,6 @@ def set_device(device: _device_t) -> None: N.B. This function only exists to facilitate device-agnostic code """ - pass def current_device() -> str: diff --git a/torch/csrc/Exceptions.h b/torch/csrc/Exceptions.h index 6b8d923f40909..4f8d614e16dcf 100644 --- a/torch/csrc/Exceptions.h +++ b/torch/csrc/Exceptions.h @@ -15,7 +15,7 @@ #include #include -#if defined(USE_DISTRIBUTED) && defined(USE_C10D) +#if defined(USE_DISTRIBUTED) #include #endif diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index 22a257909bf12..3be764220e0de 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -168,12 +168,14 @@ static PyObject* THPModule_initExtension( PyObject* shm_manager_path) { HANDLE_TH_ERRORS #if !defined(FBCODE_CAFFE2) - if (torch::get_cpp_stacktraces_enabled() && !torch::get_disable_addr2line()) { + if (torch::get_cpp_stacktraces_enabled()) { c10::SetStackTraceFetcher([]() -> std::string { auto tb = torch::CapturedTraceback::gather(false, false, true); - LOG(WARNING) - << "symbolizing C++ stack trace for exception; if this hangs, rerun with TORCH_DISABLE_ADDR2LINE=1..." - << std::endl; + if (torch::get_symbolize_mode() == torch::unwind::Mode::addr2line) { + LOG(WARNING) + << "symbolizing C++ stack trace for exception; if this hangs, rerun with TORCH_DISABLE_ADDR2LINE=1..." + << std::endl; + } auto s_tbs = torch::symbolize({tb.get()}); std::stringstream oss; oss << "C++ CapturedTraceback:" << std::endl; @@ -396,10 +398,10 @@ PyObject* THPModule_swap_tensor_impl(PyObject* _unused, PyObject* args) { // The TensorImpls contain PyObjectSlots that have a reference to the PyObject // associated with the TensorImpl. Swap this field as well. 
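Usage-level aside (not part of this patch): the PyObject-slot swapping handled here appears to be what backs torch.utils.swap_tensors on the Python side. A hedged sketch, assuming that helper is available in the build:

import torch

a, b = torch.zeros(2), torch.ones(3)
torch.utils.swap_tensors(a, b)   # exchanges payloads (including PyObject slots) in place
print(a.shape, b.shape)          # torch.Size([3]) torch.Size([2])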
- c10::optional mb_obj_a = + std::optional mb_obj_a = a->cdata->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj( getPyInterpreter(), /*ignore_hermetic_tls=*/false); - c10::optional mb_obj_b = + std::optional mb_obj_b = b->cdata->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj( getPyInterpreter(), /*ignore_hermetic_tls=*/false); TORCH_INTERNAL_ASSERT( @@ -1803,7 +1805,7 @@ Call this whenever a new thread is created in order to propagate values from "_select_conv_backend", [](const at::Tensor& input, const at::Tensor& weight, - const c10::optional& bias_opt, + const std::optional& bias_opt, at::SymIntArrayRef stride_, at::SymIntArrayRef padding_, at::SymIntArrayRef dilation_, @@ -1837,14 +1839,14 @@ Call this whenever a new thread is created in order to propagate values from "_select_conv_backend", [](const at::Tensor& input, const at::Tensor& weight, - const c10::optional& bias, + const std::optional& bias, at::SymIntArrayRef stride_, at::SymIntArrayRef padding_, at::SymIntArrayRef dilation_, bool transposed_, at::SymIntArrayRef output_padding_, c10::SymInt groups_, - c10::optional> bias_sizes_opt) { + std::optional> bias_sizes_opt) { c10::OptionalArrayRef ref = c10::nullopt; if (bias_sizes_opt) { ref = (*bias_sizes_opt); @@ -1883,7 +1885,7 @@ Call this whenever a new thread is created in order to propagate values from .def(py::init([](at::Tensor const& query, at::Tensor const& key, at::Tensor const& value, - c10::optional attn_mask, + std::optional attn_mask, double dropout, bool is_causal) { return sdp::sdp_params{ @@ -2034,7 +2036,7 @@ Call this whenever a new thread is created in order to propagate values from py_module.def( "_get_accelerator", - [](c10::optional check = c10::nullopt) { + [](std::optional check = c10::nullopt) { return c10::Device( at::getAccelerator(check.value_or(false)) .value_or(c10::DeviceType::CPU), @@ -2175,7 +2177,7 @@ Call this whenever a new thread is created in order to propagate values from _DeviceDtypeHasher>; py_module.def( "_group_tensors_by_device_and_dtype", - [](const std::vector>>& + [](const std::vector>>& nested_tensorlist, const bool with_indices) { _FlatMap map; diff --git a/torch/csrc/PyInterpreter.cpp b/torch/csrc/PyInterpreter.cpp index 4582cb2a8340c..a7e5c5e9fb873 100644 --- a/torch/csrc/PyInterpreter.cpp +++ b/torch/csrc/PyInterpreter.cpp @@ -592,7 +592,7 @@ static void set_tensor_attr_with_capsule( const c10::TensorImpl* tensor, py::capsule& capsule, const char* attr_name) { - c10::optional mb_obj = tensor->pyobj_slot()->check_pyobj( + std::optional mb_obj = tensor->pyobj_slot()->check_pyobj( getPyInterpreter(), /*ignore_hermetic_tls=*/false); TORCH_CHECK( mb_obj.has_value(), "Tensor subclass's PyInterpreter has no value"); @@ -620,7 +620,7 @@ static c10::ArrayRef get_set_cached_attr( const c10::TensorImpl* tensor, const char* base_attr_name, const py::object& obj) { - c10::optional mb_obj = + std::optional mb_obj = tensor->pyobj_slot()->check_pyobj(getPyInterpreter()); TORCH_CHECK( mb_obj.has_value(), "Tensor subclass's PyInterpreter has no value"); diff --git a/torch/csrc/Storage.cpp b/torch/csrc/Storage.cpp index a3f8263303782..aa5584abd39e4 100644 --- a/torch/csrc/Storage.cpp +++ b/torch/csrc/Storage.cpp @@ -108,7 +108,7 @@ PyObject* THPStorage_Wrap(c10::Storage storage) { c10::newStorageImplFromRefcountedDataPtr(storage), c10::impl::PyInterpreterStatus::DEFINITELY_UNINITIALIZED); } - c10::optional maybe_pyobj = pyobj_slot->check_pyobj( + std::optional maybe_pyobj = pyobj_slot->check_pyobj( getPyInterpreter(), 
/*ignore_hermetic_tls=*/false); c10::impl::PyInterpreterStatus status = c10::impl::PyInterpreterStatus::TAGGED_BY_US; @@ -236,7 +236,7 @@ static void THPStorage_subclass_dealloc(PyObject* self) { if (type->tp_del) { PyObject_GC_Track(self); type->tp_del(self); - if (self->ob_refcnt > 0) { + if (Py_REFCNT(self) > 0) { // Resurrected (see above comment about resurrection from `__del__`) return; } @@ -316,8 +316,8 @@ static PyObject* THPStorage_pynew( device_arg_idx = 2; } - c10::optional allocator_opt = r.toInt64Optional(allocator_arg_idx); - c10::optional device_opt = r.deviceOptional(device_arg_idx); + std::optional allocator_opt = r.toInt64Optional(allocator_arg_idx); + std::optional device_opt = r.deviceOptional(device_arg_idx); TORCH_CHECK( !allocator_opt.has_value() || !device_opt.has_value(), @@ -498,7 +498,7 @@ static PyObject* THPStorage_get(THPStorage* self, PyObject* index) { at::StorageImpl* old_storage_impl = storage.unsafeGetStorageImpl(); c10::raw::intrusive_ptr::incref(old_storage_impl); - c10::optional device_opt = old_storage_impl->device(); + std::optional device_opt = old_storage_impl->device(); auto new_storage_impl = make_storage_impl( c10::StorageImpl::use_byte_size_t(), #ifdef THQUANTIZED diff --git a/torch/csrc/Stream.cpp b/torch/csrc/Stream.cpp index 06dac515c1a5e..179f4f1390aff 100644 --- a/torch/csrc/Stream.cpp +++ b/torch/csrc/Stream.cpp @@ -82,7 +82,7 @@ static PyObject* THPStream_pynew( // It requires other device backends override getNewStream method. How the new // stream is created is backend specific. Backend should be able to correctly // manage the lifetime of streams. - c10::optional stream_opt; + std::optional stream_opt; if (r.idx == 0) { c10::impl::VirtualGuardImpl impl{static_cast(device_type)}; stream_opt = impl.getNewStream( diff --git a/torch/csrc/api/include/torch/expanding_array.h b/torch/csrc/api/include/torch/expanding_array.h index aa4fecf4ff37c..f77c05119ebf7 100644 --- a/torch/csrc/api/include/torch/expanding_array.h +++ b/torch/csrc/api/include/torch/expanding_array.h @@ -104,15 +104,15 @@ std::ostream& operator<<( } /// A utility class that accepts either a container of `D`-many -/// `c10::optional` values, or a single `c10::optional` value, which is +/// `std::optional` values, or a single `c10::optional` value, which is /// internally repeated `D` times. It has the additional ability to accept /// containers of the underlying type `T` and convert them to a container of -/// `c10::optional`. +/// `std::optional`. template class ExpandingArrayWithOptionalElem - : public ExpandingArray> { + : public ExpandingArray> { public: - using ExpandingArray>::ExpandingArray; + using ExpandingArray>::ExpandingArray; /// Constructs an `ExpandingArrayWithOptionalElem` from an `initializer_list` /// of the underlying type `T`. The extent of the length is checked against @@ -130,7 +130,7 @@ class ExpandingArrayWithOptionalElem /// the underlying type `T`. The extent of the length is checked against the /// `ExpandingArrayWithOptionalElem`'s extent parameter `D` at runtime. /*implicit*/ ExpandingArrayWithOptionalElem(at::ArrayRef values) - : ExpandingArray>(0) { + : ExpandingArray>(0) { // clang-format off TORCH_CHECK( values.size() == D, @@ -145,7 +145,7 @@ class ExpandingArrayWithOptionalElem /// underlying type `T`, which is repeated `D` times (where `D` is the extent /// parameter of the `ExpandingArrayWithOptionalElem`). 
/*implicit*/ ExpandingArrayWithOptionalElem(T single_size) - : ExpandingArray>(0) { + : ExpandingArray>(0) { for (const auto i : c10::irange(this->values_.size())) { this->values_[i] = single_size; } @@ -154,7 +154,7 @@ class ExpandingArrayWithOptionalElem /// Constructs an `ExpandingArrayWithOptionalElem` from a correctly sized /// `std::array` of the underlying type `T`. /*implicit*/ ExpandingArrayWithOptionalElem(const std::array& values) - : ExpandingArray>(0) { + : ExpandingArray>(0) { for (const auto i : c10::irange(this->values_.size())) { this->values_[i] = values[i]; } diff --git a/torch/csrc/api/include/torch/fft.h b/torch/csrc/api/include/torch/fft.h index da1f7e518ae54..d9a3430a7a249 100644 --- a/torch/csrc/api/include/torch/fft.h +++ b/torch/csrc/api/include/torch/fft.h @@ -15,9 +15,9 @@ namespace fft { /// ``` inline Tensor fft( const Tensor& self, - c10::optional n = c10::nullopt, + std::optional n = c10::nullopt, int64_t dim = -1, - c10::optional norm = c10::nullopt) { + std::optional norm = c10::nullopt) { return torch::fft_fft_symint(self, n, dim, norm); } @@ -31,9 +31,9 @@ inline Tensor fft( /// ``` inline Tensor ifft( const Tensor& self, - c10::optional n = c10::nullopt, + std::optional n = c10::nullopt, int64_t dim = -1, - c10::optional norm = c10::nullopt) { + std::optional norm = c10::nullopt) { return torch::fft_ifft_symint(self, n, dim, norm); } @@ -49,7 +49,7 @@ inline Tensor fft2( const Tensor& self, OptionalIntArrayRef s = c10::nullopt, IntArrayRef dim = {-2, -1}, - c10::optional norm = c10::nullopt) { + std::optional norm = c10::nullopt) { return torch::fft_fft2(self, s, dim, norm); } @@ -65,7 +65,7 @@ inline Tensor ifft2( const Tensor& self, at::OptionalIntArrayRef s = c10::nullopt, IntArrayRef dim = {-2, -1}, - c10::optional norm = c10::nullopt) { + std::optional norm = c10::nullopt) { return torch::fft_ifft2(self, s, dim, norm); } @@ -81,7 +81,7 @@ inline Tensor fftn( const Tensor& self, at::OptionalIntArrayRef s = c10::nullopt, at::OptionalIntArrayRef dim = c10::nullopt, - c10::optional norm = c10::nullopt) { + std::optional norm = c10::nullopt) { return torch::fft_fftn(self, s, dim, norm); } @@ -97,7 +97,7 @@ inline Tensor ifftn( const Tensor& self, at::OptionalIntArrayRef s = c10::nullopt, at::OptionalIntArrayRef dim = c10::nullopt, - c10::optional norm = c10::nullopt) { + std::optional norm = c10::nullopt) { return torch::fft_ifftn(self, s, dim, norm); } @@ -112,9 +112,9 @@ inline Tensor ifftn( /// ``` inline Tensor rfft( const Tensor& self, - c10::optional n = c10::nullopt, + std::optional n = c10::nullopt, int64_t dim = -1, - c10::optional norm = c10::nullopt) { + std::optional norm = c10::nullopt) { return torch::fft_rfft_symint(self, n, dim, norm); } @@ -131,9 +131,9 @@ inline Tensor rfft( /// ``` inline Tensor irfft( const Tensor& self, - c10::optional n = c10::nullopt, + std::optional n = c10::nullopt, int64_t dim = -1, - c10::optional norm = c10::nullopt) { + std::optional norm = c10::nullopt) { return torch::fft_irfft_symint(self, n, dim, norm); } @@ -149,7 +149,7 @@ inline Tensor rfft2( const Tensor& self, at::OptionalIntArrayRef s = c10::nullopt, IntArrayRef dim = {-2, -1}, - c10::optional norm = c10::nullopt) { + std::optional norm = c10::nullopt) { return torch::fft_rfft2(self, s, dim, norm); } @@ -165,7 +165,7 @@ inline Tensor irfft2( const Tensor& self, at::OptionalIntArrayRef s = c10::nullopt, IntArrayRef dim = {-2, -1}, - c10::optional norm = c10::nullopt) { + std::optional norm = c10::nullopt) { return torch::fft_irfft2(self, s, dim, 
norm); } @@ -181,7 +181,7 @@ inline Tensor rfftn( const Tensor& self, at::OptionalIntArrayRef s = c10::nullopt, at::OptionalIntArrayRef dim = c10::nullopt, - c10::optional norm = c10::nullopt) { + std::optional norm = c10::nullopt) { return torch::fft_rfftn(self, s, dim, norm); } @@ -197,7 +197,7 @@ inline Tensor irfftn( const Tensor& self, at::OptionalIntArrayRef s = c10::nullopt, at::OptionalIntArrayRef dim = c10::nullopt, - c10::optional norm = c10::nullopt) { + std::optional norm = c10::nullopt) { return torch::fft_irfftn(self, s, dim, norm); } @@ -215,9 +215,9 @@ inline Tensor irfftn( /// ``` inline Tensor hfft( const Tensor& self, - c10::optional n = c10::nullopt, + std::optional n = c10::nullopt, int64_t dim = -1, - c10::optional norm = c10::nullopt) { + std::optional norm = c10::nullopt) { return torch::fft_hfft_symint(self, n, dim, norm); } @@ -234,9 +234,9 @@ inline Tensor hfft( /// ``` inline Tensor ihfft( const Tensor& self, - c10::optional n = c10::nullopt, + std::optional n = c10::nullopt, int64_t dim = -1, - c10::optional norm = c10::nullopt) { + std::optional norm = c10::nullopt) { return torch::fft_ihfft_symint(self, n, dim, norm); } @@ -255,7 +255,7 @@ inline Tensor hfft2( const Tensor& self, at::OptionalIntArrayRef s = c10::nullopt, IntArrayRef dim = {-2, -1}, - c10::optional norm = c10::nullopt) { + std::optional norm = c10::nullopt) { return torch::fft_hfft2(self, s, dim, norm); } @@ -275,7 +275,7 @@ inline Tensor ihfft2( const Tensor& self, at::OptionalIntArrayRef s = c10::nullopt, IntArrayRef dim = {-2, -1}, - c10::optional norm = c10::nullopt) { + std::optional norm = c10::nullopt) { return torch::fft_ihfft2(self, s, dim, norm); } @@ -294,7 +294,7 @@ inline Tensor hfftn( const Tensor& self, at::OptionalIntArrayRef s = c10::nullopt, IntArrayRef dim = {-2, -1}, - c10::optional norm = c10::nullopt) { + std::optional norm = c10::nullopt) { return torch::fft_hfftn(self, s, dim, norm); } @@ -314,7 +314,7 @@ inline Tensor ihfftn( const Tensor& self, at::OptionalIntArrayRef s = c10::nullopt, IntArrayRef dim = {-2, -1}, - c10::optional norm = c10::nullopt) { + std::optional norm = c10::nullopt) { return torch::fft_ihfftn(self, s, dim, norm); } diff --git a/torch/csrc/api/include/torch/linalg.h b/torch/csrc/api/include/torch/linalg.h index 38010fbfcd4d2..3b398fa935b91 100644 --- a/torch/csrc/api/include/torch/linalg.h +++ b/torch/csrc/api/include/torch/linalg.h @@ -118,8 +118,8 @@ inline std::tuple lu_out( inline std::tuple lstsq( const Tensor& self, const Tensor& b, - c10::optional cond, - c10::optional driver) { + std::optional cond, + std::optional driver) { return torch::linalg_lstsq(self, b, cond, driver); } @@ -245,16 +245,16 @@ inline Tensor matrix_rank( inline Tensor matrix_rank( const Tensor& input, - c10::optional atol, - c10::optional rtol, + std::optional atol, + std::optional rtol, bool hermitian) { return torch::linalg_matrix_rank(input, atol, rtol, hermitian); } inline Tensor matrix_rank( const Tensor& input, - const c10::optional& atol, - const c10::optional& rtol, + const std::optional& atol, + const std::optional& rtol, bool hermitian) { return torch::linalg_matrix_rank(input, atol, rtol, hermitian); } @@ -278,8 +278,8 @@ inline Tensor& matrix_rank_out( inline Tensor& matrix_rank_out( Tensor& result, const Tensor& input, - c10::optional atol, - c10::optional rtol, + std::optional atol, + std::optional rtol, bool hermitian) { return torch::linalg_matrix_rank_out(result, input, atol, rtol, hermitian); } @@ -287,8 +287,8 @@ inline Tensor& matrix_rank_out( 
inline Tensor& matrix_rank_out( Tensor& result, const Tensor& input, - const c10::optional& atol, - const c10::optional& rtol, + const std::optional& atol, + const std::optional& rtol, bool hermitian) { return torch::linalg_matrix_rank_out(result, input, atol, rtol, hermitian); } @@ -382,7 +382,7 @@ inline Tensor& solve_triangular_out( inline std::tuple svd( const Tensor& input, bool full_matrices, - c10::optional driver) { + std::optional driver) { return torch::linalg_svd(input, full_matrices, driver); } @@ -392,20 +392,20 @@ inline std::tuple svd_out( Tensor& Vh, const Tensor& input, bool full_matrices, - c10::optional driver) { + std::optional driver) { return torch::linalg_svd_out(U, S, Vh, input, full_matrices, driver); } inline Tensor svdvals( const Tensor& input, - c10::optional driver) { + std::optional driver) { return torch::linalg_svdvals(input, driver); } inline Tensor& svdvals_out( Tensor& result, const Tensor& input, - c10::optional driver) { + std::optional driver) { return torch::linalg_svdvals_out(result, input, driver); } @@ -561,8 +561,8 @@ inline Tensor& householder_product_out( inline std::tuple lstsq( const Tensor& self, const Tensor& b, - c10::optional cond, - c10::optional driver) { + std::optional cond, + std::optional driver) { return detail::lstsq(self, b, cond, driver); } @@ -773,16 +773,16 @@ inline Tensor matrix_rank( inline Tensor matrix_rank( const Tensor& input, - c10::optional atol, - c10::optional rtol, + std::optional atol, + std::optional rtol, bool hermitian) { return detail::matrix_rank(input, atol, rtol, hermitian); } inline Tensor matrix_rank( const Tensor& input, - const c10::optional& atol, - const c10::optional& rtol, + const std::optional& atol, + const std::optional& rtol, bool hermitian) { return detail::matrix_rank(input, atol, rtol, hermitian); } @@ -806,8 +806,8 @@ inline Tensor& matrix_rank_out( inline Tensor& matrix_rank_out( Tensor& result, const Tensor& input, - c10::optional atol, - c10::optional rtol, + std::optional atol, + std::optional rtol, bool hermitian) { return detail::matrix_rank_out(result, input, atol, rtol, hermitian); } @@ -815,8 +815,8 @@ inline Tensor& matrix_rank_out( inline Tensor& matrix_rank_out( Tensor& result, const Tensor& input, - const c10::optional& atol, - const c10::optional& rtol, + const std::optional& atol, + const std::optional& rtol, bool hermitian) { return detail::matrix_rank_out(result, input, atol, rtol, hermitian); } @@ -976,7 +976,7 @@ inline Tensor& solve_triangular_out( inline std::tuple svd( const Tensor& input, bool full_matrices, - c10::optional driver) { + std::optional driver) { return detail::svd(input, full_matrices, driver); } @@ -986,7 +986,7 @@ inline std::tuple svd_out( Tensor& Vh, const Tensor& input, bool full_matrices, - c10::optional driver) { + std::optional driver) { return detail::svd_out(U, S, Vh, input, full_matrices, driver); } @@ -995,14 +995,14 @@ inline std::tuple svd_out( /// See https://pytorch.org/docs/main/linalg.html#torch.linalg.svdvals inline Tensor svdvals( const Tensor& input, - c10::optional driver) { + std::optional driver) { return detail::svdvals(input, driver); } inline Tensor& svdvals_out( Tensor& result, const Tensor& input, - c10::optional driver) { + std::optional driver) { return detail::svdvals_out(result, input, driver); } diff --git a/torch/csrc/api/include/torch/nested.h b/torch/csrc/api/include/torch/nested.h index 524b4d433186c..780aab4230472 100644 --- a/torch/csrc/api/include/torch/nested.h +++ b/torch/csrc/api/include/torch/nested.h @@ -72,8 
+72,8 @@ inline at::Tensor nested_tensor( /// ``` inline at::Tensor as_nested_tensor( at::TensorList list, - c10::optional dtype = c10::nullopt, - c10::optional device = c10::nullopt) { + std::optional dtype = c10::nullopt, + std::optional device = c10::nullopt) { return at::_nested_tensor_from_tensor_list( list, dtype, c10::nullopt, device, c10::nullopt); } diff --git a/torch/csrc/api/include/torch/nn/functional/activation.h b/torch/csrc/api/include/torch/nn/functional/activation.h index 9c100287f9559..89e596f71d143 100644 --- a/torch/csrc/api/include/torch/nn/functional/activation.h +++ b/torch/csrc/api/include/torch/nn/functional/activation.h @@ -233,7 +233,7 @@ namespace detail { inline Tensor softmax( const Tensor& input, int64_t dim, - c10::optional dtype) { + std::optional dtype) { Tensor ret; if (dtype == c10::nullopt) { @@ -270,7 +270,7 @@ namespace detail { inline Tensor softmin( const Tensor& input, int64_t dim, - c10::optional dtype) { + std::optional dtype) { Tensor ret; if (dtype == c10::nullopt) { @@ -307,7 +307,7 @@ namespace detail { inline Tensor log_softmax( const Tensor& input, int64_t dim, - c10::optional dtype) { + std::optional dtype) { Tensor ret; if (dtype == c10::nullopt) { diff --git a/torch/csrc/api/include/torch/nn/functional/batchnorm.h b/torch/csrc/api/include/torch/nn/functional/batchnorm.h index 487bd78ad44fe..bc6f141281b39 100644 --- a/torch/csrc/api/include/torch/nn/functional/batchnorm.h +++ b/torch/csrc/api/include/torch/nn/functional/batchnorm.h @@ -17,7 +17,7 @@ inline Tensor batch_norm( Tensor weight, Tensor bias, bool training, - c10::optional momentum, + std::optional momentum, double eps) { TORCH_CHECK( input.dim() >= 2, diff --git a/torch/csrc/api/include/torch/nn/functional/embedding.h b/torch/csrc/api/include/torch/nn/functional/embedding.h index 99432c09d36be..b06b0a3dc1e85 100644 --- a/torch/csrc/api/include/torch/nn/functional/embedding.h +++ b/torch/csrc/api/include/torch/nn/functional/embedding.h @@ -24,8 +24,8 @@ inline void _no_grad_embedding_renorm_( inline Tensor embedding( const Tensor& input, const Tensor& weight, - c10::optional padding_idx, - c10::optional max_norm, + std::optional padding_idx, + std::optional max_norm, double norm_type, bool scale_grad_by_freq, bool sparse) { @@ -90,14 +90,14 @@ inline Tensor embedding_bag( const Tensor& input, const Tensor& weight, const Tensor& offsets, - c10::optional max_norm, + std::optional max_norm, double norm_type, bool scale_grad_by_freq, EmbeddingBagMode mode, bool sparse, const Tensor& per_sample_weights, bool include_last_offset, - c10::optional padding_idx) { + std::optional padding_idx) { auto input_ = input; auto offsets_ = offsets; auto per_sample_weights_ = per_sample_weights; diff --git a/torch/csrc/api/include/torch/nn/functional/loss.h b/torch/csrc/api/include/torch/nn/functional/loss.h index 17fa2be1afc7a..c4124c2b23859 100644 --- a/torch/csrc/api/include/torch/nn/functional/loss.h +++ b/torch/csrc/api/include/torch/nn/functional/loss.h @@ -346,7 +346,7 @@ inline Tensor smooth_l1_loss( const Tensor& input, const Tensor& target, SmoothL1LossFuncOptions::reduction_t reduction, - c10::optional beta_opt = c10::nullopt) { + std::optional beta_opt = c10::nullopt) { if (target.sizes() != input.sizes()) { TORCH_WARN( "Using a target size (", @@ -656,7 +656,7 @@ inline Tensor triplet_margin_with_distance_loss( const Tensor& anchor, const Tensor& positive, const Tensor& negative, - c10::optional + std::optional distance_function, double margin, bool swap, diff --git 
a/torch/csrc/api/include/torch/nn/functional/normalization.h b/torch/csrc/api/include/torch/nn/functional/normalization.h index a45fec6ca34f9..53bd61839f745 100644 --- a/torch/csrc/api/include/torch/nn/functional/normalization.h +++ b/torch/csrc/api/include/torch/nn/functional/normalization.h @@ -16,7 +16,7 @@ inline Tensor normalize( double p, int64_t dim, double eps, - c10::optional out) { + std::optional out) { if (out == c10::nullopt) { auto denom = input.norm(p, dim, true).clamp_min(eps).expand_as(input); return input / denom; diff --git a/torch/csrc/api/include/torch/nn/functional/padding.h b/torch/csrc/api/include/torch/nn/functional/padding.h index d4b81fb53f26a..1bb6f95382904 100644 --- a/torch/csrc/api/include/torch/nn/functional/padding.h +++ b/torch/csrc/api/include/torch/nn/functional/padding.h @@ -27,7 +27,7 @@ inline Tensor pad( TORCH_CHECK(false, "Unrecognised padding mode"); }(); - c10::optional fill_value; + std::optional fill_value; if (value != 0.0) { fill_value = value; } diff --git a/torch/csrc/api/include/torch/nn/functional/pooling.h b/torch/csrc/api/include/torch/nn/functional/pooling.h index 9f9708ce657ec..be3009f62201a 100644 --- a/torch/csrc/api/include/torch/nn/functional/pooling.h +++ b/torch/csrc/api/include/torch/nn/functional/pooling.h @@ -57,7 +57,7 @@ inline Tensor avg_pool2d( ExpandingArray<2> padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { return torch::avg_pool2d( input, kernel_size, @@ -104,7 +104,7 @@ inline Tensor avg_pool3d( ExpandingArray<3> padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { return torch::avg_pool3d( input, kernel_size, @@ -632,7 +632,7 @@ inline std::vector _unpool_output_size( const IntArrayRef& kernel_size, const IntArrayRef& stride, const IntArrayRef& padding, - const c10::optional>& output_size) { + const std::optional>& output_size) { auto input_size = input.sizes(); std::vector default_size; for (const auto d : c10::irange(kernel_size.size())) { @@ -688,7 +688,7 @@ inline Tensor max_unpool1d( ExpandingArray<1> kernel_size, ExpandingArray<1> stride, ExpandingArray<1> padding, - const c10::optional>& output_size) { + const std::optional>& output_size) { auto output_size_ = _unpool_output_size(input, kernel_size, stride, padding, output_size); output_size_.push_back(1); @@ -733,7 +733,7 @@ inline Tensor max_unpool2d( ExpandingArray<2> kernel_size, ExpandingArray<2> stride, ExpandingArray<2> padding, - const c10::optional>& output_size) { + const std::optional>& output_size) { auto output_size_ = _unpool_output_size(input, kernel_size, stride, padding, output_size); @@ -776,7 +776,7 @@ inline Tensor max_unpool3d( ExpandingArray<3> kernel_size, ExpandingArray<3> stride, ExpandingArray<3> padding, - const c10::optional>& output_size) { + const std::optional>& output_size) { auto output_size_ = _unpool_output_size(input, kernel_size, stride, padding, output_size); @@ -817,8 +817,8 @@ namespace detail { inline std::tuple fractional_max_pool2d_with_indices( const Tensor& input, const ExpandingArray<2>& kernel_size, - const c10::optional>& output_size, - const c10::optional>& output_ratio, + const std::optional>& output_size, + const std::optional>& output_ratio, const Tensor& _random_samples) { if (output_size == c10::nullopt && output_ratio == c10::nullopt) { TORCH_CHECK( @@ -826,7 +826,7 @@ inline std::tuple fractional_max_pool2d_with_indices( "fractional_max_pool2d requires specifying 
either ", "an output_size or an output_ratio"); } - c10::optional> output_size_ = output_size; + std::optional> output_size_ = output_size; if (output_size_ == c10::nullopt) { TORCH_INTERNAL_ASSERT(output_ratio != c10::nullopt); output_size_ = { @@ -875,8 +875,8 @@ namespace detail { inline Tensor fractional_max_pool2d( const Tensor& input, ExpandingArray<2> kernel_size, - c10::optional> output_size, - c10::optional> output_ratio, + std::optional> output_size, + std::optional> output_ratio, const Tensor& _random_samples) { return std::get<0>(fractional_max_pool2d_with_indices( input, kernel_size, output_size, output_ratio, _random_samples)); @@ -910,8 +910,8 @@ namespace detail { inline std::tuple fractional_max_pool3d_with_indices( const Tensor& input, const ExpandingArray<3>& kernel_size, - const c10::optional>& output_size, - const c10::optional>& output_ratio, + const std::optional>& output_size, + const std::optional>& output_ratio, const Tensor& _random_samples) { if (output_size == c10::nullopt && output_ratio == c10::nullopt) { TORCH_CHECK( @@ -920,7 +920,7 @@ inline std::tuple fractional_max_pool3d_with_indices( "an output_size or an output_ratio"); } - c10::optional> output_size_ = output_size; + std::optional> output_size_ = output_size; if (output_size_ == c10::nullopt) { TORCH_INTERNAL_ASSERT(output_ratio != c10::nullopt); output_size_ = { @@ -971,8 +971,8 @@ namespace detail { inline Tensor fractional_max_pool3d( const Tensor& input, ExpandingArray<3> kernel_size, - c10::optional> output_size, - c10::optional> output_ratio, + std::optional> output_size, + std::optional> output_ratio, const Tensor& _random_samples) { return std::get<0>(fractional_max_pool3d_with_indices( input, kernel_size, output_size, output_ratio, _random_samples)); diff --git a/torch/csrc/api/include/torch/nn/functional/upsampling.h b/torch/csrc/api/include/torch/nn/functional/upsampling.h index 8fe1b3f00f85d..38c5c51f9a475 100644 --- a/torch/csrc/api/include/torch/nn/functional/upsampling.h +++ b/torch/csrc/api/include/torch/nn/functional/upsampling.h @@ -15,9 +15,9 @@ inline std::vector _interp_output_size( int64_t dim, std::tuple< Tensor, - c10::optional>, - c10::optional>, - c10::optional> closed_over_args) { + std::optional>, + std::optional>, + std::optional> closed_over_args) { auto [input, size, scale_factor, recompute_scale_factor] = closed_over_args; if (size == c10::nullopt && scale_factor == c10::nullopt) { TORCH_CHECK(false, "either size or scale_factor should be defined"); @@ -75,11 +75,11 @@ inline std::vector _interp_output_size( namespace detail { inline Tensor interpolate( const Tensor& input, - const c10::optional>& size, - const c10::optional>& scale_factor, + const std::optional>& size, + const std::optional>& scale_factor, InterpolateFuncOptions::mode_t mode, - c10::optional align_corners, - c10::optional recompute_scale_factor, + std::optional align_corners, + std::optional recompute_scale_factor, bool antialias) { if (std::holds_alternative(mode) || std::get_if(&mode)) { @@ -113,7 +113,7 @@ inline Tensor interpolate( ")"); auto scale_factor_len = input.dim() - 2; - std::vector> scale_factor_list( + std::vector> scale_factor_list( scale_factor_len, c10::nullopt); if (scale_factor != c10::nullopt && !recompute_scale_factor.value_or(false)) { auto _scale_factor_repeated = *scale_factor; diff --git a/torch/csrc/api/include/torch/nn/functional/vision.h b/torch/csrc/api/include/torch/nn/functional/vision.h index e9cb1eb11ac0f..a6c53e0c0a9ad 100644 --- 
a/torch/csrc/api/include/torch/nn/functional/vision.h +++ b/torch/csrc/api/include/torch/nn/functional/vision.h @@ -59,7 +59,7 @@ inline Tensor grid_sample( const Tensor& grid, GridSampleFuncOptions::mode_t mode, GridSampleFuncOptions::padding_mode_t padding_mode, - c10::optional align_corners) { + std::optional align_corners) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t mode_enum, padding_mode_enum; diff --git a/torch/csrc/api/include/torch/nn/modules/conv.h b/torch/csrc/api/include/torch/nn/modules/conv.h index 65a2d6905c0a9..9c55254ddb910 100644 --- a/torch/csrc/api/include/torch/nn/modules/conv.h +++ b/torch/csrc/api/include/torch/nn/modules/conv.h @@ -315,7 +315,7 @@ class ConvTransposeNdImpl : public ConvNdImpl { std::vector _output_padding( const Tensor& input, - const c10::optional& output_size, + const std::optional& output_size, const ExpandingArray& stride, const ExpandingArray& padding, const ExpandingArray& kernel_size); @@ -350,10 +350,10 @@ class TORCH_API ConvTranspose1dImpl explicit ConvTranspose1dImpl(ConvTranspose1dOptions options_); Tensor forward( const Tensor& input, - const c10::optional& output_size = c10::nullopt); + const std::optional& output_size = c10::nullopt); protected: - FORWARD_HAS_DEFAULT_ARGS({1, AnyValue(c10::optional())}) + FORWARD_HAS_DEFAULT_ARGS({1, AnyValue(std::optional())}) }; /// A `ModuleHolder` subclass for `ConvTranspose1dImpl`. @@ -392,10 +392,10 @@ class TORCH_API ConvTranspose2dImpl explicit ConvTranspose2dImpl(ConvTranspose2dOptions options_); Tensor forward( const Tensor& input, - const c10::optional& output_size = c10::nullopt); + const std::optional& output_size = c10::nullopt); protected: - FORWARD_HAS_DEFAULT_ARGS({1, AnyValue(c10::optional())}) + FORWARD_HAS_DEFAULT_ARGS({1, AnyValue(std::optional())}) }; /// A `ModuleHolder` subclass for `ConvTranspose2dImpl`. @@ -434,10 +434,10 @@ class TORCH_API ConvTranspose3dImpl explicit ConvTranspose3dImpl(ConvTranspose3dOptions options_); Tensor forward( const Tensor& input, - const c10::optional& output_size = c10::nullopt); + const std::optional& output_size = c10::nullopt); protected: - FORWARD_HAS_DEFAULT_ARGS({1, AnyValue(c10::optional())}) + FORWARD_HAS_DEFAULT_ARGS({1, AnyValue(std::optional())}) }; /// A `ModuleHolder` subclass for `ConvTranspose3dImpl`. diff --git a/torch/csrc/api/include/torch/nn/modules/pooling.h b/torch/csrc/api/include/torch/nn/modules/pooling.h index a9db131b0dd08..6bcdca463b1ba 100644 --- a/torch/csrc/api/include/torch/nn/modules/pooling.h +++ b/torch/csrc/api/include/torch/nn/modules/pooling.h @@ -507,10 +507,10 @@ class TORCH_API MaxUnpool1dImpl : public MaxUnpoolImpl<1, MaxUnpool1dImpl> { Tensor forward( const Tensor& input, const Tensor& indices, - const c10::optional>& output_size = c10::nullopt); + const std::optional>& output_size = c10::nullopt); protected: - FORWARD_HAS_DEFAULT_ARGS({2, AnyValue(c10::optional>())}) + FORWARD_HAS_DEFAULT_ARGS({2, AnyValue(std::optional>())}) }; /// A `ModuleHolder` subclass for `MaxUnpool1dImpl`. @@ -539,10 +539,10 @@ class TORCH_API MaxUnpool2dImpl : public MaxUnpoolImpl<2, MaxUnpool2dImpl> { Tensor forward( const Tensor& input, const Tensor& indices, - const c10::optional>& output_size = c10::nullopt); + const std::optional>& output_size = c10::nullopt); protected: - FORWARD_HAS_DEFAULT_ARGS({2, AnyValue(c10::optional>())}) + FORWARD_HAS_DEFAULT_ARGS({2, AnyValue(std::optional>())}) }; /// A `ModuleHolder` subclass for `MaxUnpool2dImpl`. 
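Aside (not part of this patch): the optional output_size carried by these unpool signatures mirrors the Python API, where it disambiguates the recovered spatial shape. A small sketch:

import torch

pool = torch.nn.MaxPool2d(2, stride=2, return_indices=True)
unpool = torch.nn.MaxUnpool2d(2, stride=2)
x = torch.randn(1, 1, 5, 5)
out, indices = pool(x)
# without output_size the unpooled result would be 4x4; passing the original
# size recovers the odd 5x5 input shape
y = unpool(out, indices, output_size=x.size())
print(y.shape)  # torch.Size([1, 1, 5, 5])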
@@ -571,10 +571,10 @@ class TORCH_API MaxUnpool3dImpl : public MaxUnpoolImpl<3, MaxUnpool3dImpl> { Tensor forward( const Tensor& input, const Tensor& indices, - const c10::optional>& output_size = c10::nullopt); + const std::optional>& output_size = c10::nullopt); protected: - FORWARD_HAS_DEFAULT_ARGS({2, AnyValue(c10::optional>())}) + FORWARD_HAS_DEFAULT_ARGS({2, AnyValue(std::optional>())}) }; /// A `ModuleHolder` subclass for `MaxUnpool3dImpl`. diff --git a/torch/csrc/api/include/torch/nn/modules/utils.h b/torch/csrc/api/include/torch/nn/modules/utils.h index 6d3d383465f33..869027a241492 100644 --- a/torch/csrc/api/include/torch/nn/modules/utils.h +++ b/torch/csrc/api/include/torch/nn/modules/utils.h @@ -32,7 +32,7 @@ inline std::vector _reverse_repeat_vector( } inline std::vector _list_with_default( - torch::ArrayRef> out_size, + torch::ArrayRef> out_size, torch::IntArrayRef defaults) { TORCH_CHECK( defaults.size() > out_size.size(), diff --git a/torch/csrc/api/include/torch/nn/options/activation.h b/torch/csrc/api/include/torch/nn/options/activation.h index e51805d364852..165212e0e860c 100644 --- a/torch/csrc/api/include/torch/nn/options/activation.h +++ b/torch/csrc/api/include/torch/nn/options/activation.h @@ -252,7 +252,7 @@ struct TORCH_API SoftmaxFuncOptions { /// If specified, the input tensor is casted to `dtype` before the operation /// is performed. This is useful for preventing data type overflows. Default: /// None. - TORCH_ARG(c10::optional, dtype) = c10::nullopt; + TORCH_ARG(std::optional, dtype) = c10::nullopt; }; } // namespace functional @@ -293,7 +293,7 @@ struct TORCH_API SoftminFuncOptions { /// If specified, the input tensor is casted to `dtype` before the operation /// is performed. This is useful for preventing data type overflows. Default: /// None. - TORCH_ARG(c10::optional, dtype) = c10::nullopt; + TORCH_ARG(std::optional, dtype) = c10::nullopt; }; } // namespace functional @@ -334,7 +334,7 @@ struct TORCH_API LogSoftmaxFuncOptions { /// If specified, the input tensor is casted to `dtype` before the operation /// is performed. This is useful for preventing data type overflows. Default: /// None. - TORCH_ARG(c10::optional, dtype) = c10::nullopt; + TORCH_ARG(std::optional, dtype) = c10::nullopt; }; } // namespace functional diff --git a/torch/csrc/api/include/torch/nn/options/batchnorm.h b/torch/csrc/api/include/torch/nn/options/batchnorm.h index cd2d7f164203e..943673e2aae74 100644 --- a/torch/csrc/api/include/torch/nn/options/batchnorm.h +++ b/torch/csrc/api/include/torch/nn/options/batchnorm.h @@ -21,7 +21,7 @@ struct TORCH_API BatchNormOptions { /// A momentum multiplier for the mean and variance. /// Changing this parameter after construction __is effective__. - TORCH_ARG(c10::optional, momentum) = 0.1; + TORCH_ARG(std::optional, momentum) = 0.1; /// Whether to learn a scale and bias that are applied in an affine /// transformation on the input. @@ -82,7 +82,7 @@ struct TORCH_API BatchNormFuncOptions { /// A momentum multiplier for the mean and variance. /// Changing this parameter after construction __is effective__. - TORCH_ARG(c10::optional, momentum) = 0.1; + TORCH_ARG(std::optional, momentum) = 0.1; /// The epsilon value added for numerical stability. /// Changing this parameter after construction __is effective__. 
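Aside (not part of this patch): momentum being an optional is meaningful in the Python API as well, where passing None switches the running statistics to a cumulative moving average instead of an exponential one. A minimal sketch:

import torch

bn = torch.nn.BatchNorm1d(4, momentum=None)  # None => cumulative moving average
bn.train()
for _ in range(3):
    bn(torch.randn(8, 4))
print(bn.running_mean)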
diff --git a/torch/csrc/api/include/torch/nn/options/embedding.h b/torch/csrc/api/include/torch/nn/options/embedding.h index d8d06716308e1..20eacf9073355 100644 --- a/torch/csrc/api/include/torch/nn/options/embedding.h +++ b/torch/csrc/api/include/torch/nn/options/embedding.h @@ -28,10 +28,10 @@ struct TORCH_API EmbeddingOptions { /// Embedding, the embedding vector at `padding_idx` will default to all /// zeros, but can be updated to another value to be used as the padding /// vector. - TORCH_ARG(c10::optional, padding_idx) = c10::nullopt; + TORCH_ARG(std::optional, padding_idx) = c10::nullopt; /// If given, each embedding vector with norm larger than `max_norm` is /// renormalized to have norm `max_norm`. - TORCH_ARG(c10::optional, max_norm) = c10::nullopt; + TORCH_ARG(std::optional, max_norm) = c10::nullopt; /// The p of the p-norm to compute for the `max_norm` option. Default ``2``. TORCH_ARG(double, norm_type) = 2.; /// If given, this will scale gradients by the inverse of frequency of the @@ -55,10 +55,10 @@ struct TORCH_API EmbeddingFromPretrainedOptions { /// If specified, the entries at `padding_idx` do not contribute to the /// gradient; therefore, the embedding vector at `padding_idx` is not updated /// during training, i.e. it remains as a fixed "pad". - TORCH_ARG(c10::optional, padding_idx) = c10::nullopt; + TORCH_ARG(std::optional, padding_idx) = c10::nullopt; /// If given, each embedding vector with norm larger than `max_norm` is /// renormalized to have norm `max_norm`. - TORCH_ARG(c10::optional, max_norm) = c10::nullopt; + TORCH_ARG(std::optional, max_norm) = c10::nullopt; /// The p of the p-norm to compute for the `max_norm` option. Default ``2``. TORCH_ARG(double, norm_type) = 2.; /// If given, this will scale gradients by the inverse of frequency of the @@ -84,10 +84,10 @@ struct TORCH_API EmbeddingFuncOptions { /// If specified, the entries at `padding_idx` do not contribute to the /// gradient; therefore, the embedding vector at `padding_idx` is not updated /// during training, i.e. it remains as a fixed "pad". - TORCH_ARG(c10::optional, padding_idx) = c10::nullopt; + TORCH_ARG(std::optional, padding_idx) = c10::nullopt; /// If given, each embedding vector with norm larger than `max_norm` is /// renormalized to have norm `max_norm`. - TORCH_ARG(c10::optional, max_norm) = c10::nullopt; + TORCH_ARG(std::optional, max_norm) = c10::nullopt; /// The p of the p-norm to compute for the `max_norm` option. Default ``2``. TORCH_ARG(double, norm_type) = 2.; /// If given, this will scale gradients by the inverse of frequency of the @@ -120,7 +120,7 @@ struct TORCH_API EmbeddingBagOptions { TORCH_ARG(int64_t, embedding_dim); /// If given, each embedding vector with norm larger than `max_norm` is /// renormalized to have norm `max_norm`. - TORCH_ARG(c10::optional, max_norm) = c10::nullopt; + TORCH_ARG(std::optional, max_norm) = c10::nullopt; /// The p of the p-norm to compute for the `max_norm` option. Default ``2``. TORCH_ARG(double, norm_type) = 2.; /// If given, this will scale gradients by the inverse of frequency of the @@ -148,7 +148,7 @@ struct TORCH_API EmbeddingBagOptions { /// zeros, but can be updated to another value to be used as the padding /// vector. Note that the embedding vector at `padding_idx` is excluded from /// the reduction. 
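Aside (not part of this patch): the padding_idx semantics documented here match the Python nn.EmbeddingBag behavior, where positions holding the padding index are skipped in the reduction. A small sketch:

import torch

bag = torch.nn.EmbeddingBag(10, 3, mode="sum", padding_idx=0)
out = bag(torch.tensor([[2, 0, 0], [4, 5, 0]]))
# positions holding index 0 contribute nothing: row 0 reduces over one real
# embedding, row 1 over two
print(out.shape)  # torch.Size([2, 3])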
- TORCH_ARG(c10::optional, padding_idx) = c10::nullopt; + TORCH_ARG(std::optional, padding_idx) = c10::nullopt; }; // ============================================================================ @@ -161,7 +161,7 @@ struct TORCH_API EmbeddingBagFromPretrainedOptions { TORCH_ARG(bool, freeze) = true; /// If given, each embedding vector with norm larger than `max_norm` is /// renormalized to have norm `max_norm`. - TORCH_ARG(c10::optional, max_norm) = c10::nullopt; + TORCH_ARG(std::optional, max_norm) = c10::nullopt; /// The p of the p-norm to compute for the `max_norm` option. Default ``2``. TORCH_ARG(double, norm_type) = 2.; /// If given, this will scale gradients by the inverse of frequency of the @@ -184,7 +184,7 @@ struct TORCH_API EmbeddingBagFromPretrainedOptions { /// gradient; therefore, the embedding vector at padding_idx is not updated /// during training, i.e. it remains as a fixed "pad". Note that the embedding /// vector at `padding_idx` is excluded from the reduction. - TORCH_ARG(c10::optional, padding_idx) = c10::nullopt; + TORCH_ARG(std::optional, padding_idx) = c10::nullopt; }; // ============================================================================ @@ -205,7 +205,7 @@ struct TORCH_API EmbeddingBagFuncOptions { TORCH_ARG(torch::Tensor, offsets) = Tensor(); /// If given, each embedding vector with norm larger than `max_norm` is /// renormalized to have norm `max_norm`. - TORCH_ARG(c10::optional, max_norm) = c10::nullopt; + TORCH_ARG(std::optional, max_norm) = c10::nullopt; /// The p of the p-norm to compute for the `max_norm` option. Default ``2``. TORCH_ARG(double, norm_type) = 2.; /// If given, this will scale gradients by the inverse of frequency of the @@ -233,7 +233,7 @@ struct TORCH_API EmbeddingBagFuncOptions { /// gradient; therefore, the embedding vector at padding_idx is not updated /// during training, i.e. it remains as a fixed "pad". Note that the embedding /// vector at `padding_idx` is excluded from the reduction. - TORCH_ARG(c10::optional, padding_idx) = c10::nullopt; + TORCH_ARG(std::optional, padding_idx) = c10::nullopt; }; } // namespace functional diff --git a/torch/csrc/api/include/torch/nn/options/loss.h b/torch/csrc/api/include/torch/nn/options/loss.h index c9eb2b66f3e0b..f1fc7a4d41115 100644 --- a/torch/csrc/api/include/torch/nn/options/loss.h +++ b/torch/csrc/api/include/torch/nn/options/loss.h @@ -450,7 +450,7 @@ struct TORCH_API TripletMarginWithDistanceLossOptions { /// Specifies a nonnegative, real-valued function that quantifies the /// closeness of two tensors. If not specified, `F::pairwise_distance` will /// be used. Default: nullopt - TORCH_ARG(c10::optional, distance_function) = + TORCH_ARG(std::optional, distance_function) = c10::nullopt; /// Specifies a nonnegative margin representing the minimum difference /// between the positive and negative distances required for the loss to be 0. @@ -548,7 +548,7 @@ struct TORCH_API SmoothL1LossOptions { /// Specifies the threshold at which to change between L1 and L2 loss. /// If beta is not specified, a value of 1.0 will be used. 
/// Default: nullopt - TORCH_ARG(c10::optional, beta) = c10::nullopt; + TORCH_ARG(std::optional, beta) = c10::nullopt; }; namespace functional { diff --git a/torch/csrc/api/include/torch/nn/options/normalization.h b/torch/csrc/api/include/torch/nn/options/normalization.h index ae8c206736d50..a1e5b1a0aeab1 100644 --- a/torch/csrc/api/include/torch/nn/options/normalization.h +++ b/torch/csrc/api/include/torch/nn/options/normalization.h @@ -133,7 +133,7 @@ struct TORCH_API NormalizeFuncOptions { TORCH_ARG(double, eps) = 1e-12; /// the output tensor. If `out` is used, this /// operation won't be differentiable. - TORCH_ARG(c10::optional, out) = c10::nullopt; + TORCH_ARG(std::optional, out) = c10::nullopt; }; } // namespace functional diff --git a/torch/csrc/api/include/torch/nn/options/pooling.h b/torch/csrc/api/include/torch/nn/options/pooling.h index 41de605e90fb0..8f6cee99bff6a 100644 --- a/torch/csrc/api/include/torch/nn/options/pooling.h +++ b/torch/csrc/api/include/torch/nn/options/pooling.h @@ -32,7 +32,7 @@ struct AvgPoolOptions { /// if specified, it will be used as divisor, otherwise size of the pooling /// region will be used. - TORCH_ARG(c10::optional, divisor_override) = c10::nullopt; + TORCH_ARG(std::optional, divisor_override) = c10::nullopt; }; /// `AvgPoolOptions` specialized for the `AvgPool1d` module. @@ -401,7 +401,7 @@ struct MaxUnpoolFuncOptions { TORCH_ARG(ExpandingArray, padding) = 0; /// the targeted output size - TORCH_ARG(c10::optional>, output_size) = c10::nullopt; + TORCH_ARG(std::optional>, output_size) = c10::nullopt; }; /// `MaxUnpoolFuncOptions` specialized for @@ -450,12 +450,12 @@ struct FractionalMaxPoolOptions { TORCH_ARG(ExpandingArray, kernel_size); /// the target output size of the image - TORCH_ARG(c10::optional>, output_size) = c10::nullopt; + TORCH_ARG(std::optional>, output_size) = c10::nullopt; /// If one wants to have an output size as a ratio of the input size, this /// option can be given. This has to be a number or tuple in the range (0, 1) using ExpandingArrayDouble = torch::ExpandingArray; - TORCH_ARG(c10::optional, output_ratio) = c10::nullopt; + TORCH_ARG(std::optional, output_ratio) = c10::nullopt; TORCH_ARG(torch::Tensor, _random_samples) = Tensor(); }; diff --git a/torch/csrc/api/include/torch/nn/options/upsampling.h b/torch/csrc/api/include/torch/nn/options/upsampling.h index ca793beb97725..21df2b89998de 100644 --- a/torch/csrc/api/include/torch/nn/options/upsampling.h +++ b/torch/csrc/api/include/torch/nn/options/upsampling.h @@ -20,10 +20,10 @@ namespace nn { /// ``` struct TORCH_API UpsampleOptions { /// output spatial sizes. - TORCH_ARG(c10::optional>, size) = c10::nullopt; + TORCH_ARG(std::optional>, size) = c10::nullopt; /// multiplier for spatial size. - TORCH_ARG(c10::optional>, scale_factor) = c10::nullopt; + TORCH_ARG(std::optional>, scale_factor) = c10::nullopt; /// the upsampling algorithm: one of "nearest", "linear", "bilinear", /// "bicubic" and "trilinear". Default: "nearest" @@ -40,7 +40,7 @@ struct TORCH_API UpsampleOptions { /// aligned, and thus preserving the values at those pixels. This only has /// effect when :attr:`mode` is "linear", "bilinear", "bicubic", or /// "trilinear". Default: "False" - TORCH_ARG(c10::optional, align_corners) = c10::nullopt; + TORCH_ARG(std::optional, align_corners) = c10::nullopt; }; namespace functional { @@ -65,10 +65,10 @@ struct TORCH_API InterpolateFuncOptions { mode_t; /// output spatial sizes. 
- TORCH_ARG(c10::optional>, size) = c10::nullopt; + TORCH_ARG(std::optional>, size) = c10::nullopt; /// multiplier for spatial size. - TORCH_ARG(c10::optional>, scale_factor) = c10::nullopt; + TORCH_ARG(std::optional>, scale_factor) = c10::nullopt; /// the upsampling algorithm: one of "nearest", "linear", "bilinear", /// "bicubic", "trilinear", "area", "nearest-exact". Default: "nearest" @@ -83,7 +83,7 @@ struct TORCH_API InterpolateFuncOptions { /// this operation *independent* of input size when `scale_factor` is /// kept the same. It is *required* when interpolating mode is "linear", /// "bilinear", "bicubic" or "trilinear". Default: "False" - TORCH_ARG(c10::optional, align_corners) = c10::nullopt; + TORCH_ARG(std::optional, align_corners) = c10::nullopt; /// recompute the scale_factor for use in the /// interpolation calculation. When `scale_factor` is passed as a parameter, @@ -95,7 +95,7 @@ struct TORCH_API InterpolateFuncOptions { /// used in the interpolation computation. Note that when `scale_factor` is /// floating-point, the recomputed scale_factor may differ from the one passed /// in due to rounding and precision issues. - TORCH_ARG(c10::optional, recompute_scale_factor) = c10::nullopt; + TORCH_ARG(std::optional, recompute_scale_factor) = c10::nullopt; /// flag to apply anti-aliasing. Using anti-alias /// option together with :attr:`align_corners` equals "False", interpolation diff --git a/torch/csrc/api/include/torch/nn/options/vision.h b/torch/csrc/api/include/torch/nn/options/vision.h index 814f4b6684d96..c012b40d21f69 100644 --- a/torch/csrc/api/include/torch/nn/options/vision.h +++ b/torch/csrc/api/include/torch/nn/options/vision.h @@ -28,7 +28,7 @@ struct TORCH_API GridSampleFuncOptions { /// padding mode for outside grid values. Default: Zeros TORCH_ARG(padding_mode_t, padding_mode) = torch::kZeros; /// Specifies perspective to pixel as point. Default: false - TORCH_ARG(c10::optional, align_corners) = c10::nullopt; + TORCH_ARG(std::optional, align_corners) = c10::nullopt; }; } // namespace functional diff --git a/torch/csrc/api/include/torch/nn/pimpl.h b/torch/csrc/api/include/torch/nn/pimpl.h index d66d83c257ebd..a5a71a01c833c 100644 --- a/torch/csrc/api/include/torch/nn/pimpl.h +++ b/torch/csrc/api/include/torch/nn/pimpl.h @@ -140,27 +140,13 @@ class ModuleHolder : torch::detail::ModuleHolderIndicator { } private: - /// In C++17, the two methods below could be written as the following: - /// if constexpr (std::is_default_constructible_v) { - /// return std::make_shared(); - /// } else { - /// return nullptr; - /// } - /// In C++11, we use SFINAE instead of `if constexpr`. - - template < - typename T = Contained, - typename = torch::enable_if_t::value>> - std::shared_ptr default_construct() { - return std::make_shared(); - } - template - torch::disable_if_t< - std::is_default_constructible::value, - std::shared_ptr> - default_construct() { - return nullptr; + std::shared_ptr default_construct() { + if constexpr (std::is_default_constructible_v) { + return std::make_shared(); + } else { + return nullptr; + } } }; diff --git a/torch/csrc/api/include/torch/nn/utils/clip_grad.h b/torch/csrc/api/include/torch/nn/utils/clip_grad.h index e1023bd1eb5c7..fbb533662c7be 100644 --- a/torch/csrc/api/include/torch/nn/utils/clip_grad.h +++ b/torch/csrc/api/include/torch/nn/utils/clip_grad.h @@ -64,7 +64,7 @@ inline double clip_grad_norm_( // synchronizing the CPU and the gradients' device until the very end to // preserve async execution on the device. 
When checking for finite-ness, this // optional ensures we only sync once. - c10::optional total_norm = c10::nullopt; + std::optional total_norm = c10::nullopt; if (error_if_nonfinite) { total_norm = total_norm_tensor.item().toDouble(); TORCH_CHECK( diff --git a/torch/csrc/api/include/torch/nn/utils/convert_parameters.h b/torch/csrc/api/include/torch/nn/utils/convert_parameters.h index 2ac1d317c9922..6f62d483c4d8b 100644 --- a/torch/csrc/api/include/torch/nn/utils/convert_parameters.h +++ b/torch/csrc/api/include/torch/nn/utils/convert_parameters.h @@ -11,9 +11,9 @@ namespace utils { // in the same device. Currently, the conversion between model parameters // and single vector form is not supported for multiple allocations, // e.g. parameters in different GPUs, or mixture of CPU/GPU. -inline c10::optional _check_param_device( +inline std::optional _check_param_device( const torch::Tensor& param, - c10::optional old_param_device) { + std::optional old_param_device) { // Meet the first parameter if (old_param_device == c10::nullopt) { old_param_device = param.is_cuda() ? param.get_device() : -1; @@ -38,7 +38,7 @@ inline c10::optional _check_param_device( // Convert parameters to one vector inline torch::Tensor parameters_to_vector( const std::vector& parameters) { - c10::optional param_device; + std::optional param_device; std::vector vec; vec.reserve(parameters.size()); @@ -58,7 +58,7 @@ inline void vector_to_parameters( const torch::Tensor& vec, const std::vector& parameters) { // Flag for the device where the parameter is located - c10::optional param_device; + std::optional param_device; // Pointer for slicing the vector for each parameter int64_t pointer = 0; diff --git a/torch/csrc/api/include/torch/nn/utils/rnn.h b/torch/csrc/api/include/torch/nn/utils/rnn.h index eea517a2b60f3..ba8b0db427150 100644 --- a/torch/csrc/api/include/torch/nn/utils/rnn.h +++ b/torch/csrc/api/include/torch/nn/utils/rnn.h @@ -247,7 +247,7 @@ inline std::tuple pad_packed_sequence( PackedSequence sequence, bool batch_first = false, double padding_value = 0.0, - c10::optional total_length = torch::nullopt) { + std::optional total_length = torch::nullopt) { int64_t max_seq_length = sequence.batch_sizes().size(0); if (total_length.has_value()) { int64_t total_length_val = total_length.value(); diff --git a/torch/csrc/api/include/torch/optim/lbfgs.h b/torch/csrc/api/include/torch/optim/lbfgs.h index 99aa35d36e4b5..001b0cd33f259 100644 --- a/torch/csrc/api/include/torch/optim/lbfgs.h +++ b/torch/csrc/api/include/torch/optim/lbfgs.h @@ -17,11 +17,11 @@ struct TORCH_API LBFGSOptions : public OptimizerCloneableOptions { LBFGSOptions(double lr = 1); TORCH_ARG(double, lr) = 1; TORCH_ARG(int64_t, max_iter) = 20; - TORCH_ARG(c10::optional, max_eval) = c10::nullopt; + TORCH_ARG(std::optional, max_eval) = c10::nullopt; TORCH_ARG(double, tolerance_grad) = 1e-7; TORCH_ARG(double, tolerance_change) = 1e-9; TORCH_ARG(int64_t, history_size) = 100; - TORCH_ARG(c10::optional, line_search_fn) = c10::nullopt; + TORCH_ARG(std::optional, line_search_fn) = c10::nullopt; public: void serialize(torch::serialize::InputArchive& archive) override; @@ -45,7 +45,7 @@ struct TORCH_API LBFGSParamState TORCH_ARG(std::deque, old_dirs); TORCH_ARG(std::deque, old_stps); TORCH_ARG(std::deque, ro); - TORCH_ARG(c10::optional>, al) = c10::nullopt; + TORCH_ARG(std::optional>, al) = c10::nullopt; public: void serialize(torch::serialize::InputArchive& archive) override; @@ -82,7 +82,7 @@ class TORCH_API LBFGS : public Optimizer { void 
load(serialize::InputArchive& archive) override; private: - c10::optional _numel_cache; + std::optional _numel_cache; int64_t _numel(); Tensor _gather_flat_grad(); void _add_grad(const double step_size, const Tensor& update); diff --git a/torch/csrc/api/include/torch/serialize/input-archive.h b/torch/csrc/api/include/torch/serialize/input-archive.h index 83d1a543ddacb..f77b34aad0bd4 100644 --- a/torch/csrc/api/include/torch/serialize/input-archive.h +++ b/torch/csrc/api/include/torch/serialize/input-archive.h @@ -76,27 +76,27 @@ class TORCH_API InputArchive final { /// is not specified, the module is loaded to the original device. void load_from( const std::string& filename, - c10::optional device = c10::nullopt); + std::optional device = c10::nullopt); /// Loads the `InputArchive` from a serialized representation stored in the /// given `stream`. Storage are remapped using device option. If device /// is not specified, the module is loaded to the original device. void load_from( std::istream& stream, - c10::optional device = c10::nullopt); + std::optional device = c10::nullopt); // Loads given the specified flat array. void load_from( const char* data, size_t size, - c10::optional device = c10::nullopt); + std::optional device = c10::nullopt); // Loads given the specified read and size functions. void load_from( const std::function& read_func, const std::function& size_func, - c10::optional device = c10::nullopt); + std::optional device = c10::nullopt); // Returns the vector of keys in the input archive. std::vector keys(); diff --git a/torch/csrc/api/include/torch/special.h b/torch/csrc/api/include/torch/special.h index 7ad7e7689ebd6..d8346e1aa1d8c 100644 --- a/torch/csrc/api/include/torch/special.h +++ b/torch/csrc/api/include/torch/special.h @@ -596,7 +596,7 @@ inline Tensor& log1p_out(Tensor& result, const Tensor& self) { inline Tensor log_softmax( const Tensor& self, int64_t dim, - c10::optional dtype) { + std::optional dtype) { return torch::special_log_softmax(self, dim, dtype); } @@ -611,7 +611,7 @@ inline Tensor log_softmax( inline Tensor softmax( const Tensor& self, int64_t dim, - c10::optional dtype) { + std::optional dtype) { return torch::special_softmax(self, dim, dtype); } diff --git a/torch/csrc/api/include/torch/types.h b/torch/csrc/api/include/torch/types.h index 92be710cf4bf4..8a23cd122b8d1 100644 --- a/torch/csrc/api/include/torch/types.h +++ b/torch/csrc/api/include/torch/types.h @@ -39,7 +39,7 @@ namespace torch { using namespace at; // NOLINT using c10::nullopt; -using c10::optional; +using std::optional; using Dtype = at::ScalarType; diff --git a/torch/csrc/api/src/nn/modules/conv.cpp b/torch/csrc/api/src/nn/modules/conv.cpp index 20be11f221838..197c3cf0725cd 100644 --- a/torch/csrc/api/src/nn/modules/conv.cpp +++ b/torch/csrc/api/src/nn/modules/conv.cpp @@ -169,12 +169,12 @@ template class ConvNdImpl<3, Conv3dImpl>; template std::vector ConvTransposeNdImpl::_output_padding( const Tensor& input, - const c10::optional& output_size, + const std::optional& output_size, const ExpandingArray& stride, const ExpandingArray& padding, const ExpandingArray& kernel_size) { std::vector ret; - c10::optional output_size_ = output_size; + std::optional output_size_ = output_size; if (output_size_ == c10::nullopt) { ret = at::IntArrayRef(this->options.output_padding()).vec(); @@ -248,7 +248,7 @@ ConvTranspose1dImpl::ConvTranspose1dImpl(ConvTranspose1dOptions options_) Tensor ConvTranspose1dImpl::forward( const Tensor& input, - const c10::optional& output_size) { + const 
std::optional& output_size) { if (!std::get_if(&options.padding_mode())) { TORCH_CHECK( false, "Only `zeros` padding mode is supported for ConvTranspose1d"); @@ -285,7 +285,7 @@ ConvTranspose2dImpl::ConvTranspose2dImpl(ConvTranspose2dOptions options_) Tensor ConvTranspose2dImpl::forward( const Tensor& input, - const c10::optional& output_size) { + const std::optional& output_size) { if (!std::get_if(&options.padding_mode())) { TORCH_CHECK( false, "Only `zeros` padding mode is supported for ConvTranspose2d"); @@ -322,7 +322,7 @@ ConvTranspose3dImpl::ConvTranspose3dImpl(ConvTranspose3dOptions options_) Tensor ConvTranspose3dImpl::forward( const Tensor& input, - const c10::optional& output_size) { + const std::optional& output_size) { if (!std::get_if(&options.padding_mode())) { TORCH_CHECK( false, "Only `zeros` padding mode is supported for ConvTranspose3d"); diff --git a/torch/csrc/api/src/nn/modules/pooling.cpp b/torch/csrc/api/src/nn/modules/pooling.cpp index 1a3f29e235507..0b11b914dcc1c 100644 --- a/torch/csrc/api/src/nn/modules/pooling.cpp +++ b/torch/csrc/api/src/nn/modules/pooling.cpp @@ -229,7 +229,7 @@ void MaxUnpoolImpl::pretty_print(std::ostream& stream) const { Tensor MaxUnpool1dImpl::forward( const Tensor& input, const Tensor& indices, - const c10::optional>& output_size) { + const std::optional>& output_size) { return F::detail::max_unpool1d( input, indices, @@ -242,7 +242,7 @@ Tensor MaxUnpool1dImpl::forward( Tensor MaxUnpool2dImpl::forward( const Tensor& input, const Tensor& indices, - const c10::optional>& output_size) { + const std::optional>& output_size) { return F::detail::max_unpool2d( input, indices, @@ -255,7 +255,7 @@ Tensor MaxUnpool2dImpl::forward( Tensor MaxUnpool3dImpl::forward( const Tensor& input, const Tensor& indices, - const c10::optional>& output_size) { + const std::optional>& output_size) { return F::detail::max_unpool3d( input, indices, diff --git a/torch/csrc/api/src/optim/lbfgs.cpp b/torch/csrc/api/src/optim/lbfgs.cpp index bf54e9a878618..10739be623869 100644 --- a/torch/csrc/api/src/optim/lbfgs.cpp +++ b/torch/csrc/api/src/optim/lbfgs.cpp @@ -67,7 +67,7 @@ bool if_container_equal(T lhs, T rhs) { } bool operator==(const LBFGSParamState& lhs, const LBFGSParamState& rhs) { - auto isNull = [](const c10::optional>& val) { + auto isNull = [](const std::optional>& val) { return val == c10::nullopt; }; return (lhs.func_evals() == rhs.func_evals()) && @@ -194,7 +194,7 @@ static double _cubic_interpolate( double x2, double f2, double g2, - c10::optional> bounds = c10::nullopt) { + std::optional> bounds = c10::nullopt) { // ported from https://github.com/torch/optim/blob/master/polyinterp.lua // Compute bounds of interpolation area // NOLINTNEXTLINE(cppcoreguidelines-init-variables) diff --git a/torch/csrc/api/src/serialize/input-archive.cpp b/torch/csrc/api/src/serialize/input-archive.cpp index c18a041293aea..852f4eab1b52b 100644 --- a/torch/csrc/api/src/serialize/input-archive.cpp +++ b/torch/csrc/api/src/serialize/input-archive.cpp @@ -93,20 +93,20 @@ void InputArchive::read(const std::string& key, InputArchive& archive) { void InputArchive::load_from( const std::string& filename, - c10::optional device /*= c10::nullopt*/) { + std::optional device /*= c10::nullopt*/) { module_ = torch::jit::load(filename, std::move(device)); } void InputArchive::load_from( std::istream& stream, - c10::optional device /*= c10::nullopt*/) { + std::optional device /*= c10::nullopt*/) { module_ = torch::jit::load(stream, std::move(device)); } void InputArchive::load_from( const 
char* data, size_t size, - c10::optional device /*= c10::nullopt*/) { + std::optional device /*= c10::nullopt*/) { using caffe2::serialize::ReadAdapterInterface; class OurAdapter : public ReadAdapterInterface { public: @@ -136,7 +136,7 @@ void InputArchive::load_from( void InputArchive::load_from( const std::function& read_func, const std::function& size_func, - c10::optional device /*= c10::nullopt*/) { + std::optional device /*= c10::nullopt*/) { using caffe2::serialize::ReadAdapterInterface; class OurAdapter : public ReadAdapterInterface { public: diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp index 4c0c324ad56ec..65c7fbb853610 100644 --- a/torch/csrc/autograd/FunctionsManual.cpp +++ b/torch/csrc/autograd/FunctionsManual.cpp @@ -60,19 +60,19 @@ Tensor apply_loss_reduction(const Tensor& unreduced, int64_t reduction) { return unreduced; } -static bool isDefined(const c10::optional& t) { +static bool isDefined(const std::optional& t) { return t.has_value() && t->defined(); } -Tensor toNonOptTensor(const c10::optional& t) { +Tensor toNonOptTensor(const std::optional& t) { return t.has_value() ? *t : Tensor(); } -Tensor toNonOptFwGrad(const c10::optional& t) { +Tensor toNonOptFwGrad(const std::optional& t) { return (t.has_value() && t->defined()) ? t->_fw_grad(/*level */ 0) : Tensor(); } -Tensor toNonOptPrimal(const c10::optional& t) { +Tensor toNonOptPrimal(const std::optional& t) { if (t.has_value() && t->defined()) { if (t->unsafeGetTensorImpl()->is_wrapped_number()) { return *t; @@ -605,7 +605,7 @@ Tensor div_tensor_self_backward( const Tensor& grad, T other, ScalarType self_st, - const c10::optional& rounding_mode) { + const std::optional& rounding_mode) { if (rounding_mode.has_value()) { return at::zeros_like(grad, grad.options().dtype(self_st)); } @@ -617,12 +617,12 @@ template Tensor div_tensor_self_backward( const Tensor&, Tensor, ScalarType, - const c10::optional&); + const std::optional&); template Tensor div_tensor_self_backward( const Tensor&, Scalar, ScalarType, - const c10::optional&); + const std::optional&); template Tensor div_tensor_self_backward( @@ -639,7 +639,7 @@ Tensor div_tensor_other_backward( const Tensor& grad, const Tensor& self, const Tensor& other, - const c10::optional& rounding_mode) { + const std::optional& rounding_mode) { if (rounding_mode.has_value()) { return at::zeros_like(grad, grad.options().dtype(other.scalar_type())); } @@ -1289,7 +1289,7 @@ Tensor convolution_jvp( at::SymIntArrayRef output_padding, const c10::SymInt& groups) { auto bias_t_opt = - bias_t.defined() ? c10::optional(bias_t) : c10::nullopt; + bias_t.defined() ? std::optional(bias_t) : c10::nullopt; return ( at::convolution_symint( input_t, @@ -1331,7 +1331,7 @@ Tensor _convolution_jvp( bool cudnn_enabled, bool allow_tf32) { auto bias_t_opt = - bias_t.defined() ? c10::optional(bias_t) : c10::nullopt; + bias_t.defined() ? 
std::optional(bias_t) : c10::nullopt; return ( at::_convolution_symint( input_t, @@ -1520,8 +1520,8 @@ static Tensor sparse_mask_like_grad( std::tuple sparse_sampled_addmm_backward( const Tensor& grad, const Tensor& self, - const c10::optional& mat1, - const c10::optional& mat2, + const std::optional& mat1, + const std::optional& mat2, const Scalar& alpha, const Scalar& beta, const std::array& grad_input_mask) { @@ -1819,7 +1819,7 @@ Tensor var_backward( Tensor grad, const Tensor& self, at::OptionalIntArrayRef dim_opt, - const c10::optional& correction_opt, + const std::optional& correction_opt, bool keepdim) { const auto correction = correction_opt.value_or(1).toSymFloat(); if (self.dim() == 0 || !dim_opt.has_value()) { @@ -1852,7 +1852,7 @@ Tensor std_backward( const Tensor& grad, const Tensor& self, at::OptionalIntArrayRef dim, - const c10::optional& correction_opt, + const std::optional& correction_opt, bool keepdim) { auto grad_var = (grad / (result * 2)).masked_fill_(result == 0, 0); return var_backward(std::move(grad_var), self, dim, correction_opt, keepdim); @@ -1863,7 +1863,7 @@ Tensor var_mean_backward( const Tensor& gmean, const Tensor& self, at::OptionalIntArrayRef dim_opt, - const c10::optional& correction_opt, + const std::optional& correction_opt, bool keepdim) { Tensor gself; if (gvar.defined()) { @@ -1887,7 +1887,7 @@ Tensor std_mean_backward( const Tensor& self, const Tensor& std, at::OptionalIntArrayRef dim_opt, - const c10::optional& correction_opt, + const std::optional& correction_opt, bool keepdim) { Tensor gself; if (gstd.defined()) { @@ -2241,7 +2241,7 @@ Tensor infinitely_differentiable_mish_backward( Tensor infinitely_differentiable_logit_backward( const Tensor& grad, const Tensor& self, - c10::optional eps) { + std::optional eps) { if (eps) { const double lo = eps.value(); const double hi = 1.0 - lo; @@ -2262,7 +2262,7 @@ Tensor binary_cross_entropy_target_backward( const Tensor& grad, const Tensor& self, const Tensor& target, - const c10::optional& weight, + const std::optional& weight, int64_t reduction) { auto grad_target = at::logit(self).neg_(); @@ -2295,7 +2295,7 @@ Tensor binary_cross_entropy_double_backward_target( const Tensor& grad_output, const Tensor& self, const Tensor& target, - const c10::optional& weight, + const std::optional& weight, int64_t reduction) { auto res = -grad * grad_output; @@ -2332,8 +2332,8 @@ Tensor binary_cross_entropy_with_logits_backward( const Tensor& grad, const Tensor& input, const Tensor& target, - const c10::optional& weight, - const c10::optional& pos_weight, + const std::optional& weight, + const std::optional& pos_weight, int64_t reduction) { // Trivial case if (grad._is_zerotensor()) { @@ -2387,8 +2387,8 @@ Tensor binary_cross_entropy_with_logits_target_backward( const Tensor& grad_output, const Tensor& self, const Tensor& target, - const c10::optional& weight, - const c10::optional& pos_weight, + const std::optional& weight, + const std::optional& pos_weight, int64_t reduction) { if (grad_output._is_zerotensor()) { return at::_efficientzerotensor(target.sizes(), target.options()); @@ -2479,7 +2479,7 @@ Tensor binary_cross_entropy_double_backward( const Tensor& grad, const Tensor& input, const Tensor& target, - const c10::optional& weight, + const std::optional& weight, int64_t reduction) { auto eps = 1e-12; auto inp_pl_eps = input + eps; @@ -2514,7 +2514,7 @@ Tensor binary_cross_entropy_double_backward_grad_output( const Tensor& grad, const Tensor& input, const Tensor& target, - const c10::optional& weight, + const 
std::optional& weight, int64_t reduction) { auto eps = 1e-12; // gradient wrt grad_output @@ -3186,7 +3186,7 @@ Tensor as_strided_backward( auto storage = grad.new_zeros_symint(c10::SymIntArrayRef(base_size)); // prepare indices tensor if we will do index_add_ later - c10::optional flatten_full_indices; + std::optional flatten_full_indices; if (inp_maybe_overlap || out_maybe_overlap) { flatten_full_indices = // TODO: should we symint-ify arange? Need SymScalar. @@ -3334,8 +3334,8 @@ Tensor slice_backward_wrapper( const at::Tensor& grad, const c10::SymIntArrayRef& input_sizes, int64_t dim, - c10::optional start, - c10::optional end, + std::optional start, + std::optional end, c10::SymInt step) { auto start_val = start.has_value() ? start.value() : 0; auto end_val = end.has_value() ? end.value() : INT64_MAX; @@ -4617,17 +4617,17 @@ static Tensor expand_as_dim1(const Tensor& src, const Tensor& target) { std::tuple batchnorm_double_backward( const Tensor& input, - const c10::optional& gamma, + const std::optional& gamma, const Tensor& ggI, const Tensor& ggG, const Tensor& ggB, const Tensor& gO, - const c10::optional& running_mean, - const c10::optional& running_var, + const std::optional& running_mean, + const std::optional& running_var, bool training, double eps, - const c10::optional& save_mean, - const c10::optional& save_invstd, + const std::optional& save_mean, + const std::optional& save_invstd, std::array output_mask) { bool affine = isDefined(gamma); // TODO: Do we have a ScalarOrTensor type? Would such a thing exist? @@ -4756,7 +4756,7 @@ std::tuple batchnorm_double_backward( std::tuple layer_norm_double_backward( const Tensor& input_t, - const c10::optional& gamma, + const std::optional& gamma, const Tensor& ggI, const Tensor& ggG, const Tensor& ggB, @@ -4905,7 +4905,7 @@ infinitely_differentiable_native_group_norm_backward( const Tensor& X, const Tensor& mean, const Tensor& rstd, - const c10::optional& gamma, + const std::optional& gamma, c10::SymInt N, const c10::SymInt& C, c10::SymInt HxW, @@ -4987,9 +4987,9 @@ infinitely_differentiable_native_group_norm_backward( std::tuple _trilinear_backward( const Tensor& grad_out, - const c10::optional& i1, - const c10::optional& i2, - const c10::optional& i3, + const std::optional& i1, + const std::optional& i2, + const std::optional& i3, IntArrayRef expand1, IntArrayRef expand2, IntArrayRef expand3, @@ -5083,7 +5083,7 @@ Tensor embedding_dense_double_backward_symint( Tensor index_backward( Tensor zeros_like_self, - const torch::List>& indices, + const torch::List>& indices, const Tensor& grad) { return (areAnyTensorSubclassLike({zeros_like_self, grad}) || areAnyOptionalTensorSubclassLike(indices)) @@ -6120,7 +6120,7 @@ static Tensor _norm_jvp( // Computes the jvp for `input * weight + bias` where weight and bias may be // undefined Possibly modifies the input inplace static Tensor _affine_jvp( - const c10::optional& input_p, + const std::optional& input_p, Tensor& input_t, const Tensor& weight_p, const Tensor& weight_t, @@ -6161,8 +6161,8 @@ Tensor batch_norm_jvp( const Tensor& weight_t, const Tensor& bias_p, const Tensor& bias_t, - const c10::optional& running_mean, - const c10::optional& running_var, + const std::optional& running_mean, + const std::optional& running_var, const Tensor& saved_mean, const Tensor& saved_invstd, bool train, @@ -6198,8 +6198,8 @@ Tensor batch_norm_jvp( result_t = input_t * invstd_p; } - c10::optional result_p = weight_p.defined() - ? 
c10::optional((input_p - mean_p) * invstd_p) + std::optional result_p = weight_p.defined() + ? std::optional((input_p - mean_p) * invstd_p) : c10::nullopt; return _affine_jvp( result_p, @@ -6237,8 +6237,8 @@ Tensor layer_norm_jvp( auto invstd_p = saved_invstd.view(view_size); auto result_t = _norm_jvp(input_p, input_t, mean_p, invstd_p, dims, numel); - c10::optional result_p = weight_p.defined() - ? c10::optional((input_p - mean_p) * invstd_p) + std::optional result_p = weight_p.defined() + ? std::optional((input_p - mean_p) * invstd_p) : c10::nullopt; return _affine_jvp( result_p, @@ -6280,7 +6280,7 @@ Tensor group_norm_jvp( /*eps=*/0) .view(input_shape); - c10::optional result_p = c10::nullopt; + std::optional result_p = c10::nullopt; if (weight_p.defined()) { std::vector view_size(input_t_reshaped.dim(), 1); view_size[1] = input_t_reshaped.size(1); @@ -6983,9 +6983,9 @@ mkldnn_rnn_layer_differentiable_backward( const Tensor& output, const Tensor& hy_, const Tensor& cy_, - const c10::optional& grad_output_r_opt, - const c10::optional& grad_hy_r_opt, - const c10::optional& grad_cy_r_opt, + const std::optional& grad_output_r_opt, + const std::optional& grad_hy_r_opt, + const std::optional& grad_cy_r_opt, bool reverse, int64_t mode, int64_t hidden_size, diff --git a/torch/csrc/autograd/FunctionsManual.h b/torch/csrc/autograd/FunctionsManual.h index c78f2b80c806a..dedff70be1ba3 100644 --- a/torch/csrc/autograd/FunctionsManual.h +++ b/torch/csrc/autograd/FunctionsManual.h @@ -31,14 +31,14 @@ struct TORCH_API IndexRangeGenerator { size_t i = 0; }; -TORCH_API Tensor toNonOptFwGrad(const c10::optional& t); -TORCH_API Tensor toNonOptPrimal(const c10::optional& t); -TORCH_API Tensor toNonOptTensor(const c10::optional& t); +TORCH_API Tensor toNonOptFwGrad(const std::optional& t); +TORCH_API Tensor toNonOptPrimal(const std::optional& t); +TORCH_API Tensor toNonOptTensor(const std::optional& t); -TORCH_API inline c10::optional wrap_opt_if( +TORCH_API inline std::optional wrap_opt_if( const Tensor& t, const bool cond) { - using OptTensor = c10::optional; + using OptTensor = std::optional; return cond ? 
OptTensor(t) : static_cast(c10::nullopt); } @@ -154,12 +154,12 @@ at::Tensor div_tensor_self_backward( const Tensor& grad, T other, ScalarType self_st, - const c10::optional& rounding_mode); + const std::optional& rounding_mode); at::Tensor div_tensor_other_backward( const Tensor& grad, const Tensor& self, const Tensor& other, - const c10::optional& rounding_mode); + const std::optional& rounding_mode); at::Tensor mvlgamma_backward( const at::Tensor& grad, const at::Tensor& self, @@ -314,8 +314,8 @@ at::Tensor mm_mat1_sparse_backward( std::tuple sparse_sampled_addmm_backward( const Tensor& grad, const Tensor& self, - const c10::optional& mat1, - const c10::optional& mat2, + const std::optional& mat1, + const std::optional& mat2, const Scalar& alpha, const Scalar& beta, const std::array& grad_input_mask); @@ -367,21 +367,21 @@ at::Tensor var_backward( at::Tensor grad, const at::Tensor& self, at::OptionalIntArrayRef dim, - const c10::optional& correction, + const std::optional& correction, bool keepdim); at::Tensor var_jvp( const at::Tensor& self_t, const at::Tensor& self_p, const at::Tensor& result, at::OptionalIntArrayRef dim_opt, - const c10::optional& correction, + const std::optional& correction, bool keepdim); at::Tensor std_backward( const at::Tensor& result, const at::Tensor& grad, const at::Tensor& self, at::OptionalIntArrayRef dim, - const c10::optional& correction, + const std::optional& correction, bool keepdim); Tensor mean_backward( const Tensor& grad, @@ -394,7 +394,7 @@ Tensor var_mean_backward( const Tensor& gmean, const Tensor& self, at::OptionalIntArrayRef dim_opt, - const c10::optional& correction, + const std::optional& correction, bool keepdim); Tensor std_mean_backward( const Tensor& gstd, @@ -402,7 +402,7 @@ Tensor std_mean_backward( const Tensor& self, const Tensor& std, at::OptionalIntArrayRef dim_opt, - const c10::optional& correction, + const std::optional& correction, bool keepdim); at::Tensor cholesky_backward( const at::Tensor& grad, @@ -465,33 +465,33 @@ at::Tensor infinitely_differentiable_mish_backward( Tensor infinitely_differentiable_logit_backward( const Tensor& grad, const Tensor& self, - c10::optional eps); + std::optional eps); Tensor binary_cross_entropy_target_backward( const Tensor& grad, const Tensor& self, const Tensor& target, - const c10::optional& weight, + const std::optional& weight, int64_t reduction); Tensor binary_cross_entropy_double_backward_target( const Tensor& grad, const Tensor& grad_output, const Tensor& self, const Tensor& target, - const c10::optional& weight, + const std::optional& weight, int64_t reduction); Tensor binary_cross_entropy_with_logits_backward( const Tensor& grad, const Tensor& input, const Tensor& target, - const c10::optional& weight_opt, - const c10::optional& pos_weight_opt, + const std::optional& weight_opt, + const std::optional& pos_weight_opt, int64_t reduction); at::Tensor binary_cross_entropy_with_logits_target_backward( const at::Tensor& grad_output, const at::Tensor& self, const at::Tensor& target, - const c10::optional& weight, - const c10::optional& pos_weight, + const std::optional& weight, + const std::optional& pos_weight, int64_t reduction); at::Tensor log_sigmoid_double_backward( const at::Tensor& grad, @@ -506,13 +506,13 @@ at::Tensor binary_cross_entropy_double_backward( const at::Tensor& grad, const at::Tensor& input, const at::Tensor& target, - const c10::optional& weight, + const std::optional& weight, int64_t reduction); at::Tensor binary_cross_entropy_double_backward_grad_output( const 
at::Tensor& grad, const at::Tensor& input, const at::Tensor& target, - const c10::optional& weight, + const std::optional& weight, int64_t reduction); at::Tensor smooth_l1_loss_double_backward( const at::Tensor& grad, @@ -577,7 +577,7 @@ at::Tensor embedding_dense_double_backward_symint( const c10::SymInt& padding_idx); at::Tensor index_backward( at::Tensor zeros_like_self, - const torch::List>& indices, + const torch::List>& indices, const at::Tensor& grad); at::Tensor _cudnn_ctc_loss_backward( const at::Tensor& grad_out, @@ -611,8 +611,8 @@ Tensor slice_backward_wrapper( const at::Tensor& grad, const c10::SymIntArrayRef& input_sizes, int64_t dim, - c10::optional start, - c10::optional end, + std::optional start, + std::optional end, c10::SymInt step); std::tuple linalg_eig_jvp( const Tensor& dA, @@ -667,9 +667,9 @@ std::tuple linalg_solve_triangular_backward( std::array output_mask); std::tuple _trilinear_backward( const Tensor& grad_out, - const c10::optional& i1, - const c10::optional& i2, - const c10::optional& i3, + const std::optional& i1, + const std::optional& i2, + const std::optional& i3, IntArrayRef expand1, IntArrayRef expand2, IntArrayRef expand3, @@ -692,17 +692,17 @@ Tensor linalg_matrix_exp_differential( bool adjoint); std::tuple batchnorm_double_backward( const Tensor& input, - const c10::optional& gamma, + const std::optional& gamma, const Tensor& ggI, const Tensor& ggG, const Tensor& ggB, const Tensor& gO, - const c10::optional& running_mean, - const c10::optional& running_var, + const std::optional& running_mean, + const std::optional& running_var, bool training, double eps, - const c10::optional& save_mean, - const c10::optional& save_invstd, + const std::optional& save_mean, + const std::optional& save_invstd, std::array output_mask); std::tuple _euclidean_dist_backward( const Tensor& grad, @@ -752,7 +752,7 @@ infinitely_differentiable_native_group_norm_backward( const Tensor& X, const Tensor& mean, const Tensor& rstd, - const c10::optional& gamma, + const std::optional& gamma, c10::SymInt N, const c10::SymInt& C, c10::SymInt HxW, @@ -790,7 +790,7 @@ Tensor amaxamin_jvp( bool keepdim); std::tuple layer_norm_double_backward( const Tensor& input, - const c10::optional& gamma, + const std::optional& gamma, const Tensor& ggI, const Tensor& ggG, const Tensor& ggB, @@ -919,8 +919,8 @@ Tensor batch_norm_jvp( const Tensor& weight_t, const Tensor& bias_p, const Tensor& bias_t, - const c10::optional& running_mean, - const c10::optional& running_var, + const std::optional& running_mean, + const std::optional& running_var, const Tensor& saved_mean, const Tensor& saved_invstd, bool train, @@ -1082,9 +1082,9 @@ mkldnn_rnn_layer_differentiable_backward( const Tensor& output, const Tensor& hy_, const Tensor& cy_, - const c10::optional& grad_output_r_opt, - const c10::optional& grad_hy_r_opt, - const c10::optional& grad_cy_r_opt, + const std::optional& grad_output_r_opt, + const std::optional& grad_hy_r_opt, + const std::optional& grad_cy_r_opt, bool reverse, int64_t mode, int64_t hidden_size, diff --git a/torch/csrc/autograd/TraceTypeManual.cpp b/torch/csrc/autograd/TraceTypeManual.cpp index 4134ef6d992ba..46e4014d8dd13 100644 --- a/torch/csrc/autograd/TraceTypeManual.cpp +++ b/torch/csrc/autograd/TraceTypeManual.cpp @@ -51,7 +51,7 @@ Tensor& copy_(Tensor& self, const Tensor& src, bool non_blocking) { const Tensor& resize_( const Tensor& self, IntArrayRef size, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { if (torch::jit::tracer::isTracing()) { 
if (jit::tracer::ArgumentStash::hasIntArrayRef("size")) { jit::tracer::ArgumentStash::popIntArrayRef("size"); @@ -70,7 +70,7 @@ const Tensor& resize_( const Tensor& resize_as_( const Tensor& self, const Tensor& the_template, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { if (torch::jit::tracer::isTracing()) { jit::tracer::warn("resize_as_", jit::tracer::WARN_RESIZE); jit::tracer::delValueTrace(self); diff --git a/torch/csrc/autograd/VariableTypeManual.cpp b/torch/csrc/autograd/VariableTypeManual.cpp index 38a63640c11e6..20f66694677e8 100644 --- a/torch/csrc/autograd/VariableTypeManual.cpp +++ b/torch/csrc/autograd/VariableTypeManual.cpp @@ -240,7 +240,7 @@ const Tensor& resize_( c10::DispatchKeySet ks, const Tensor& self, SymIntArrayRef size, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { auto& self_ = unpack(self, "self", 0); if (self.requires_grad()) { AT_ERROR("cannot resize variables that require grad"); @@ -262,7 +262,7 @@ const Tensor& resize_as_( c10::DispatchKeySet ks, const Tensor& self, const Tensor& the_template, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { auto& self_ = unpack(self, "self", 0); auto& the_template_ = unpack(the_template, "the_template", 1); if (self.requires_grad()) { @@ -400,7 +400,7 @@ static const Tensor& resize_( c10::DispatchKeySet ks, const Tensor& self, SymIntArrayRef size, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { // Hold sizes to verify if we actually resize `self`. // Explicitly copy data, since resizing can move original data // and make references invalid. @@ -424,7 +424,7 @@ static const Tensor& resize_as_( c10::DispatchKeySet ks, const Tensor& self, const Tensor& the_template, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { // Hold sizes to verify if we actually resize `self`. // Explicitly copy data, since resizing can move original data // and make references invalid. diff --git a/torch/csrc/autograd/VariableTypeUtils.h b/torch/csrc/autograd/VariableTypeUtils.h index b8fa4b6c101a7..d5fe8a70dae17 100644 --- a/torch/csrc/autograd/VariableTypeUtils.h +++ b/torch/csrc/autograd/VariableTypeUtils.h @@ -166,7 +166,7 @@ struct Flatten : IterArgs { void operator()(const at::Tensor& x) { out.emplace_back(x); } - void operator()(const c10::optional& x) { + void operator()(const std::optional& x) { if (x.has_value()) out.emplace_back(x.value()); } @@ -233,8 +233,8 @@ inline at::Tensor as_view( } // If they cannot be shared, create the required view infos - c10::optional new_bw_info; - c10::optional new_fw_info; + std::optional new_bw_info; + std::optional new_fw_info; if (is_bw_differentiable) { auto bw_view_func = view_func ? 
view_func->clone_and_set() : nullptr; @@ -298,7 +298,7 @@ inline void check_no_requires_grad( } inline void check_no_requires_grad( - const c10::optional& tensor, + const std::optional& tensor, const char* name, const char* fn_name = "") { if (tensor.has_value()) { @@ -320,14 +320,14 @@ inline void check_no_requires_grad( } inline void check_no_requires_grad( - const c10::List>& tensors, + const c10::List>& tensors, const char* name, const char* fn_name = "") { // GradMode check is expensive, so check it only once for TensorLists if (!GradMode::is_enabled()) { return; } - for (c10::optional tensor : tensors) { + for (std::optional tensor : tensors) { if (tensor.has_value()) { check_no_requires_grad(*tensor, name, fn_name, /*check_grad_mode*/ false); } @@ -345,11 +345,11 @@ inline std::vector make_saved_variable_list( // Assumed that saved tensor lists are never inplace outputs inline std::vector make_saved_variable_list( - const c10::List>& tensors, + const c10::List>& tensors, const bool is_output = false) { return fmap( tensors, - [&is_output](const c10::optional& tensor) -> SavedVariable { + [&is_output](const std::optional& tensor) -> SavedVariable { if (tensor.has_value()) { return SavedVariable{*tensor, is_output /* is output */}; } else { diff --git a/torch/csrc/autograd/autograd.cpp b/torch/csrc/autograd/autograd.cpp index fd4265619fccd..4a550e7006389 100644 --- a/torch/csrc/autograd/autograd.cpp +++ b/torch/csrc/autograd/autograd.cpp @@ -165,7 +165,7 @@ static variable_list run_backward( void backward( const variable_list& tensors, const variable_list& grad_tensors, - c10::optional retain_graph, + std::optional retain_graph, bool create_graph, const variable_list& inputs) { variable_list gradients = _make_grads(tensors, grad_tensors); @@ -186,7 +186,7 @@ variable_list grad( const variable_list& outputs, const variable_list& inputs, const variable_list& grad_outputs, - c10::optional retain_graph, + std::optional retain_graph, bool create_graph, bool allow_unused) { variable_list gradients = _make_grads(outputs, grad_outputs); diff --git a/torch/csrc/autograd/autograd.h b/torch/csrc/autograd/autograd.h index 3537df9bc4a7d..94ee179225a4c 100644 --- a/torch/csrc/autograd/autograd.h +++ b/torch/csrc/autograd/autograd.h @@ -47,7 +47,7 @@ namespace torch::autograd { TORCH_API void backward( const variable_list& tensors, const variable_list& grad_tensors = {}, - c10::optional retain_graph = c10::nullopt, + std::optional retain_graph = c10::nullopt, bool create_graph = false, const variable_list& inputs = {}); @@ -81,7 +81,7 @@ TORCH_API variable_list grad( const variable_list& outputs, const variable_list& inputs, const variable_list& grad_outputs = {}, - c10::optional retain_graph = c10::nullopt, + std::optional retain_graph = c10::nullopt, bool create_graph = false, bool allow_unused = false); diff --git a/torch/csrc/autograd/autograd_not_implemented_fallback.cpp b/torch/csrc/autograd/autograd_not_implemented_fallback.cpp index 2cfca6817e855..acc8986efa6a2 100644 --- a/torch/csrc/autograd/autograd_not_implemented_fallback.cpp +++ b/torch/csrc/autograd/autograd_not_implemented_fallback.cpp @@ -339,12 +339,12 @@ static void autogradNotImplementedFallbackImpl( std::vector(stack->begin() + stack_start, stack->end()); std::vector> impl_saved; impl_saved.reserve(num_tensor_inputs); - std::vector> storage_saved; + std::vector> storage_saved; storage_saved.reserve(num_tensor_inputs); _foreach_tensor( [&](size_t idx, size_t _, const at::Tensor& t) { storage_saved.push_back( - t.has_storage() ? 
c10::optional(t.storage()) + t.has_storage() ? std::optional(t.storage()) : c10::nullopt); impl_saved.push_back(t.getIntrusivePtr()); }, diff --git a/torch/csrc/autograd/custom_function.cpp b/torch/csrc/autograd/custom_function.cpp index 41e2f1991a52b..1cf94bbe048fe 100644 --- a/torch/csrc/autograd/custom_function.cpp +++ b/torch/csrc/autograd/custom_function.cpp @@ -28,7 +28,7 @@ namespace torch::autograd { static void _process_forward_mode_AD( const variable_list& inputs, std::unordered_map inputs_mapping, - const at::ArrayRef> raw_outputs, + const at::ArrayRef> raw_outputs, const optional_variable_list& outputs, const std::unordered_set& non_differentiable, const std::unordered_set& dirty_inputs, @@ -258,7 +258,7 @@ static optional_variable_list _process_backward_mode_ad( const std::unordered_map& inputs_mapping, const std::unordered_set& non_differentiable, const std::unordered_set& dirty_inputs, - const at::ArrayRef> raw_outputs, + const at::ArrayRef> raw_outputs, const std::shared_ptr& cdata, const std::unordered_set& to_save_if_setup_context, const _view_as_self_fn_t& view_as_self_fn) { @@ -438,7 +438,7 @@ optional_variable_list _wrap_outputs( const variable_list& input_vars, const std::unordered_set& non_differentiable, const std::unordered_set& dirty_inputs, - const at::ArrayRef> raw_outputs, + const at::ArrayRef> raw_outputs, const std::shared_ptr& cdata, const _jvp_fn_t& jvp_user_function, const std::unordered_set& to_save_if_setup_context, diff --git a/torch/csrc/autograd/custom_function.h b/torch/csrc/autograd/custom_function.h index ebabc45334a5d..8c20bd8078207 100644 --- a/torch/csrc/autograd/custom_function.h +++ b/torch/csrc/autograd/custom_function.h @@ -12,15 +12,15 @@ namespace torch::autograd { -using optional_variable_list = std::vector>; +using optional_variable_list = std::vector>; using _jvp_fn_t = std::function; using _view_as_self_fn_t = std::function; -TORCH_API std::vector> _wrap_outputs( +TORCH_API std::vector> _wrap_outputs( const variable_list& input_vars, const std::unordered_set& non_differentiable, const std::unordered_set& dirty_inputs, - const at::ArrayRef> raw_outputs, + const at::ArrayRef> raw_outputs, const std::shared_ptr& cdata, const _jvp_fn_t& jvp_user_function, const std::unordered_set& to_save_if_setup_context, @@ -41,7 +41,7 @@ using forward_t = decltype(X::forward(nullptr, std::declval()...)); /// `forward` can take as many arguments as you want and should return either a /// variable list or a Variable. Use of any direct Variable arguments will be /// registered in the graph but no vectors/sets or any other data structures -/// will be traversed. You can use c10::optional as one of the arguments +/// will be traversed. You can use std::optional as one of the arguments /// and it will be registered as a variable in the graph if the argument has a /// value. It should take a pointer to `torch::autograd::AutogradContext` as the /// first argument. 
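(A concrete, hypothetical sketch of the pattern this comment describes: an optional tensor argument is registered in the graph only when it holds a value. It is not part of this change and assumes `bias`, when present, has the same shape as `input`.)

#include <torch/torch.h>
#include <optional>

struct AddBias : public torch::autograd::Function<AddBias> {
  // The std::optional argument is picked up by the autograd machinery
  // only when it contains a defined tensor.
  static torch::Tensor forward(
      torch::autograd::AutogradContext* ctx,
      const torch::Tensor& input,
      const std::optional<torch::Tensor>& bias) {
    ctx->saved_data["has_bias"] = bias.has_value();
    return bias.has_value() ? input + *bias : input;
  }

  static torch::autograd::variable_list backward(
      torch::autograd::AutogradContext* ctx,
      torch::autograd::variable_list grad_outputs) {
    auto grad = grad_outputs[0];
    // One gradient per forward input; an undefined tensor stands in for
    // "no gradient" when the optional bias was absent.
    auto grad_bias = ctx->saved_data["has_bias"].toBool() ? grad : torch::Tensor();
    return {grad, grad_bias};
  }
};

// Usage sketch:
//   auto y = AddBias::apply(x, std::optional<torch::Tensor>(b));  // bias participates
//   auto z = AddBias::apply(x, std::optional<torch::Tensor>());   // no bias in the graph
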
Variables can be saved in the `ctx` using @@ -247,7 +247,7 @@ struct ExtractVariables : IterArgs { variable_list& list_; ExtractVariables(std::vector& is_var, variable_list& list) : is_var_(is_var), list_(list) {} - void operator()(const c10::optional& x) { + void operator()(const std::optional& x) { // NOLINTNEXTLINE(bugprone-branch-clone) if (x.has_value() && x.value().defined()) { is_var_.push_back(true); @@ -282,30 +282,30 @@ inline void extract_vars( template std::enable_if_t, T> to_output_type( - std::vector>& output_list) { + std::vector>& output_list) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) variable_list result; std::transform( output_list.begin(), output_list.end(), std::back_inserter(result), - [](const c10::optional& var) { return *var; }); + [](const std::optional& var) { return *var; }); return result; } template std::enable_if_t, T> to_output_type( - std::vector>& output_list) { + std::vector>& output_list) { return *output_list[0]; } -inline std::vector> to_optional(Variable& output) { - return std::vector>{output}; +inline std::vector> to_optional(Variable& output) { + return std::vector>{output}; } -inline std::vector> to_optional(variable_list& output) { +inline std::vector> to_optional(variable_list& output) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - std::vector> result; + std::vector> result; std::transform( output.begin(), output.end(), diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h index becc73396e66d..c8c3538a061f1 100644 --- a/torch/csrc/autograd/function.h +++ b/torch/csrc/autograd/function.h @@ -239,7 +239,7 @@ struct TORCH_API Node : std::enable_shared_from_this { * elements are on different devices (across multiple GPUs, for example) * they may have different streams. 
*/ - c10::optional stream() { + std::optional stream() { auto opt_device_type = at::getAccelerator(); if (!opt_device_type.has_value()) { return c10::nullopt; @@ -703,7 +703,7 @@ struct MakeNextFunctionList : IterArgs { void operator()(const Variable* variable) { operator()(*variable); } - void operator()(const c10::optional& variable) { + void operator()(const std::optional& variable) { if (variable.has_value()) { operator()(*variable); } else { diff --git a/torch/csrc/autograd/functions/comm.cpp b/torch/csrc/autograd/functions/comm.cpp index 9bcd511285734..e2f23f363d7a0 100644 --- a/torch/csrc/autograd/functions/comm.cpp +++ b/torch/csrc/autograd/functions/comm.cpp @@ -17,9 +17,9 @@ namespace torch { namespace autograd { Scatter::Scatter( std::vector devices, - c10::optional> chunk_sizes, + std::optional> chunk_sizes, int64_t dim, - c10::optional>> streams, + std::optional>> streams, bool unsqueeze_scalars) : devices_(std::move(devices)), chunk_sizes_(std::move(chunk_sizes)), diff --git a/torch/csrc/autograd/functions/comm.h b/torch/csrc/autograd/functions/comm.h index 9b1f0daf50bce..b0e6900729955 100644 --- a/torch/csrc/autograd/functions/comm.h +++ b/torch/csrc/autograd/functions/comm.h @@ -17,9 +17,9 @@ namespace autograd { struct TORCH_CUDA_CU_API Scatter : public Node { explicit Scatter( std::vector devices, - c10::optional> chunk_sizes = c10::nullopt, + std::optional> chunk_sizes = c10::nullopt, int64_t dim = 0, - c10::optional>> streams = + std::optional>> streams = c10::nullopt, bool unsqueeze_scalars = false); ~Scatter() override; @@ -27,9 +27,9 @@ struct TORCH_CUDA_CU_API Scatter : public Node { variable_list apply(variable_list&& inputs) override; std::vector devices_; - c10::optional> chunk_sizes_; + std::optional> chunk_sizes_; int64_t dim_; - c10::optional>> streams_; + std::optional>> streams_; bool unsqueeze_scalars_; }; diff --git a/torch/csrc/autograd/functions/utils.h b/torch/csrc/autograd/functions/utils.h index 3cc2575da8f5d..db916dc0bbbfa 100644 --- a/torch/csrc/autograd/functions/utils.h +++ b/torch/csrc/autograd/functions/utils.h @@ -46,7 +46,7 @@ struct ComputeRequiresGrad : IterArgs { out = true; } } - void operator()(const c10::optional& tensor) { + void operator()(const std::optional& tensor) { if (tensor.has_value()) { (*this)(*tensor); } @@ -88,7 +88,7 @@ inline void set_history( } } -inline bool isFwGradDefined(const c10::optional& t) { +inline bool isFwGradDefined(const std::optional& t) { return t.has_value() && t->defined() && t->_fw_grad(/*level */ 0).defined(); } @@ -101,7 +101,7 @@ inline bool isFwGradDefinedTensorList(const at::ITensorListRef& variables) { } inline bool isFwGradDefinedTensorList( - const c10::List>& li) { + const c10::List>& li) { bool ret = false; for (auto i : c10::irange(li.size())) { auto t = li.get(i); diff --git a/torch/csrc/autograd/graph_task.h b/torch/csrc/autograd/graph_task.h index 03a9647cad833..e4a7ae4dad18e 100644 --- a/torch/csrc/autograd/graph_task.h +++ b/torch/csrc/autograd/graph_task.h @@ -125,7 +125,7 @@ struct GraphTask : std::enable_shared_from_this { // Per-device current streams of the execute() that called this GraphTask. // These will be synced with leaf_streams in exec_post_processing. - std::vector> caller_current_streams_; + std::vector> caller_current_streams_; // Collects caller_current_streams_ for the accelerator device. 
void stash_current_streams(); diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index e04d853198fbb..9eb1031ff02c0 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -1081,7 +1081,7 @@ static PyObject* push_on_torch_dispatch_stack( using c10::impl::TorchDispatchModeKey; // When we push a mode onto the mode stack, we need to // check if it's an "infra" mode, by checking its _mode_key attribute. - c10::optional mode_key = c10::nullopt; + std::optional mode_key = c10::nullopt; py::object maybe_mode_key_obj = PyObject_FastGetAttrString(arg, "_mode_key"); if (maybe_mode_key_obj) { @@ -1105,7 +1105,7 @@ static PyObject* pop_torch_dispatch_stack( PyObject* _unused, PyObject* maybe_mode_key) { HANDLE_TH_ERRORS - c10::optional mode_key = c10::nullopt; + std::optional mode_key = c10::nullopt; PyObject* r = nullptr; if (maybe_mode_key != Py_None) { mode_key = py::cast(maybe_mode_key); diff --git a/torch/csrc/autograd/input_buffer.cpp b/torch/csrc/autograd/input_buffer.cpp index 2adfc1fc7efae..6c12bbadc5d2d 100644 --- a/torch/csrc/autograd/input_buffer.cpp +++ b/torch/csrc/autograd/input_buffer.cpp @@ -129,8 +129,8 @@ static void accumulate( void InputBuffer::add( size_t pos, Variable&& var, - const c10::optional& opt_producer_stream, - const c10::optional& opt_consumer_stream) { + const std::optional& opt_producer_stream, + const std::optional& opt_consumer_stream) { TORCH_INTERNAL_ASSERT(pos < buffer.size()); if (!var.defined()) { return; @@ -159,7 +159,7 @@ void InputBuffer::add( // Accumulation happens on the var device's default stream. TORCH_INTERNAL_ASSERT(device_of(var)); - c10::optional opt_accumulate_stream = c10::nullopt; + std::optional opt_accumulate_stream = c10::nullopt; const auto device_type = device_of(var).value().type(); // NOLINTNEXTLINE(bugprone-unchecked-optional-access) if (device_of(var)->is_cuda() || device_of(var)->is_privateuseone()) { @@ -179,7 +179,7 @@ void InputBuffer::add( record_stream_any_impl(var, *opt_accumulate_stream); } } else { - c10::optional opt_sync_stream = c10::nullopt; + std::optional opt_sync_stream = c10::nullopt; const auto guard = c10::impl::VirtualGuardImpl{device_type}; if (on_consumer && !on_producer) { // (3a) diff --git a/torch/csrc/autograd/input_buffer.h b/torch/csrc/autograd/input_buffer.h index d8ef3396cb6d8..7e471ef528bb0 100644 --- a/torch/csrc/autograd/input_buffer.h +++ b/torch/csrc/autograd/input_buffer.h @@ -27,8 +27,8 @@ struct InputBuffer { TORCH_API void add( size_t pos, Variable&& var, - const c10::optional& opt_producer_stream, - const c10::optional& opt_consumer_stream); + const std::optional& opt_producer_stream, + const std::optional& opt_consumer_stream); at::Device device() const; diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp index 0c73c8b7a72a1..64b85dd72f592 100644 --- a/torch/csrc/autograd/profiler_kineto.cpp +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -80,16 +81,18 @@ struct OpArgData { std::vector dtypes; std::vector concrete_inputs; std::vector> shapes_for_kineto_event; + std::vector strides; }; auto parseArgData( const std::vector& input_shapes, const std::vector& concrete_inputs) { if (input_shapes.empty()) { - return OpArgData{false, {}, {}, {}, {}}; + return OpArgData{false, {}, {}, {}, {}, {}}; } std::vector shapes(input_shapes.size()); + std::vector strides(input_shapes.size()); std::vector> shapes_for_kineto_event( input_shapes.size()); 
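The profiler hunks here extend OpArgData with a per-input strides record alongside the existing shapes, so "Input Dims" gains a companion "Input Strides" metadata entry. A small sketch, independent of the profiler internals, of the sizes/strides pair being captured per tensor input:

#include <torch/torch.h>
#include <iostream>

int main() {
  auto a = torch::randn({4, 8});
  auto b = a.t();  // transposed view: same storage, permuted strides

  // These are the two per-tensor facts now recorded side by side.
  std::cout << "a: sizes " << a.sizes() << ", strides " << a.strides() << "\n";
  std::cout << "b: sizes " << b.sizes() << ", strides " << b.strides() << "\n";
  return 0;
}
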
@@ -103,14 +106,19 @@ auto parseArgData( shapes[i] = t.sizes_; shapes_for_kineto_event[i] = t.sizes_; dtypes[i] = std::string(scalarTypeToTypeMeta(t.dtype_).name()); + strides[i] = t.strides_; }, [&](const std::vector& l) { std::vector> shape; shape.reserve(l.size()); + std::vector> stride; + stride.reserve(l.size()); for (const auto& t : l) { shape.emplace_back(t.sizes_); + stride.emplace_back(t.strides_); } shapes[i] = shape; + strides[i] = stride; dtypes[i] = "TensorList"; }, [&](const c10::IValue& val) { dtypes[i] = "Scalar"; }, @@ -141,7 +149,12 @@ auto parseArgData( } return OpArgData{ - true, shapes, dtypes, concrete_inputs_list, shapes_for_kineto_event}; + true, + shapes, + dtypes, + concrete_inputs_list, + shapes_for_kineto_event, + strides}; } struct MetadataBase { @@ -194,7 +207,7 @@ struct AddTensorboardFields : public MetadataBase { result->visit_if_base([&, this](const auto& i) -> void { this->addMetadata("Python id", std::to_string(i.id_)); - c10::optional parent_id; + std::optional parent_id; std::shared_ptr parent = result->parent_.lock(); while (parent && !parent_id.has_value()) { parent->visit_if_base( @@ -236,6 +249,7 @@ struct AddGenericMetadata : public MetadataBase { if (arg_data.has_data) { if (get_record_concrete_inputs_enabled()) { addMetadata("Input Dims", variantShapesToStr(arg_data.shapes)); + addMetadata("Input Strides", variantShapesToStr(arg_data.strides)); } else { addMetadata( "Input Dims", shapesToStr(arg_data.shapes_for_kineto_event)); @@ -625,6 +639,9 @@ void enableProfiler( } else if (config.state == ProfilerState::ITT) { torch::profiler::impl::pushITTCallbacks(config, scopes); return; + } else if (config.state == ProfilerState::PRIVATEUSE1) { + torch::profiler::impl::pushPRIVATEUSE1CallbacksStub(config, scopes); + return; } TORCH_CHECK( @@ -660,7 +677,8 @@ std::unique_ptr disableProfiler() { config.state == ProfilerState::KINETO_PRIVATEUSE1_FALLBACK || config.state == ProfilerState::KINETO_ONDEMAND || config.state == ProfilerState::NVTX || - config.state == ProfilerState::ITT), + config.state == ProfilerState::ITT || + config.state == ProfilerState::PRIVATEUSE1), "Can't disable Kineto profiler when it's not running"); state_ptr->removeCallback(); @@ -672,9 +690,11 @@ std::unique_ptr disableProfiler() { return std::make_unique(); } - // Shared among NVTX, KINETO, KINETO_GPU_FALLBACK, KINETO_PRIVATEUSE1_FALLBACK + // Shared among NVTX, PRIVATEUSE1, KINETO, KINETO_GPU_FALLBACK, + // KINETO_PRIVATEUSE1_FALLBACK std::unique_ptr result; - if (state_ptr->config().state == ProfilerState::NVTX) { + if (state_ptr->config().state == ProfilerState::NVTX || + state_ptr->config().state == ProfilerState::PRIVATEUSE1) { result = std::make_unique(); } diff --git a/torch/csrc/autograd/profiler_legacy.cpp b/torch/csrc/autograd/profiler_legacy.cpp index 04c676fc2b497..b9387479667e8 100644 --- a/torch/csrc/autograd/profiler_legacy.cpp +++ b/torch/csrc/autograd/profiler_legacy.cpp @@ -169,7 +169,7 @@ struct ProfilerLegacyThreadLocalState : public ProfilerStateBase { std::unordered_map> event_lists_map_; - c10::optional>> remoteProfiledEvents_; + std::optional>> remoteProfiledEvents_; }; thread_event_lists ProfilerLegacyThreadLocalState::consolidate() { @@ -429,7 +429,7 @@ void enableProfilerLegacy( } thread_event_lists disableProfilerLegacy( - c10::optional profilerDisableOptions) { + std::optional profilerDisableOptions) { auto cleanupTLSState = profilerDisableOptions ? 
profilerDisableOptions->cleanupTLSState : true; auto consolidate = diff --git a/torch/csrc/autograd/profiler_legacy.h b/torch/csrc/autograd/profiler_legacy.h index e74ddd8a2296e..9bd88b0b3dc51 100644 --- a/torch/csrc/autograd/profiler_legacy.h +++ b/torch/csrc/autograd/profiler_legacy.h @@ -335,7 +335,7 @@ TORCH_API void enableProfilerLegacy( const torch::profiler::impl::ProfilerConfig&); using thread_event_lists = std::vector>; TORCH_API thread_event_lists disableProfilerLegacy( - c10::optional profilerDisableOptions = + std::optional profilerDisableOptions = c10::nullopt); // adds profiledEvents to the current thread local recorded events. Each event @@ -376,9 +376,9 @@ struct TORCH_API RecordProfile { struct TORCH_API TLSLegacyProfilerGuard { explicit TLSLegacyProfilerGuard( const torch::profiler::impl::ProfilerConfig& cfg, - c10::optional> + std::optional> resultCallback = c10::nullopt, - c10::optional profilerDisableOptions = + std::optional profilerDisableOptions = c10::nullopt) : cb_(std::move(resultCallback)), profilerDisableOptions_(profilerDisableOptions) { @@ -397,9 +397,9 @@ struct TORCH_API TLSLegacyProfilerGuard { } private: - c10::optional> cb_; + std::optional> cb_; // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) - const c10::optional profilerDisableOptions_; + const std::optional profilerDisableOptions_; }; } // namespace profiler diff --git a/torch/csrc/autograd/profiler_python.cpp b/torch/csrc/autograd/profiler_python.cpp index da1cedfdb5a97..799188be9a686 100644 --- a/torch/csrc/autograd/profiler_python.cpp +++ b/torch/csrc/autograd/profiler_python.cpp @@ -220,7 +220,7 @@ struct ExtendedPyCallConfig { struct Cache { // `nn.Module.forward` or `optim.Optimizer._optimizer_step_code` - c10::optional location_; + std::optional location_; ska::flat_hash_map cls_and_parameters_; ska::flat_hash_map cls_names_; }; @@ -300,7 +300,7 @@ class ValueCache { load(callsite.value_)}; } - c10::optional recordIfTensor(py::handle p); + std::optional recordIfTensor(py::handle p); std::vector> unpackTensorMap( const py::dict& tensor_map); void trimPrefixes(); @@ -348,9 +348,9 @@ TensorMetadata toTensorMetadata(PyObject* self) { m.layout_ == at::kStrided ? t.strides().vec() : std::vector()}; } -c10::optional ValueCache::recordIfTensor(py::handle p) { +std::optional ValueCache::recordIfTensor(py::handle p) { return THPVariable_CheckExact(p.ptr()) - ? c10::optional{toTensorMetadata(p.ptr())} + ? 
std::optional{toTensorMetadata(p.ptr())} : c10::nullopt; } diff --git a/torch/csrc/autograd/python_function.cpp b/torch/csrc/autograd/python_function.cpp index 341d2886699a1..33300b001819b 100644 --- a/torch/csrc/autograd/python_function.cpp +++ b/torch/csrc/autograd/python_function.cpp @@ -619,7 +619,7 @@ static void _wrap_outputs( auto non_differentiable = _parse_non_differentiable(self); auto dirty_inputs = _mark_dirty(self); - std::vector> raw_output_vars; + std::vector> raw_output_vars; raw_output_vars.reserve(num_outputs); for (const auto i : c10::irange(num_outputs)) { PyObject* obj = PyTuple_GET_ITEM(raw_output, i); @@ -746,7 +746,7 @@ static void _wrap_outputs( static void _get_tensors_to_save( THPFunction* self, std::unordered_set& to_save_if_setup_context, - std::vector>& tensors_to_save, + std::vector>& tensors_to_save, bool overridden_setup_context, bool is_executable) { if (self->saved_for_forward && overridden_setup_context) { @@ -804,7 +804,7 @@ static void _get_tensors_to_save( } // Save any variables that requested by to_save static void _save_variables( - const std::vector>& tensors_to_save, + const std::vector>& tensors_to_save, const std::shared_ptr& cdata_ptr, THPFunction* self) { if (!self->to_save) @@ -1106,7 +1106,7 @@ PyObject* process_outputs( } std::unordered_set to_save_if_setup_context{}; - std::vector> tensors_to_save{}; + std::vector> tensors_to_save{}; _get_tensors_to_save( grad_fn, to_save_if_setup_context, diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp index 5161cbc53a8c4..078b0f92124cb 100644 --- a/torch/csrc/autograd/python_variable.cpp +++ b/torch/csrc/autograd/python_variable.cpp @@ -267,7 +267,7 @@ PyObject* THPVariable_Wrap(at::TensorBase var) { c10::impl::PyInterpreterStatus::DEFINITELY_UNINITIALIZED); } - c10::optional mb_obj = + std::optional mb_obj = var.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj( getPyInterpreter(), /*ignore_hermetic_tls=*/false); c10::impl::PyInterpreterStatus status{}; @@ -587,14 +587,14 @@ static PyObject* view_func_impl( auto& view_func = view_info.view_fn(); // Determine new SymInt / tensor state as needed. 
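// --- editor's sketch (illustration only, not part of the patch) -------------
// Standalone illustration of the ProfilerState::PRIVATEUSE1 support added in
// the profiler_kineto.cpp hunks further up: enabling dispatches to a
// backend-registered callback stub, and disabling groups PRIVATEUSE1 with NVTX
// when deciding to hand back an empty result instead of a Kineto trace. The
// names below are simplified stand-ins for the real ProfilerConfig /
// pushPRIVATEUSE1CallbacksStub machinery.
enum class ProfilerStateSketch { KINETO, NVTX, ITT, PRIVATEUSE1 };

void pushPrivateUse1Callbacks() { /* backend-specific hook would run here */ }
void pushKinetoCallbacks() { /* default Kineto path */ }

void enableProfilerSketch(ProfilerStateSketch state) {
  if (state == ProfilerStateSketch::PRIVATEUSE1) {
    pushPrivateUse1Callbacks();  // new branch mirrored from the diff
    return;
  }
  pushKinetoCallbacks();
}

bool returnsEmptyResultOnDisable(ProfilerStateSketch state) {
  // PRIVATEUSE1 joins NVTX in the "no Kineto trace to collect" group.
  return state == ProfilerStateSketch::NVTX ||
      state == ProfilerStateSketch::PRIVATEUSE1;
}
// -----------------------------------------------------------------------------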
- c10::optional> new_symints = c10::nullopt; + std::optional> new_symints = c10::nullopt; if (symint_visitor_fn != Py_None) { new_symints = map_py_func( py::cast(symint_visitor_fn), view_func.get_symints()); } - c10::optional> new_tensors = c10::nullopt; + std::optional> new_tensors = c10::nullopt; if (tensor_visitor_fn != Py_None) { new_tensors = map_py_func( py::cast(tensor_visitor_fn), @@ -815,7 +815,7 @@ static PyObject* THPVariable_make_wrapper_subclass( auto sym_sizes = r.symintlist(1); auto sym_strides_own = r.symintlistOptional(2); auto sym_strides = - static_cast>(sym_strides_own); + static_cast>(sym_strides_own); auto sym_storage_offset = r.toSymIntOptional(3); c10::SymInt size_bytes; @@ -1931,7 +1931,7 @@ void THPVariable_subclass_dealloc(PyObject* self) { if (type->tp_del) { PyObject_GC_Track(self); type->tp_del(self); - if (self->ob_refcnt > 0) { + if (Py_REFCNT(self) > 0) { /* Resurrected */ return; } diff --git a/torch/csrc/autograd/python_variable_indexing.cpp b/torch/csrc/autograd/python_variable_indexing.cpp index e3cdd04f0965a..fdcafd6cd7091 100644 --- a/torch/csrc/autograd/python_variable_indexing.cpp +++ b/torch/csrc/autograd/python_variable_indexing.cpp @@ -178,7 +178,7 @@ static inline Variable applySlicing( variable_list& outIndices, bool is_tracing, const at::Device& self_device, - const c10::optional& self_ndim, + const std::optional& self_ndim, int64_t specified_dims) { int64_t size = PyTuple_GET_SIZE(index); // NOLINT(cppcoreguidelines-pro-type-cstyle-cast) @@ -200,9 +200,9 @@ static inline Variable applySlicing( // nested tensor does not have a size (yet) so for now we represent its size // as null may need to be changed after we reach a better solution for // nested tensor size - c10::optional result_sizes = result.is_nested() - ? c10::optional(c10::nullopt) - : c10::optional(result.sym_sizes()); + std::optional result_sizes = result.is_nested() + ? std::optional(c10::nullopt) + : std::optional(result.sym_sizes()); result = at::indexing::handleDimInMultiDimIndexing( /*prev_dim_result=*/result, /*original_tensor=*/self, diff --git a/torch/csrc/autograd/record_function_ops.cpp b/torch/csrc/autograd/record_function_ops.cpp index e5153ae4028aa..e3a3299dc9c59 100644 --- a/torch/csrc/autograd/record_function_ops.cpp +++ b/torch/csrc/autograd/record_function_ops.cpp @@ -20,7 +20,7 @@ namespace profiler { // callbacks. 
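// --- editor's sketch (illustration only, not part of the patch) -------------
// The python_variable.cpp hunk above swaps a direct read of self->ob_refcnt
// for the Py_REFCNT() accessor. Reading the struct field directly is
// increasingly tied to CPython internals, while Py_REFCNT() is the documented
// accessor. Minimal illustration assuming <Python.h> is available;
// resurrect_check is a hypothetical helper, not a CPython or PyTorch function.
#include <Python.h>

static bool resurrect_check(PyObject* self) {
  // After running a finalizer, the object may have been resurrected:
  // it is still alive if something re-acquired a reference to it.
  return Py_REFCNT(self) > 0;  // portable replacement for self->ob_refcnt > 0
}
// -----------------------------------------------------------------------------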
static void record_function_enter( const std::string& name, - const c10::optional& args, + const std::optional& args, at::RecordFunction& rec) { if (rec.isActive()) { if (rec.needsInputs() && args.has_value()) { @@ -35,7 +35,7 @@ static void record_function_enter( // Legacy signature using cpp_custom_type_hack static at::Tensor record_function_enter_legacy( const std::string& name, - const c10::optional& args) { + const std::optional& args) { auto rec = std::make_unique(at::RecordScope::USER_SCOPE); record_function_enter(name, args, *rec); return at::cpp_custom_type_hack::create(std::move(rec), at::TensorOptions()); @@ -44,7 +44,7 @@ static at::Tensor record_function_enter_legacy( // New signature using custom_class c10::intrusive_ptr record_function_enter_new( const std::string& name, - const c10::optional& args) { + const std::optional& args) { auto rec = c10::make_intrusive(at::RecordScope::USER_SCOPE); record_function_enter(name, args, rec->record); diff --git a/torch/csrc/autograd/record_function_ops.h b/torch/csrc/autograd/record_function_ops.h index d37aba7dfff85..a145523c1bf8a 100644 --- a/torch/csrc/autograd/record_function_ops.h +++ b/torch/csrc/autograd/record_function_ops.h @@ -17,7 +17,7 @@ struct PythonRecordFunction : public torch::CustomClassHolder { // callbacks. TORCH_API c10::intrusive_ptr record_function_enter_new( const std::string& name, - const c10::optional& args = c10::nullopt); + const std::optional& args = c10::nullopt); // Schedules RecordFunction's end callbacks to be run on completion of a future. TORCH_API c10::intrusive_ptr _call_end_callbacks_on_fut_new( diff --git a/torch/csrc/autograd/saved_variable.cpp b/torch/csrc/autograd/saved_variable.cpp index 4bd44339c3b45..c4d4566434325 100644 --- a/torch/csrc/autograd/saved_variable.cpp +++ b/torch/csrc/autograd/saved_variable.cpp @@ -117,7 +117,7 @@ void SavedVariable::reset_data() { } SavedVariable::SavedVariable( - const c10::optional& variable, + const std::optional& variable, bool is_output, bool is_inplace_on_view) : SavedVariable( diff --git a/torch/csrc/autograd/saved_variable.h b/torch/csrc/autograd/saved_variable.h index c9a358ede89e6..e249209f9f63b 100644 --- a/torch/csrc/autograd/saved_variable.h +++ b/torch/csrc/autograd/saved_variable.h @@ -26,7 +26,7 @@ class TORCH_API SavedVariable { bool is_output, bool is_inplace_on_view = false); SavedVariable( - const c10::optional& variable, + const std::optional& variable, bool is_output, bool is_inplace_on_view = false); SavedVariable(SavedVariable&&) = default; diff --git a/torch/csrc/autograd/utils/python_arg_parsing.h b/torch/csrc/autograd/utils/python_arg_parsing.h index 7701e97fe9189..326221e44d147 100644 --- a/torch/csrc/autograd/utils/python_arg_parsing.h +++ b/torch/csrc/autograd/utils/python_arg_parsing.h @@ -12,11 +12,11 @@ namespace utils { // The parameter allow_copy is to accept copy for Tensor.to (and by proxy // PackedSequences.to) but not nn.Module.to. 
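// --- editor's sketch (illustration only, not part of the patch) -------------
// Standalone illustration of the record_function_ops.cpp signatures above: the
// user-supplied "args" string is optional, and it is only forwarded when the
// record is active, wants inputs, and the caller actually provided a value.
// RecordSketch stands in for at::RecordFunction.
#include <optional>
#include <string>
#include <vector>

struct RecordSketch {
  bool active = true;
  bool needs_inputs = false;
  std::vector<std::string> inputs;
};

void record_function_enter_sketch(
    const std::string& name,
    const std::optional<std::string>& args,
    RecordSketch& rec) {
  (void)name;
  if (rec.active) {
    if (rec.needs_inputs && args.has_value()) {
      rec.inputs.push_back(*args);  // only dereference when a value exists
    }
  }
}
// -----------------------------------------------------------------------------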
inline std::tuple< - c10::optional, - c10::optional, + std::optional, + std::optional, bool, bool, - c10::optional> + std::optional> parse_to_conversion(PythonArgs& r, bool allow_copy) { if (r.idx == 0) { if (!allow_copy && !r.isNone(3)) diff --git a/torch/csrc/autograd/variable.cpp b/torch/csrc/autograd/variable.cpp index 07e37463cbd38..da987001e2ecc 100644 --- a/torch/csrc/autograd/variable.cpp +++ b/torch/csrc/autograd/variable.cpp @@ -42,8 +42,8 @@ static std::unique_ptr create_view_func_matching(const Variable& t) { DifferentiableViewMeta::DifferentiableViewMeta( at::TensorImpl* self_impl, - c10::optional backward_info, - c10::optional forward_info, + std::optional backward_info, + std::optional forward_info, bool shared_view_info, CreationMeta creation_meta) : AutogradMeta(self_impl), @@ -581,10 +581,10 @@ bool VariableHooks::retains_grad(const at::TensorBase& self) const { void VariableHooks::_backward( const Tensor& self, at::TensorList inputs, - const c10::optional& gradient, - c10::optional keep_graph, + const std::optional& gradient, + std::optional keep_graph, bool create_graph) const { - // TODO torch::autograd::backward should take the c10::optional + // TODO torch::autograd::backward should take the std::optional // gradient directly instead of us having to unwrap it to Tensor _gradient // here. Tensor _gradient = gradient.has_value() ? *gradient : Tensor(); diff --git a/torch/csrc/autograd/variable.h b/torch/csrc/autograd/variable.h index aa9ee76f3dc95..d60f37085f380 100644 --- a/torch/csrc/autograd/variable.h +++ b/torch/csrc/autograd/variable.h @@ -682,8 +682,8 @@ TORCH_API void handle_view_on_rebase( struct TORCH_API DifferentiableViewMeta : public AutogradMeta { private: /// Information about the views - c10::optional backward_info_; - c10::optional forward_info_; + std::optional backward_info_; + std::optional forward_info_; // Optimization to reduce the number of ViewInfo we create. // In the (very common) case where backward_info_ == forward_info_, we only @@ -766,8 +766,8 @@ struct TORCH_API DifferentiableViewMeta : public AutogradMeta { DifferentiableViewMeta( at::TensorImpl* self_impl, - c10::optional backward_info, - c10::optional forward_info, + std::optional backward_info, + std::optional forward_info, bool shared_view_info, CreationMeta creation_meta = CreationMeta::DEFAULT); }; @@ -796,8 +796,8 @@ struct TORCH_API DifferentiableViewMeta : public AutogradMeta { // Differentiable view. Track history with DifferentiableViewMeta. 
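// --- editor's sketch (illustration only, not part of the patch) -------------
// The python_arg_parsing.h hunk above returns a tuple whose optional element
// types were lost in the flattened diff; for Tensor.to() parsing they would be
// a device, a dtype and a memory format (inferred, so treat the exact types as
// an assumption). Standalone sketch of the "tuple of optionals" return shape.
#include <optional>
#include <string>
#include <tuple>

struct DeviceSketch { std::string str; };
enum class DTypeSketch { Float, Half };
enum class MemoryFormatSketch { Contiguous, ChannelsLast };

std::tuple<
    std::optional<DeviceSketch>,
    std::optional<DTypeSketch>,
    bool,                               // non_blocking
    bool,                               // copy
    std::optional<MemoryFormatSketch>>
parse_to_conversion_sketch(bool has_device, bool has_dtype) {
  std::optional<DeviceSketch> device;
  std::optional<DTypeSketch> dtype;
  std::optional<MemoryFormatSketch> memory_format;
  if (has_device) device = DeviceSketch{"cuda:0"};
  if (has_dtype) dtype = DTypeSketch::Half;
  return {device, dtype, /*non_blocking=*/false, /*copy=*/false, memory_format};
}
// -----------------------------------------------------------------------------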
inline Variable make_variable_differentiable_view( const at::Tensor& data, - c10::optional backward_info, - c10::optional forward_info, + std::optional backward_info, + std::optional forward_info, bool shared_view_info, CreationMeta creation_meta, bool allow_tensor_metadata_change = true) { @@ -927,8 +927,8 @@ struct VariableHooks final : at::impl::VariableHooksInterface { void _backward( const at::Tensor& self, at::TensorList inputs, - const c10::optional& gradient, - c10::optional keep_graph, + const std::optional& gradient, + std::optional keep_graph, bool create_graph) const override; void requires_grad_(const at::TensorBase& self, bool _requires_grad) const override; diff --git a/torch/csrc/cuda/Graph.cpp b/torch/csrc/cuda/Graph.cpp index 83c60d059f8dd..472151fec6097 100644 --- a/torch/csrc/cuda/Graph.cpp +++ b/torch/csrc/cuda/Graph.cpp @@ -30,7 +30,7 @@ void THCPGraph_init(PyObject* module) { .def( "capture_begin", [](::at::cuda::CUDAGraph& self, - c10::optional pool_opt, + std::optional pool_opt, std::string capture_error_mode) { cudaStreamCaptureMode capture_mode; c10::cuda::MempoolId_t pool = pool_opt.has_value() diff --git a/torch/csrc/cuda/Module.cpp b/torch/csrc/cuda/Module.cpp index e622c254a5003..030c5a2b5ccf6 100644 --- a/torch/csrc/cuda/Module.cpp +++ b/torch/csrc/cuda/Module.cpp @@ -956,8 +956,8 @@ static void registerCudaDeviceProperties(PyObject* module) { m.def( "_cuda_record_memory_history", static_cast, - c10::optional, + std::optional, + std::optional, const std::string&, size_t)>(torch::cuda::_record_memory_history)); diff --git a/torch/csrc/cuda/comm.cpp b/torch/csrc/cuda/comm.cpp index c8bbec87caefb..c7c3cb396304c 100644 --- a/torch/csrc/cuda/comm.cpp +++ b/torch/csrc/cuda/comm.cpp @@ -37,7 +37,7 @@ struct unique_type_checker { unique = type_id_.value() == type_id; } - c10::optional type_id_; + std::optional type_id_; bool unique = true; }; @@ -232,7 +232,7 @@ std::vector& scatter_out( const at::Tensor& tensor, std::vector& out_tensors, int64_t dim, - const c10::optional>>& + const std::optional>>& streams) { TORCH_CHECK( !out_tensors.empty(), @@ -313,9 +313,9 @@ std::vector& scatter_out( std::vector scatter( const at::Tensor& tensor, at::IntArrayRef devices, - const c10::optional>& chunk_sizes, + const std::optional>& chunk_sizes, int64_t dim, - const c10::optional>>& + const std::optional>>& streams) { TORCH_CHECK(!devices.empty(), "Expected at least one device to scatter to"); if (chunk_sizes.has_value()) { @@ -446,7 +446,7 @@ at::Tensor& gather_out( at::Tensor gather( at::TensorList tensors, int64_t dim, - c10::optional destination_index) { + std::optional destination_index) { TORCH_CHECK(!tensors.empty(), "Expected at least one tensor to gather from"); int64_t total_size = 0; auto& first = tensors.front(); diff --git a/torch/csrc/cuda/comm.h b/torch/csrc/cuda/comm.h index cf89b365d0ce4..4bc0f60195a26 100644 --- a/torch/csrc/cuda/comm.h +++ b/torch/csrc/cuda/comm.h @@ -28,15 +28,15 @@ TORCH_CUDA_CU_API std::vector& scatter_out( const at::Tensor& tensor, std::vector& out_tensors, int64_t dim = 0, - const c10::optional>>& + const std::optional>>& streams = c10::nullopt); TORCH_CUDA_CU_API std::vector scatter( const at::Tensor& tensor, at::IntArrayRef devices, - const c10::optional>& chunk_sizes = c10::nullopt, + const std::optional>& chunk_sizes = c10::nullopt, int64_t dim = 0, - const c10::optional>>& + const std::optional>>& streams = c10::nullopt); TORCH_CUDA_CU_API at::Tensor& gather_out( @@ -47,6 +47,6 @@ TORCH_CUDA_CU_API at::Tensor& gather_out( 
TORCH_CUDA_CU_API at::Tensor gather( at::TensorList tensors, int64_t dim, - c10::optional destination_index); + std::optional destination_index); } // namespace torch::cuda diff --git a/torch/csrc/cuda/memory_snapshot.cpp b/torch/csrc/cuda/memory_snapshot.cpp index 49fefd97e2da1..82696abaee227 100644 --- a/torch/csrc/cuda/memory_snapshot.cpp +++ b/torch/csrc/cuda/memory_snapshot.cpp @@ -130,8 +130,8 @@ static void checkOptionIn( } void _record_memory_history( - c10::optional enabled, - c10::optional context, + std::optional enabled, + std::optional context, const std::string& stacks, size_t max_entries) { if (enabled) { diff --git a/torch/csrc/cuda/memory_snapshot.h b/torch/csrc/cuda/memory_snapshot.h index f5f9bdbed1620..eb22767a78f90 100644 --- a/torch/csrc/cuda/memory_snapshot.h +++ b/torch/csrc/cuda/memory_snapshot.h @@ -17,8 +17,8 @@ TORCH_CUDA_CU_API void _record_memory_history( bool record_cpp_context = false); TORCH_CUDA_CU_API void _record_memory_history( - c10::optional enabled = "all", - c10::optional context = "all", + std::optional enabled = "all", + std::optional context = "all", const std::string& stacks = "all", size_t max_entries = SIZE_MAX); diff --git a/torch/csrc/cuda/nccl.h b/torch/csrc/cuda/nccl.h index ebf51b7633abb..b118bd4600a56 100644 --- a/torch/csrc/cuda/nccl.h +++ b/torch/csrc/cuda/nccl.h @@ -111,7 +111,7 @@ TORCH_CUDA_CPP_API void check_inputs( } // namespace detail using comm_list = std::vector; -using stream_list = std::vector>; +using stream_list = std::vector>; TORCH_CUDA_CPP_API std::uint64_t version(); TORCH_CUDA_CPP_API const char* version_suffix(); diff --git a/torch/csrc/cuda/python_comm.cpp b/torch/csrc/cuda/python_comm.cpp index e65bb15103aab..ec9da9ac2d679 100644 --- a/torch/csrc/cuda/python_comm.cpp +++ b/torch/csrc/cuda/python_comm.cpp @@ -46,10 +46,10 @@ void initCommMethods(PyObject* module) { "_scatter", [](at::Tensor& tensor, std::vector& devices, - c10::optional> chunk_sizes, + std::optional> chunk_sizes, int64_t dim, - c10::optional py_streams) { - c10::optional>> + std::optional py_streams) { + std::optional>> streams; if (py_streams) { py::handle handle = *py_streams; @@ -69,8 +69,8 @@ void initCommMethods(PyObject* module) { [](at::Tensor& tensor, std::vector& out_tensors, int64_t dim, - c10::optional py_streams) { - c10::optional>> + std::optional py_streams) { + std::optional>> streams; if (py_streams) { py::handle handle = *py_streams; @@ -88,7 +88,7 @@ void initCommMethods(PyObject* module) { "_gather", [](std::vector& tensors, int64_t dim, - c10::optional destination_index) { + std::optional destination_index) { return gather(tensors, dim, destination_index); }, py::arg("tensors"), diff --git a/torch/csrc/cuda/python_nccl.cpp b/torch/csrc/cuda/python_nccl.cpp index db6f6c680701d..5060f9289a9e1 100644 --- a/torch/csrc/cuda/python_nccl.cpp +++ b/torch/csrc/cuda/python_nccl.cpp @@ -56,11 +56,11 @@ static void destroy_nccl_comm(PyObject* capsule) { END_HANDLE_TH_ERRORS_RET() } -static std::vector> unpack_streams( +static std::vector> unpack_streams( PyObject* obj, size_t size) { if (obj == Py_None) { - return std::vector>(size, c10::nullopt); + return std::vector>(size, c10::nullopt); } auto streams = THPUtils_PySequence_to_CUDAStreamList(obj); if (streams.size() != size) { @@ -147,7 +147,7 @@ PyObject* THCPModule_nccl_reduce(PyObject* self, PyObject* args) { std::vector inputs = extract_tensors(_inputs); auto output = extract_tensor(_output); - std::vector> streams = + std::vector> streams = unpack_streams(_streams, inputs.size()); 
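// --- editor's sketch (illustration only, not part of the patch) -------------
// The nccl.h / python_nccl.cpp hunks above keep the same convention after the
// rename: a stream list is a vector of optional streams, and an empty entry
// means "use the current stream of the device the input lives on". Standalone
// sketch with StreamSketch standing in for at::cuda::CUDAStream.
#include <cstddef>
#include <optional>
#include <vector>

struct StreamSketch { int device_index = 0; };

using stream_list_sketch = std::vector<std::optional<StreamSketch>>;

// Mirrors unpack_streams: a missing Python argument expands to "all defaults".
stream_list_sketch unpack_streams_sketch(
    const std::vector<StreamSketch>* maybe_streams,
    size_t size) {
  if (maybe_streams == nullptr) {
    return stream_list_sketch(size, std::nullopt);  // current stream per device
  }
  stream_list_sketch out;
  out.reserve(maybe_streams->size());
  for (const auto& s : *maybe_streams) {
    out.emplace_back(s);
  }
  return out;
}
// -----------------------------------------------------------------------------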
auto user_comms = unpack_comms(_comms, inputs.size()); diff --git a/torch/csrc/cuda/utils.cpp b/torch/csrc/cuda/utils.cpp index e62e176473f2a..e2ad6622e6ffb 100644 --- a/torch/csrc/cuda/utils.cpp +++ b/torch/csrc/cuda/utils.cpp @@ -6,7 +6,7 @@ #ifdef USE_CUDA // NB: It's a list of *optional* CUDAStream; when nullopt, that means to use // whatever the current stream of the device the input is associated with was. -std::vector> +std::vector> THPUtils_PySequence_to_CUDAStreamList(PyObject* obj) { if (!PySequence_Check(obj)) { throw std::runtime_error( @@ -18,7 +18,7 @@ THPUtils_PySequence_to_CUDAStreamList(PyObject* obj) { "expected PySequence, but got " + std::string(THPUtils_typename(obj))); } - std::vector> streams; + std::vector> streams; Py_ssize_t length = PySequence_Fast_GET_SIZE(seq.get()); for (Py_ssize_t i = 0; i < length; i++) { PyObject* stream = PySequence_Fast_GET_ITEM(seq.get(), i); diff --git a/torch/csrc/distributed/c10d/Backend.hpp b/torch/csrc/distributed/c10d/Backend.hpp index 05a39ddc905aa..501cf59d86bad 100644 --- a/torch/csrc/distributed/c10d/Backend.hpp +++ b/torch/csrc/distributed/c10d/Backend.hpp @@ -375,7 +375,7 @@ class TORCH_API Backend : public torch::CustomClassHolder { } // See similar functions in ProcessGroup.hpp for context. - c10::optional getBoundDeviceId() const { + std::optional getBoundDeviceId() const { return bound_device_id_; } @@ -386,7 +386,7 @@ class TORCH_API Backend : public torch::CustomClassHolder { // backends may perform } - void setBoundDeviceId(c10::optional device) { + void setBoundDeviceId(std::optional device) { if (device) { TORCH_CHECK(device->has_index(), "setBoundDeviceId must have an index"); } @@ -410,7 +410,7 @@ class TORCH_API Backend : public torch::CustomClassHolder { std::function)> onCompletionHook_; - c10::optional bound_device_id_; + std::optional bound_device_id_; }; } // namespace c10d diff --git a/torch/csrc/distributed/c10d/NCCLUtils.cpp b/torch/csrc/distributed/c10d/NCCLUtils.cpp index 9a0c77a8623c3..e26ab22f1a9f3 100644 --- a/torch/csrc/distributed/c10d/NCCLUtils.cpp +++ b/torch/csrc/distributed/c10d/NCCLUtils.cpp @@ -159,7 +159,7 @@ std::string ncclGetErrorWithVersion(ncclResult_t error) { // thrown in the NCCL codebase. std::string getNcclErrorDetailStr( ncclResult_t error, - c10::optional processGroupFailureReason /* = c10::nullopt */ + std::optional processGroupFailureReason /* = c10::nullopt */ ) { // Prioritize failure reason provided by PG NCCL first, as it can abort // communicators when it encounters collective timeouts, etc. diff --git a/torch/csrc/distributed/c10d/NCCLUtils.hpp b/torch/csrc/distributed/c10d/NCCLUtils.hpp index a4b96a2a40762..5690c0591a7af 100644 --- a/torch/csrc/distributed/c10d/NCCLUtils.hpp +++ b/torch/csrc/distributed/c10d/NCCLUtils.hpp @@ -182,7 +182,7 @@ int nccl_nonblocking_timeout(); // thrown in the NCCL codebase. std::string getNcclErrorDetailStr( ncclResult_t error, - c10::optional processGroupFailureReason = c10::nullopt); + std::optional processGroupFailureReason = c10::nullopt); // Write NCCL debug info to local disk or any storage users define. 
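// --- editor's sketch (illustration only, not part of the patch) -------------
// The Backend.hpp hunk above keeps the "bound device" as an optional member:
// unset until init_process_group supplies a device, and validated to carry an
// explicit index when it is set. DeviceSketch stands in for at::Device and the
// thrown exception stands in for TORCH_CHECK.
#include <optional>
#include <stdexcept>

struct DeviceSketch {
  int index = -1;
  bool has_index() const { return index >= 0; }
};

class BackendSketch {
 public:
  std::optional<DeviceSketch> getBoundDeviceId() const {
    return bound_device_id_;
  }
  void setBoundDeviceId(std::optional<DeviceSketch> device) {
    if (device && !device->has_index()) {
      throw std::invalid_argument("setBoundDeviceId must have an index");
    }
    bound_device_id_ = device;
  }

 private:
  std::optional<DeviceSketch> bound_device_id_;  // empty until bound
};
// -----------------------------------------------------------------------------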
// There are some constrains we set for the debug info writer: @@ -339,13 +339,13 @@ class NCCLComm { ncclComm_t getNcclComm(); - c10::optional getNcclCommFailureReason() const { + std::optional getNcclCommFailureReason() const { std::unique_lock lock(mutex_); return commFailureReason_; } void ncclCommAbort( - c10::optional commFailureReason = c10::nullopt) { + std::optional commFailureReason = c10::nullopt) { std::unique_lock lock(mutex_); #ifdef ENABLE_NCCL_ERROR_CHECKING if (aborted_) { @@ -491,7 +491,7 @@ class NCCLComm { int rank_; // Optional reason for communicator failure, provided by ProcessGroupNCCL for // better error messaging. - c10::optional commFailureReason_; + std::optional commFailureReason_; bool initialized_{false}; #ifdef NCCL_HAS_COMM_REGISTER // Stores handlers for tensors registered by NCCL diff --git a/torch/csrc/distributed/c10d/Ops.cpp b/torch/csrc/distributed/c10d/Ops.cpp index cf8b7cd966ef5..ae822ad397504 100644 --- a/torch/csrc/distributed/c10d/Ops.cpp +++ b/torch/csrc/distributed/c10d/Ops.cpp @@ -168,7 +168,7 @@ IMPL_BROADCAST(PrivateUse1) at::TensorList tensors, \ const c10::intrusive_ptr& process_group, \ const c10::intrusive_ptr& reduce_op, \ - const c10::optional& sparse_indices, \ + const std::optional& sparse_indices, \ int64_t timeout) { \ auto tensor_vec = tensors.vec(); \ auto work = process_group->getBackend(c10::DeviceType::DEV) -> allreduce( \ @@ -460,7 +460,7 @@ allreduce_sparse_cuda_( at::TensorList tensors, const c10::intrusive_ptr& process_group, const c10::intrusive_ptr& reduce_op, - const c10::optional& sparse_indices, + const std::optional& sparse_indices, int64_t timeout) { auto tensor_vec = tensors.vec(); auto work = process_group->getBackend(c10::DeviceType::CUDA) diff --git a/torch/csrc/distributed/c10d/ProcessGroup.hpp b/torch/csrc/distributed/c10d/ProcessGroup.hpp index 8c805020e8cf6..acf8c9c354a76 100644 --- a/torch/csrc/distributed/c10d/ProcessGroup.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroup.hpp @@ -162,7 +162,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { at::TensorList, const c10::intrusive_ptr<::c10d::ProcessGroup>&, const c10::intrusive_ptr<::c10d::ReduceOp>&, - const c10::optional& sparse_indices, + const std::optional& sparse_indices, int64_t)>(); return std::get<1>(op.call( @@ -620,7 +620,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { void setBackend( c10::DeviceType deviceType, BackendType backendType, - const c10::optional>& backend) { + const std::optional>& backend) { // TODO: should we add these entries after the backend setting succeeds? deviceTypeToBackendType_[deviceType] = backendType; deviceTypes_.insert(deviceType); @@ -703,11 +703,11 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { // optimizations such as automatic use of ncclCommSplit. The device // is specified in `init_process_group` and eventually makes it // here and then down into the actual backend instances. 
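// --- editor's sketch (illustration only, not part of the patch) -------------
// The NCCLUtils hunks above thread an optional, human-readable failure reason
// through error reporting: when ProcessGroupNCCL aborted a communicator (for
// example on a collective timeout), that reason takes priority over the raw
// NCCL error string. Standalone sketch; the error-code-to-string mapping below
// is a placeholder, not the real ncclGetErrorWithVersion.
#include <optional>
#include <string>

std::string nccl_error_detail_sketch(
    int error_code,
    const std::optional<std::string>& process_group_failure_reason) {
  if (process_group_failure_reason.has_value()) {
    return *process_group_failure_reason;  // provided reason wins
  }
  return "NCCL error code " + std::to_string(error_code);
}
// -----------------------------------------------------------------------------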
- c10::optional getBoundDeviceId() const { + std::optional getBoundDeviceId() const { return bound_device_id_; } - void setBoundDeviceId(c10::optional device) { + void setBoundDeviceId(std::optional device) { if (device) { TORCH_CHECK(device->has_index(), "setBoundDeviceId must have an index"); } @@ -742,7 +742,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { std::unordered_map> backendTypeToBackend_; - c10::optional bound_device_id_; + std::optional bound_device_id_; }; } // namespace c10d diff --git a/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp b/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp index ada56cbee1990..cba0249829e68 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp @@ -479,7 +479,7 @@ void returnFutureWithOutput( inline void ProcessGroupGloo::AsyncWork::recordAsyncWorkProfilingInfo( const char* profilingTitle, - const c10::optional>& inputTensors) { + const std::optional>& inputTensors) { auto recordingFunction = std::make_shared(at::RecordScope::USER_SCOPE); if (recordingFunction->isActive()) { @@ -511,7 +511,7 @@ ProcessGroupGloo::AsyncWork::AsyncWork( OpType opType, uint64_t seq, const char* profilingTitle, - const c10::optional>& inputTensors) + const std::optional>& inputTensors) // Profiler: Pass nullptr as profilingTitle to parent constructor to // replace default profiler implementation with async version that reports // correct timestamps for work that is asynchronously executed. @@ -547,7 +547,7 @@ ProcessGroupGloo::SendWork::SendWork( -1, OpType::SEND, "gloo:send", - c10::optional>({tensor})), + std::optional>({tensor})), tensor_(tensor), buffer_(std::move(buffer)), seq_(seq) {} @@ -588,7 +588,7 @@ ProcessGroupGloo::RecvWork::RecvWork( -1, opType, profilingTitle, - c10::optional>({tensor})), + std::optional>({tensor})), tensor_(tensor), buffer_(std::move(buffer)), srcRank_(-1), @@ -2424,7 +2424,7 @@ class AsyncScatterWork : public ProcessGroupGloo::AsyncWork { OpType::SCATTER, seq, "gloo:scatter", - !inputs.empty() ? c10::optional>(inputs[0]) + !inputs.empty() ? 
std::optional>(inputs[0]) : c10::nullopt), context(context), outputs(outputs), @@ -2620,7 +2620,7 @@ class AsyncAlltoallWork : public ProcessGroupGloo::AsyncWork { OpType::ALLTOALL, seq, "gloo:all_to_all", - c10::optional>({inputTensor})), + std::optional>({inputTensor})), context(context), outputTensor(outputTensor), inputTensor(inputTensor), diff --git a/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp b/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp index d40b205c25601..87c87b8f1ae9b 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp @@ -73,7 +73,7 @@ class TORCH_API ProcessGroupGloo : public Backend { OpType opType, uint64_t seq, const char* profilingTitle = nullptr, - const c10::optional>& inputTensors = + const std::optional>& inputTensors = c10::nullopt); ~AsyncWork() override = default; @@ -95,7 +95,7 @@ class TORCH_API ProcessGroupGloo : public Backend { void finishWorkGlooError(const std::exception_ptr& eptr); inline void recordAsyncWorkProfilingInfo( const char* profilingTitle, - const c10::optional>& inputTensors); + const std::optional>& inputTensors); const std::vector> outputTensors_; c10::intrusive_ptr future_; diff --git a/torch/csrc/distributed/c10d/ProcessGroupMPI.cpp b/torch/csrc/distributed/c10d/ProcessGroupMPI.cpp index 29d05a9693b14..6d02f89f6005b 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupMPI.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupMPI.cpp @@ -121,7 +121,7 @@ ProcessGroupMPI::AsyncWork::AsyncWork( MPI_Request request, std::vector outputTensors, const char* profilingTitle, - const c10::optional>& inputTensors) + const std::optional>& inputTensors) : Work(-1, OpType::UNKNOWN, profilingTitle, inputTensors), outputTensors_(std::move(outputTensors)), request_(request) { @@ -379,7 +379,7 @@ void ProcessGroupMPI::runLoop() { c10::intrusive_ptr ProcessGroupMPI::enqueue( std::unique_ptr entry, const char* profilingTitle, - const c10::optional>& inputTensors) { + const std::optional>& inputTensors) { auto work = c10::make_intrusive(entry->dst, profilingTitle, inputTensors); std::unique_lock lock(pgMutex_); @@ -410,7 +410,7 @@ c10::intrusive_ptr ProcessGroupMPI::broadcast( return enqueue( std::move(entry), "mpi:broadcast", - c10::optional>(tensors)); + std::optional>(tensors)); } c10::intrusive_ptr ProcessGroupMPI::allreduce( @@ -436,7 +436,7 @@ c10::intrusive_ptr ProcessGroupMPI::allreduce( return enqueue( std::move(entry), "mpi:all_reduce", - c10::optional>(tensors)); + std::optional>(tensors)); } c10::intrusive_ptr ProcessGroupMPI::allreduce_coalesced( @@ -473,7 +473,7 @@ c10::intrusive_ptr ProcessGroupMPI::reduce( return enqueue( std::move(entry), "mpi:reduce", - c10::optional>(tensors)); + std::optional>(tensors)); } c10::intrusive_ptr ProcessGroupMPI::allgather( @@ -522,7 +522,7 @@ c10::intrusive_ptr ProcessGroupMPI::allgather( return enqueue( std::move(entry), "mpi:all_gather", - c10::optional>(inputTensors)); + std::optional>(inputTensors)); } c10::intrusive_ptr ProcessGroupMPI::allgather_coalesced( @@ -598,14 +598,14 @@ c10::intrusive_ptr ProcessGroupMPI::gather( return enqueue( std::move(entry), "mpi:gather", - c10::optional>(inputTensors)); + std::optional>(inputTensors)); } else { auto entry = std::make_unique(&inputTensors, nullptr, std::move(runFunc)); return enqueue( std::move(entry), "mpi:gather", - c10::optional>(inputTensors)); + std::optional>(inputTensors)); } } @@ -672,7 +672,7 @@ c10::intrusive_ptr ProcessGroupMPI::scatter( std::move(entry), "mpi:scatter", 
!inputTensors.empty() - ? c10::optional>(inputTensors[0]) + ? std::optional>(inputTensors[0]) : c10::nullopt); } else { auto entry = std::make_unique( @@ -681,7 +681,7 @@ c10::intrusive_ptr ProcessGroupMPI::scatter( std::move(entry), "mpi:scatter", !inputTensors.empty() - ? c10::optional>(inputTensors[0]) + ? std::optional>(inputTensors[0]) : c10::nullopt); } } @@ -734,7 +734,7 @@ c10::intrusive_ptr ProcessGroupMPI::alltoall_base( return enqueue( std::move(entry), "mpi:all_to_all", - c10::optional>(inputTensors)); + std::optional>(inputTensors)); } else { // Need alltoallv c10d::checkSplitSizes(inputSplitSizes, inputTensor, size_); @@ -772,7 +772,7 @@ c10::intrusive_ptr ProcessGroupMPI::alltoall_base( return enqueue( std::move(entry), "mpi:all_to_all", - c10::optional>(inputTensors)); + std::optional>(inputTensors)); } } @@ -835,7 +835,7 @@ c10::intrusive_ptr ProcessGroupMPI::alltoall( return enqueue( std::move(entry), "mpi:all_to_all", - c10::optional>(inputTensors)); + std::optional>(inputTensors)); } c10::intrusive_ptr ProcessGroupMPI::send( @@ -864,7 +864,7 @@ c10::intrusive_ptr ProcessGroupMPI::send( request, std::vector(), "mpi:send", - c10::optional>(tensors)); + std::optional>(tensors)); } c10::intrusive_ptr ProcessGroupMPI::recv( @@ -893,7 +893,7 @@ c10::intrusive_ptr ProcessGroupMPI::recv( request, tensors, "mpi:recv", - c10::optional>(tensors)); + std::optional>(tensors)); } c10::intrusive_ptr ProcessGroupMPI::recvAnysource( @@ -921,7 +921,7 @@ c10::intrusive_ptr ProcessGroupMPI::recvAnysource( request, tensors, "mpi:recvAnySource", - c10::optional>(tensors)); + std::optional>(tensors)); } c10::intrusive_ptr ProcessGroupMPI::barrier(const BarrierOptions& opts) { diff --git a/torch/csrc/distributed/c10d/ProcessGroupMPI.hpp b/torch/csrc/distributed/c10d/ProcessGroupMPI.hpp index dd586dda7024b..6e52e680e5c20 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupMPI.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupMPI.hpp @@ -86,7 +86,7 @@ class TORCH_API ProcessGroupMPI : public Backend { explicit WorkMPI( std::vector outputTensors, const char* profilingTitle = nullptr, - const c10::optional>& inputTensors = + const std::optional>& inputTensors = c10::nullopt) : Work(-1, OpType::UNKNOWN, profilingTitle, inputTensors), outputTensors_(std::move(outputTensors)), @@ -114,7 +114,7 @@ class TORCH_API ProcessGroupMPI : public Backend { MPI_Request request, std::vector outputTensors, const char* profilingTitle = nullptr, - const c10::optional>& inputTensors = + const std::optional>& inputTensors = c10::nullopt); ~AsyncWork() override; @@ -243,7 +243,7 @@ class TORCH_API ProcessGroupMPI : public Backend { c10::intrusive_ptr enqueue( std::unique_ptr entry, const char* profilingTitle = nullptr, - const c10::optional>& inputTensors = + const std::optional>& inputTensors = c10::nullopt); bool stop_; diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp index 6cca50daff6c4..7437a4ef1846a 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp @@ -352,9 +352,9 @@ std::string dump_nccl_trace() { } #endif -c10::optional)>>& +std::optional)>>& get_cpp_trace_dumper() { - static c10::optional< + static std::optional< std::function)>> dumper(c10::nullopt); return dumper; @@ -431,7 +431,7 @@ ProcessGroupNCCL::WorkNCCL::WorkNCCL( OpType opType, uint64_t seq, const char* profilingTitle, - const c10::optional>& inputs, + const std::optional>& inputs, bool desyncDebug, bool enableTiming, 
DebugLevel distDebugLevel) @@ -546,7 +546,7 @@ bool ProcessGroupNCCL::WorkNCCL::finishedGPUExecutionInternal() const { } bool ProcessGroupNCCL::WorkNCCL::checkTimeout( - c10::optional timeout) { + std::optional timeout) { auto currentTimepoint = std::chrono::steady_clock::now(); auto timeElapsed = std::chrono::duration_cast( currentTimepoint - workStartTime_); @@ -1036,7 +1036,7 @@ void ProcessGroupNCCL::waitForFutureOrTimeout( void ProcessGroupNCCL::abortCommsFromMap( std::unordered_map>& ncclCommsMap, - c10::optional abortReason) { + std::optional abortReason) { // The process may control multiple devices, loop through the communicators on // each device for (auto& it : ncclCommsMap) { @@ -1069,7 +1069,7 @@ void ProcessGroupNCCL::abortCommsFromMap( } // Abort all communicators on this rank -bool ProcessGroupNCCL::abort(c10::optional abortReason) { +bool ProcessGroupNCCL::abort(std::optional abortReason) { // Remove record from global ncclCommDevIdxMapMutex before aboarting, // so that a new cache segment would not register to already aborded // communicators. Note that ncclCommDevIdxMap is a global container which may @@ -1088,7 +1088,7 @@ bool ProcessGroupNCCL::abort(c10::optional abortReason) { return true; } -void ProcessGroupNCCL::shutdown(c10::optional reason) { +void ProcessGroupNCCL::shutdown(std::optional reason) { // Don't join threads here since the purpose of this method is to abort all // communicators and signal the threads to exit. Joining on the threads could // potentially block and hence avoid it in this method. @@ -1188,7 +1188,7 @@ void ProcessGroupNCCL::heartbeatMonitor() { : heartbeatTimeoutInSec_ * 1000; auto lastTimePollStore = std::chrono::steady_clock::now(); auto lastTimeHeartBeatCheck = std::chrono::steady_clock::now(); - c10::optional dumpPipe = c10::nullopt; + std::optional dumpPipe = c10::nullopt; if (uid_ == 0) { // DumpPipe is one per-trainer process, and its convenient to name them // after 'global' ranks in the system, So we assume processgroup (uid)==0 is @@ -2241,7 +2241,7 @@ c10::intrusive_ptr ProcessGroupNCCL::initWork( opType, seq_, profilingTitle, - profilingTitle != nullptr ? c10::optional>(inputs) + profilingTitle != nullptr ? std::optional>(inputs) : c10::nullopt, desyncDebug_, enableTiming_.load(), diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp index fac9b6f38204e..4217d2fa4cea5 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp @@ -248,7 +248,7 @@ class TORCH_API ProcessGroupNCCL : public Backend { OpType opType, uint64_t seq, const char* profilingTitle = nullptr, - const c10::optional>& inputs = c10::nullopt, + const std::optional>& inputs = c10::nullopt, bool desyncDebug = false, bool enableTiming = false, DebugLevel distDebugLevel = DebugLevel::Off); @@ -305,7 +305,7 @@ class TORCH_API ProcessGroupNCCL : public Backend { // and False otherwise. // In case of timeout, set exception on the WorkNCCL object. 
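// --- editor's sketch (illustration only, not part of the patch) -------------
// The ProcessGroupNCCL hunks above keep an optional-override pattern for
// timeouts: checkTimeout() can be handed an explicit deadline, otherwise the
// work falls back to its own configured timeout. Standalone sketch of that
// value_or-style fallback; the real method also records an exception on the
// work object, which is omitted here.
#include <chrono>
#include <optional>

struct WorkSketch {
  std::chrono::milliseconds op_timeout{60000};
  std::chrono::steady_clock::time_point start =
      std::chrono::steady_clock::now();

  bool checkTimeout(
      std::optional<std::chrono::milliseconds> timeout = std::nullopt) const {
    auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(
        std::chrono::steady_clock::now() - start);
    auto limit = timeout.value_or(op_timeout);  // explicit override or default
    return elapsed >= limit;
  }
};
// -----------------------------------------------------------------------------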
bool checkTimeout( - c10::optional timeout = c10::nullopt); + std::optional timeout = c10::nullopt); std::vector result() override; @@ -399,7 +399,7 @@ class TORCH_API ProcessGroupNCCL : public Backend { bool timingEnabled_; // unique id used to tell the trace buffer that this // work has completed - c10::optional trace_id_; + std::optional trace_id_; DebugLevel distDebugLevel_; friend class ProcessGroupNCCL; }; @@ -621,16 +621,16 @@ class TORCH_API ProcessGroupNCCL : public Backend { // Helper function for iteratively aborting communicators in the provided map void abortCommsFromMap( std::unordered_map>& ncclCommsMap, - c10::optional abortReason); + std::optional abortReason); c10::intrusive_ptr initIntraNodeComm(); // Provides an API to abort the ProcessGroup (similar to ncclCommAbort) // instead of relying on ProcessGroupNCCL destructor. // return true if abort is successful, otherwise false - bool abort(c10::optional abortReason = c10::nullopt); + bool abort(std::optional abortReason = c10::nullopt); - void shutdown(c10::optional reason = c10::nullopt); + void shutdown(std::optional reason = c10::nullopt); void eagerConnectSingleDevice(at::Device device) override; @@ -1092,7 +1092,7 @@ TORCH_API std::string dump_nccl_trace(); // Gets a mutable reference to a global optional function. Heartbeat Monitor // will use this function to dump traces, if available. Inside fbcode, we store // a function here that uses an internal tool for process tracing -TORCH_API c10::optional< +TORCH_API std::optional< std::function)>>& get_cpp_trace_dumper(); diff --git a/torch/csrc/distributed/c10d/ProcessGroupUCC.hpp b/torch/csrc/distributed/c10d/ProcessGroupUCC.hpp index 22fc58134566c..ab1e1e4c4899e 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupUCC.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupUCC.hpp @@ -119,7 +119,7 @@ class TORCH_API ProcessGroupUCC : public Backend { OpType opType, uint64_t seq, const char* prof_title, - const c10::optional>& inputs, + const std::optional>& inputs, const c10::intrusive_ptr& logger) : Work(-1, opType, prof_title, inputs), logger_(logger), seq_(seq) {} ~WorkUCC(); diff --git a/torch/csrc/distributed/c10d/Store.hpp b/torch/csrc/distributed/c10d/Store.hpp index 525440e767b47..af715ba98a794 100644 --- a/torch/csrc/distributed/c10d/Store.hpp +++ b/torch/csrc/distributed/c10d/Store.hpp @@ -13,7 +13,7 @@ namespace c10d { // callback function will be given arguments (optional oldValue, // optional newValue) using WatchKeyCallback = - std::function, c10::optional)>; + std::function, c10::optional)>; class TORCH_API Store : public torch::CustomClassHolder { public: diff --git a/torch/csrc/distributed/c10d/TCPStore.cpp b/torch/csrc/distributed/c10d/TCPStore.cpp index a95f0ebdb1e26..aee1d7677dc4d 100644 --- a/torch/csrc/distributed/c10d/TCPStore.cpp +++ b/torch/csrc/distributed/c10d/TCPStore.cpp @@ -268,7 +268,7 @@ using detail::Socket; TCPStore::TCPStore( const std::string& masterAddr, std::uint16_t masterPort, - c10::optional numWorkers, + std::optional numWorkers, bool isServer, const std::chrono::milliseconds& timeout, bool waitWorkers) @@ -277,7 +277,7 @@ TCPStore::TCPStore( TCPStoreOptions{ masterPort, isServer, - numWorkers ? c10::optional(*numWorkers) + numWorkers ? 
std::optional(*numWorkers) : c10::nullopt, waitWorkers, timeout}} {} diff --git a/torch/csrc/distributed/c10d/TCPStore.hpp b/torch/csrc/distributed/c10d/TCPStore.hpp index 03a7f124ca710..7080d50136e96 100644 --- a/torch/csrc/distributed/c10d/TCPStore.hpp +++ b/torch/csrc/distributed/c10d/TCPStore.hpp @@ -49,7 +49,7 @@ struct TCPStoreOptions { std::uint16_t port = kDefaultPort; bool isServer = false; - c10::optional numWorkers = c10::nullopt; + std::optional numWorkers = c10::nullopt; bool waitWorkers = true; std::chrono::milliseconds timeout = Store::kDefaultTimeout; @@ -60,7 +60,7 @@ struct TCPStoreOptions { // If specified, and if isServer is true, the underlying TCPServer will take // over the bound socket associated to this fd. This option is useful to avoid // port assignment races in certain scenarios. - c10::optional masterListenFd = c10::nullopt; + std::optional masterListenFd = c10::nullopt; // A boolean value indicating whether to use the experimental libUV backend. bool useLibUV = false; @@ -73,7 +73,7 @@ class TORCH_API TCPStore : public Store { [[deprecated("Use TCPStore(host, opts) instead.")]] explicit TCPStore( const std::string& masterAddr, std::uint16_t masterPort, - c10::optional numWorkers = c10::nullopt, + std::optional numWorkers = c10::nullopt, bool isServer = false, const std::chrono::milliseconds& timeout = kDefaultTimeout, bool waitWorkers = true); @@ -152,7 +152,7 @@ class TORCH_API TCPStore : public Store { detail::SocketAddress addr_; std::shared_ptr server_; std::unique_ptr client_; - c10::optional numWorkers_; + std::optional numWorkers_; const std::string initKey_ = "init/"; const std::string keyPrefix_ = "/"; diff --git a/torch/csrc/distributed/c10d/TraceUtils.h b/torch/csrc/distributed/c10d/TraceUtils.h index 32f0e1f41df01..181f2208160b7 100644 --- a/torch/csrc/distributed/c10d/TraceUtils.h +++ b/torch/csrc/distributed/c10d/TraceUtils.h @@ -15,6 +15,34 @@ #include namespace c10d { +static c10::IValue entries_key = "entries"; +static c10::IValue nccl_comm_key = "nccl_comm_state"; +static c10::IValue version_key = "version"; +// Update whenever changing contents or formatting of the dump +// (minor when adding fields, major when changing existing fields) +static c10::IValue version_val = "1.5"; +static c10::IValue pg_config_key = "pg_config"; +static c10::IValue record_id_key = "record_id"; +static c10::IValue pg_id_key = "pg_id"; +static c10::IValue pg_name_key = "process_group"; +static c10::IValue seq_id_key = "seq_id"; +static c10::IValue op_id_key = "op_id"; +static c10::IValue profiling_name_key = "profiling_name"; +static c10::IValue input_sizes_key = "input_sizes"; +static c10::IValue output_sizes_key = "output_sizes"; +static c10::IValue time_created_key = "time_created_ns"; +static c10::IValue duration_key = "duration_ms"; + +static c10::IValue frames_key = "frames"; +static c10::IValue state_key = "state"; +static c10::IValue line_key = "line"; +static c10::IValue name_key = "name"; +static c10::IValue filename_key = "filename"; +static c10::IValue retired_key = "retired"; +static c10::IValue time_discovered_started_key = "time_discovered_started_ns"; +static c10::IValue time_discovered_completed_key = + "time_discovered_completed_ns"; + /* Trace Utils Related to TORCH_NCCL_DESYNC_DEBUG */ inline std::string getTraceStartKey(const std::string& pgName, int rank) { @@ -417,18 +445,18 @@ struct NCCLTraceBuffer { // timestamp when the entry was created, likely close to the time the work // was 'enqueued'- not necessarily started c10::time_t 
time_created_; - c10::optional duration_; + std::optional duration_; // timestamp when our CPU threads discovered that the kernel started. // will always be _after_ it actually started, and can be very late // if the watchdog thread got stuck on CUDA APIs. - c10::optional time_discovered_started_; + std::optional time_discovered_started_; // timestamp when our CPU threads discovered that the kernel completed. // will always be _after_ it actually complated, and can be the same time // as the discovery of the start if the watchdog thread is stuck on CUDA // APIs - c10::optional time_discovered_completed_; + std::optional time_discovered_completed_; // size information for input/output tensors c10::SmallVector input_dims_; @@ -448,7 +476,7 @@ struct NCCLTraceBuffer { std::map, std::vector> pg_name_to_ranks_ = {}; - c10::optional record( + std::optional record( size_t pg_id, const std::tuple& pg_name, size_t seq_id, @@ -551,7 +579,7 @@ struct NCCLTraceBuffer { never hang. (timing must also be enabled for compute_duration - see TORCH_NCCL_ENABLE_TIMING). */ - void retire_id(c10::optional id, bool compute_duration = true) { + void retire_id(std::optional id, bool compute_duration = true) { if (!enabled_ || !id) { return; } @@ -559,7 +587,7 @@ struct NCCLTraceBuffer { bool can_compute_duration = false; Event* startEvent = nullptr; Event* endEvent = nullptr; - c10::optional duration = c10::nullopt; + std::optional duration = c10::nullopt; std::unique_lock guard(mutex_); @@ -601,37 +629,11 @@ struct NCCLTraceBuffer { } std::string dump( - const c10::optional>>& ncclDumpMap) { auto result = dump_entries(); auto entries = new_list(); - c10::IValue entries_key = "entries"; - c10::IValue nccl_comm_key = "nccl_comm_state"; - c10::IValue version_key = "version"; - // Update whenever changing contents or formatting of the dump - // (minor when adding fields, major when changing existing fields) - c10::IValue version_val = "1.5"; - c10::IValue pg_config_key = "pg_config"; - c10::IValue record_id_key = "record_id"; - c10::IValue pg_id_key = "pg_id"; - c10::IValue pg_name_key = "process_group"; - c10::IValue seq_id_key = "seq_id"; - c10::IValue op_id_key = "op_id"; - c10::IValue profiling_name_key = "profiling_name"; - c10::IValue input_sizes_key = "input_sizes"; - c10::IValue output_sizes_key = "output_sizes"; - c10::IValue time_created_key = "time_created_ns"; - c10::IValue duration_key = "duration_ms"; - - c10::IValue frames_key = "frames"; - c10::IValue state_key = "state"; - c10::IValue line_key = "line"; - c10::IValue name_key = "name"; - c10::IValue filename_key = "filename"; - c10::IValue retired_key = "retired"; - c10::IValue time_discovered_started_key = "time_discovered_started_ns"; - c10::IValue time_discovered_completed_key = "time_discovered_completed_ns"; std::vector tracebacks; for (auto& e : result) { diff --git a/torch/csrc/distributed/c10d/Types.hpp b/torch/csrc/distributed/c10d/Types.hpp index fab819798e555..669957a726735 100644 --- a/torch/csrc/distributed/c10d/Types.hpp +++ b/torch/csrc/distributed/c10d/Types.hpp @@ -121,7 +121,7 @@ struct BroadcastOptions { struct AllreduceOptions { ReduceOp reduceOp = ReduceOp::SUM; std::chrono::milliseconds timeout = kUnsetTimeout; - c10::optional sparseIndices = c10::nullopt; + std::optional sparseIndices = c10::nullopt; }; struct AllreduceCoalescedOptions : AllreduceOptions {}; @@ -162,7 +162,7 @@ struct AllToAllOptions { struct BarrierOptions { std::vector device_ids; std::chrono::milliseconds timeout = kUnsetTimeout; - c10::optional device; + 
std::optional device; }; struct DistributedBackendOptions { diff --git a/torch/csrc/distributed/c10d/Work.cpp b/torch/csrc/distributed/c10d/Work.cpp index 66c35b11e6c0f..8beb8f2936208 100644 --- a/torch/csrc/distributed/c10d/Work.cpp +++ b/torch/csrc/distributed/c10d/Work.cpp @@ -9,7 +9,7 @@ Work::Work( int rank, OpType opType, const char* profilingTitle, - const c10::optional>& inputTensors) + const std::optional>& inputTensors) : rank_(rank), opType_(opType) { if (profilingTitle != nullptr) { auto recordingFunction = diff --git a/torch/csrc/distributed/c10d/Work.hpp b/torch/csrc/distributed/c10d/Work.hpp index d106183231706..d29b838321176 100644 --- a/torch/csrc/distributed/c10d/Work.hpp +++ b/torch/csrc/distributed/c10d/Work.hpp @@ -50,7 +50,7 @@ class TORCH_API Work : public torch::CustomClassHolder { int rank = -1, OpType opType = OpType::UNKNOWN, const char* profilingTitle = nullptr, - const c10::optional>& inputTensors = + const std::optional>& inputTensors = c10::nullopt); ~Work() override; diff --git a/torch/csrc/distributed/c10d/comm.hpp b/torch/csrc/distributed/c10d/comm.hpp index d2c608532ba53..6f9203e214348 100644 --- a/torch/csrc/distributed/c10d/comm.hpp +++ b/torch/csrc/distributed/c10d/comm.hpp @@ -26,7 +26,7 @@ class TORCH_API GradBucket { std::vector lengths, std::vector sizes_vec, std::vector parameters, - c10::optional sparse_grad_indices) + std::optional sparse_grad_indices) : index_(index), bucket_count_(bucket_count), buffer_(std::move(tensor)), @@ -72,7 +72,7 @@ class TORCH_API GradBucket { return index_ == bucket_count_ - 1; } - c10::optional& getSparseGradIndices() { + std::optional& getSparseGradIndices() { return sparse_grad_indices_; } @@ -92,7 +92,7 @@ class TORCH_API GradBucket { // Predefined sparse indices for this bucket (only used for sparse tensors). // The gradients will be updated to have indices with these tensor values - c10::optional sparse_grad_indices_; + std::optional sparse_grad_indices_; }; // Base class of both `PythonCommHook` and `CppCommHook`. diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index 7cbd898499c38..483becbce0094 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -1409,14 +1409,14 @@ Example:: .def( py::init([](const std::string& host, uint16_t port, - c10::optional worldSize, + std::optional worldSize, bool isServer, std::chrono::milliseconds timeout, bool waitWorkers, bool multiTenant, - c10::optional masterListenFd, + std::optional masterListenFd, bool useLibUV) { - c10::optional numWorkers = c10::nullopt; + std::optional numWorkers = c10::nullopt; if (worldSize.has_value() && worldSize.value() > -1) { numWorkers = static_cast(worldSize.value()); } @@ -1801,14 +1801,14 @@ that adds a prefix to each key inserted to the store. [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self, const c10::Device& device, const ::c10d::ProcessGroup::BackendType& backendType, - const c10::optional>& + const std::optional>& backend) { self->setBackend(device.type(), backendType, backend); }, py::arg("device"), py::arg("backend_type"), py::arg("backend") = - c10::optional>(), + std::optional>(), py::call_guard()) .def( "_get_backend", @@ -2432,7 +2432,7 @@ options :class:`~torch.distributed.ProcessGroupNCCL.Options`). 
py::init([](const c10::intrusive_ptr<::c10d::Store>& store, size_t rank, size_t world_size, - c10::optional buffer_size) { + std::optional buffer_size) { auto comm = c10::make_intrusive( store, rank, world_size, buffer_size); if (!comm->rendezvous()) { @@ -2744,7 +2744,7 @@ such as `dist.all_reduce(tensor, async_op=True)`. const std::vector& bucket_size_limits, const std::vector& expect_sparse_gradient, const std::vector& tensor_indices, - const c10::optional>& logger) { + const std::optional>& logger) { if (logger.has_value()) { std::weak_ptr<::c10d::Logger> logger_weakref = logger.value(); return ::c10d::compute_bucket_assignment_by_size( @@ -2766,14 +2766,14 @@ such as `dist.all_reduce(tensor, async_op=True)`. py::arg("bucket_size"), py::arg("expect_sparse_gradient") = std::vector(), py::arg("tensor_indices") = std::vector(), - py::arg("logger") = c10::optional>{}, + py::arg("logger") = std::optional>{}, py::call_guard()); module.def( "_verify_params_across_processes", [](const c10::intrusive_ptr<::c10d::ProcessGroup>& process_group, const std::vector& params, - const c10::optional>& logger) { + const std::optional>& logger) { if (logger.has_value()) { std::weak_ptr<::c10d::Logger> logger_weakref = logger.value(); verify_params_across_processes( @@ -2784,7 +2784,7 @@ such as `dist.all_reduce(tensor, async_op=True)`. }, py::arg("process_group"), py::arg("params"), - py::arg("logger") = c10::optional>{}, + py::arg("logger") = std::optional>{}, py::call_guard()); module.def( diff --git a/torch/csrc/distributed/c10d/intra_node_comm.cpp b/torch/csrc/distributed/c10d/intra_node_comm.cpp index d18262ecfa3f5..ceec7bbd0f9ce 100644 --- a/torch/csrc/distributed/c10d/intra_node_comm.cpp +++ b/torch/csrc/distributed/c10d/intra_node_comm.cpp @@ -207,7 +207,7 @@ IntraNodeComm::IntraNodeComm( c10::intrusive_ptr store, size_t rank, size_t worldSize, - c10::optional bufferSize) + std::optional bufferSize) : store_(std::move(store)), rank_(rank), worldSize_(worldSize), diff --git a/torch/csrc/distributed/c10d/intra_node_comm.cu b/torch/csrc/distributed/c10d/intra_node_comm.cu index 6d72bde221253..ce479cd187bc4 100644 --- a/torch/csrc/distributed/c10d/intra_node_comm.cu +++ b/torch/csrc/distributed/c10d/intra_node_comm.cu @@ -732,7 +732,7 @@ static __global__ void barrierKernel( } } -void IntraNodeComm::barrier(c10::optional> ranks) { +void IntraNodeComm::barrier(std::optional> ranks) { if (!ranks.has_value()) { ranks = std::vector(worldSize_); std::iota(ranks->begin(), ranks->end(), 0); diff --git a/torch/csrc/distributed/c10d/intra_node_comm.hpp b/torch/csrc/distributed/c10d/intra_node_comm.hpp index ab27ecef97338..fe591978c5332 100644 --- a/torch/csrc/distributed/c10d/intra_node_comm.hpp +++ b/torch/csrc/distributed/c10d/intra_node_comm.hpp @@ -33,7 +33,7 @@ class TORCH_API IntraNodeComm : public c10::intrusive_ptr_target { c10::intrusive_ptr store, size_t rank, size_t worldSize, - c10::optional bufferSize = c10::nullopt); + std::optional bufferSize = c10::nullopt); ~IntraNodeComm() override; @@ -61,7 +61,7 @@ class TORCH_API IntraNodeComm : public c10::intrusive_ptr_target { /** * Perform a barrier among the specified ranks. 
*/ - void barrier(c10::optional> ranks = c10::nullopt); + void barrier(std::optional> ranks = c10::nullopt); /** * Puts the given tensor into the p2p buffer of the current rank at the diff --git a/torch/csrc/distributed/c10d/logger.cpp b/torch/csrc/distributed/c10d/logger.cpp index 3ce4880930cb2..711039bf48595 100644 --- a/torch/csrc/distributed/c10d/logger.cpp +++ b/torch/csrc/distributed/c10d/logger.cpp @@ -247,7 +247,7 @@ void Logger::calculate_avg_time( Timer::Event start_event, Timer::Event end_event) { TORCH_CHECK(num_iterations_stats_recorded_ > 0); - c10::optional maybe_time_duration = + std::optional maybe_time_duration = timer.measureDifference(start_event, end_event); if (!maybe_time_duration.has_value()) { return; diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp index a885bd2e9e7cb..d600426192cef 100644 --- a/torch/csrc/distributed/c10d/reducer.cpp +++ b/torch/csrc/distributed/c10d/reducer.cpp @@ -51,7 +51,7 @@ class CpuTimer : public Timer { public: explicit CpuTimer(c10::Device /* unused */) {} - c10::optional measureDifference(Event start, Event end) override { + std::optional measureDifference(Event start, Event end) override { int64_t start_time = getTimeRef(start); int64_t end_time = getTimeRef(end); // If cpu_end_time is not recorded in this iteration, @@ -2096,7 +2096,7 @@ compute_bucket_assignment_by_size( const std::vector& bucket_size_limits, const std::vector& expect_sparse_gradient, const std::vector& tensor_indices, - const c10::optional>& logger) { + const std::optional>& logger) { // Either expect_sparse_gradient is not specified or it has as many elements // as the vector with tensors. TORCH_INTERNAL_ASSERT( @@ -2221,7 +2221,7 @@ compute_bucket_assignment_by_size( void verify_params_across_processes( const c10::intrusive_ptr& process_group, const std::vector& params, - const c10::optional>& logger) { + const std::optional>& logger) { // First verify number of parameters to avoid inconsistent inputs into // broadcast which can cause a crash. // See https://github.com/pytorch/pytorch/issues/73547 diff --git a/torch/csrc/distributed/c10d/reducer.hpp b/torch/csrc/distributed/c10d/reducer.hpp index e940a56bd650a..1f72b0eb37b9f 100644 --- a/torch/csrc/distributed/c10d/reducer.hpp +++ b/torch/csrc/distributed/c10d/reducer.hpp @@ -261,10 +261,10 @@ class TORCH_API Reducer { std::weak_ptr logger_; // List of futures installed by Reducer::install_futures that should be // awaited at the end of backwards pass. - c10::optional>> + std::optional>> installed_futures_{c10::nullopt}; // Mixed precision parameter dtype for bucket type checking. - c10::optional mixed_precision_param_dtype_{c10::nullopt}; + std::optional mixed_precision_param_dtype_{c10::nullopt}; // Work handle for allreduce on local_used_map_ c10::intrusive_ptr local_used_work_; @@ -389,7 +389,7 @@ class TORCH_API Reducer { bool expect_sparse_gradient = false; // Sparse indices tensor - c10::optional sparse_tensor_indices = c10::nullopt; + std::optional sparse_tensor_indices = c10::nullopt; // TODO(@pietern) // Memory copies from gradient tensors into the bucket are potentially @@ -576,12 +576,12 @@ compute_bucket_assignment_by_size( const std::vector& bucket_size, const std::vector& expect_sparse_gradient = {}, const std::vector& tensor_indices = {}, - const c10::optional>& logger = {}); + const std::optional>& logger = {}); // Verify models across all processes are the same as model on rank 0 with // respect to no. of params and matching dtype/size/layout. 
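// [Editor's illustrative sketch, not part of the diff.] A minimal standalone
// example (invented names, not the reducer Timer API) of the std::optional
// return pattern these hunks migrate to: measureDifference() yields
// std::nullopt when either endpoint was never recorded, instead of a sentinel.
#include <chrono>
#include <cstdint>
#include <iostream>
#include <optional>

struct SketchTimer {
  static constexpr int64_t kUnset = -1;
  int64_t start_ns = kUnset;
  int64_t end_ns = kUnset;

  static int64_t nowNs() {
    return std::chrono::duration_cast<std::chrono::nanoseconds>(
               std::chrono::steady_clock::now().time_since_epoch())
        .count();
  }

  void recordStart() { start_ns = nowNs(); }
  void recordEnd() { end_ns = nowNs(); }

  // Returns nullopt if either event is missing, otherwise the elapsed time.
  std::optional<int64_t> measureDifference() const {
    if (start_ns == kUnset || end_ns == kUnset) {
      return std::nullopt;
    }
    return end_ns - start_ns;
  }
};

int main() {
  SketchTimer t;
  t.recordStart();
  t.recordEnd();
  if (auto ns = t.measureDifference()) {
    std::cout << "elapsed: " << *ns << " ns\n";
  }
}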
TORCH_API void verify_params_across_processes( const c10::intrusive_ptr& process_group, const std::vector& params, - const c10::optional>& logger); + const std::optional>& logger); } // namespace c10d diff --git a/torch/csrc/distributed/c10d/reducer_cuda.cpp b/torch/csrc/distributed/c10d/reducer_cuda.cpp index b63e9d3d6f3c8..84bff02072b60 100644 --- a/torch/csrc/distributed/c10d/reducer_cuda.cpp +++ b/torch/csrc/distributed/c10d/reducer_cuda.cpp @@ -48,7 +48,7 @@ class CudaTimer : public Timer { getEvent(event).record(); } - c10::optional measureDifference(Event start, Event end) override { + std::optional measureDifference(Event start, Event end) override { c10::DeviceGuard g(device); at::cuda::CUDAEvent& start_event = getEvent(start); at::cuda::CUDAEvent& end_event = getEvent(end); diff --git a/torch/csrc/distributed/c10d/reducer_timer.hpp b/torch/csrc/distributed/c10d/reducer_timer.hpp index 5f57051455f62..f9b9f11c8c963 100644 --- a/torch/csrc/distributed/c10d/reducer_timer.hpp +++ b/torch/csrc/distributed/c10d/reducer_timer.hpp @@ -39,12 +39,12 @@ class TORCH_API Timer { // Return the difference between when two events occurred, in nanoseconds. // Or nullopt if one of them hasn't been recorded. - virtual c10::optional measureDifference(Event start, Event end) = 0; + virtual std::optional measureDifference(Event start, Event end) = 0; virtual ~Timer() = default; // Return host-side timestamp, or nullopt if it has not yet been recorded. - c10::optional getTimestamp(Event event) { + std::optional getTimestamp(Event event) { auto time = getTimeRef(event); if (time == kUnsetTime) { return c10::nullopt; diff --git a/torch/csrc/distributed/c10d/sequence_num.hpp b/torch/csrc/distributed/c10d/sequence_num.hpp index 8c80642f42784..ce31f4b552728 100644 --- a/torch/csrc/distributed/c10d/sequence_num.hpp +++ b/torch/csrc/distributed/c10d/sequence_num.hpp @@ -59,7 +59,7 @@ class TORCH_API SequenceNum { SequenceNum(const SequenceNum& other); private: - c10::optional num_; + std::optional num_; mutable std::mutex lock_; }; diff --git a/torch/csrc/distributed/rpc/profiler/remote_profiler_manager.cpp b/torch/csrc/distributed/rpc/profiler/remote_profiler_manager.cpp index 255a16af6bb0d..3a37e7b02a5f0 100644 --- a/torch/csrc/distributed/rpc/profiler/remote_profiler_manager.cpp +++ b/torch/csrc/distributed/rpc/profiler/remote_profiler_manager.cpp @@ -9,7 +9,7 @@ namespace distributed { namespace rpc { const std::string REMOTE_PROFILING_KEY_PREFIX = "#remote_op: "; constexpr int kAutoIncrementBits = 48; -/*static */ thread_local c10::optional +/*static */ thread_local std::optional RemoteProfilerManager::currentThreadLocalKey_ = c10::nullopt; /*static */ RemoteProfilerManager& RemoteProfilerManager::getInstance() { static RemoteProfilerManager* handler = new RemoteProfilerManager(); diff --git a/torch/csrc/distributed/rpc/profiler/remote_profiler_manager.h b/torch/csrc/distributed/rpc/profiler/remote_profiler_manager.h index d85ee5a393078..c6f8b353806b5 100644 --- a/torch/csrc/distributed/rpc/profiler/remote_profiler_manager.h +++ b/torch/csrc/distributed/rpc/profiler/remote_profiler_manager.h @@ -50,7 +50,7 @@ class TORCH_API RemoteProfilerManager { local_id_t getNextLocalId(); std::unordered_map profiledRpcKeys_; - static thread_local c10::optional currentThreadLocalKey_; + static thread_local std::optional currentThreadLocalKey_; std::mutex mutex_; local_id_t currentLocalId_; }; diff --git a/torch/csrc/distributed/rpc/py_rref.h b/torch/csrc/distributed/rpc/py_rref.h index 432141a97cf5c..2c9fd3433d045 
100644 --- a/torch/csrc/distributed/rpc/py_rref.h +++ b/torch/csrc/distributed/rpc/py_rref.h @@ -75,8 +75,8 @@ class PYBIND11_EXPORT PyRRef { private: c10::intrusive_ptr rref_; - c10::optional> profilingFuture_; - c10::optional type_; + std::optional> profilingFuture_; + std::optional type_; }; } // namespace rpc diff --git a/torch/csrc/distributed/rpc/rpc_agent.h b/torch/csrc/distributed/rpc/rpc_agent.h index 0b04c08287087..8f9222a2e8647 100644 --- a/torch/csrc/distributed/rpc/rpc_agent.h +++ b/torch/csrc/distributed/rpc/rpc_agent.h @@ -170,7 +170,7 @@ class TORCH_API RpcAgent { RpcRetryOptions retryOptions = RpcRetryOptions()); // Return a reference to the ``WorkerInfo`` of this RpcAgent. - // NB: not using ``c10::optional`` here because we might + // NB: not using ``std::optional`` here because we might // need to create a separate RPC API lib and avoid forcing all ``RpcAgent`` // implementations to depend on libtorch. const WorkerInfo& getWorkerInfo() const; diff --git a/torch/csrc/distributed/rpc/rref_impl.cpp b/torch/csrc/distributed/rpc/rref_impl.cpp index a770379438901..98d8f1afcb86b 100644 --- a/torch/csrc/distributed/rpc/rref_impl.cpp +++ b/torch/csrc/distributed/rpc/rref_impl.cpp @@ -248,7 +248,7 @@ OwnerRRef::OwnerRRef( worker_id_t ownerId, const RRefId& rrefId, TypePtr type, - c10::optional value, + std::optional value, std::vector devices) : RRef(ownerId, rrefId, type) { future_ = c10::make_intrusive(type_, std::move(devices)); diff --git a/torch/csrc/distributed/rpc/rref_impl.h b/torch/csrc/distributed/rpc/rref_impl.h index ccb00b45e1d5e..d6da3f2ea455f 100644 --- a/torch/csrc/distributed/rpc/rref_impl.h +++ b/torch/csrc/distributed/rpc/rref_impl.h @@ -366,7 +366,7 @@ class TORCH_API OwnerRRef final : public RRef { worker_id_t ownerId, const RRefId& rrefId, TypePtr type, - c10::optional value, + std::optional value, std::vector devices); inline bool isOwner() const override { diff --git a/torch/csrc/distributed/rpc/script_call.h b/torch/csrc/distributed/rpc/script_call.h index 2fc0efb8cdc71..dacded5cc1e62 100644 --- a/torch/csrc/distributed/rpc/script_call.h +++ b/torch/csrc/distributed/rpc/script_call.h @@ -58,10 +58,10 @@ class TORCH_API ScriptCall : public RpcCommandBase { // This field has value if this ScriptCall represents invocation of a builtin // operator. - c10::optional> op_; + std::optional> op_; // This field has non empty string if this ScriptCall represents invocation of // an annotated torchscript function defined by users. 
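// [Editor's illustrative sketch, not part of the diff.] A standalone example
// (hypothetical names, not the ScriptCall API) of the pattern used by the two
// optional members above: exactly one of the optionals is engaged, depending
// on whether the call targets a builtin operator or a user-defined function.
#include <cassert>
#include <iostream>
#include <optional>
#include <string>

struct CallSketch {
  std::optional<std::string> builtin_op_name;  // set for builtin operators
  std::optional<std::string> qualified_name;   // set for user-defined functions

  std::string describe() const {
    // Exactly one of the two fields should hold a value.
    assert(builtin_op_name.has_value() != qualified_name.has_value());
    return builtin_op_name ? "builtin: " + *builtin_op_name
                           : "function: " + *qualified_name;
  }
};

int main() {
  CallSketch c{std::nullopt, std::string("my_module.forward")};
  std::cout << c.describe() << "\n";
}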
- c10::optional qualifiedName_; + std::optional qualifiedName_; std::vector stack_; const bool isAsyncExecution_; }; diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp index 0f0cf00201612..8af4336c07467 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp +++ b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp @@ -111,7 +111,7 @@ std::vector getCurrentStreamsForDevices( std::vector getDevicesOfTensors( const std::vector& tensors) { - c10::optional impl; + std::optional impl; size_t deviceCount = 0; std::vector indexBitset; for (const torch::Tensor& tensor : tensors) { diff --git a/torch/csrc/distributed/rpc/tensorpipe_cuda.cpp b/torch/csrc/distributed/rpc/tensorpipe_cuda.cpp index 968f599752d64..50cc97785f61d 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_cuda.cpp +++ b/torch/csrc/distributed/rpc/tensorpipe_cuda.cpp @@ -74,7 +74,7 @@ C10_REGISTER_CREATOR( class TensorpipeCudaConverter : public TensorpipeDeviceTypeConverter { public: - c10::optional> prepareTensorForSending( + std::optional> prepareTensorForSending( const c10::Storage& storage, const std::vector& streams, tensorpipe::Message& message) const override { diff --git a/torch/csrc/distributed/rpc/tensorpipe_utils.cpp b/torch/csrc/distributed/rpc/tensorpipe_utils.cpp index 0b3715f44f86d..929ae30f8a6d4 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_utils.cpp +++ b/torch/csrc/distributed/rpc/tensorpipe_utils.cpp @@ -38,7 +38,7 @@ inline c10::Device indexToDevice(c10::DeviceIndex index) { class TensorpipeCpuConverter : public TensorpipeDeviceTypeConverter { public: - c10::optional> prepareTensorForSending( + std::optional> prepareTensorForSending( const c10::Storage& storage, const std::vector& /* streams */, tensorpipe::Message& message) const override { @@ -192,7 +192,7 @@ std::tuple tensorpipeSerialize( tensor.device()); TORCH_INTERNAL_ASSERT(tpMessage.tensors.size() == i); - c10::optional> maybeCopiedTensor = + std::optional> maybeCopiedTensor = converter->prepareTensorForSending( tensor.storage(), streams, tpMessage); TORCH_INTERNAL_ASSERT(tpMessage.tensors.size() == i + 1); diff --git a/torch/csrc/distributed/rpc/tensorpipe_utils.h b/torch/csrc/distributed/rpc/tensorpipe_utils.h index 1011a9c34c3d8..d269a5bfbf565 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_utils.h +++ b/torch/csrc/distributed/rpc/tensorpipe_utils.h @@ -27,7 +27,7 @@ class TensorpipeDeviceTypeConverter { // cannot include the TensorPipe headers because it's a private dependency. // Thus we bend over backwards and entrust this method with appending that // object to the `tensors` field of the tensorpipe::Message object we pass. 
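// [Editor's illustrative sketch, not part of the diff.] A standalone example
// (assumed names, not the TensorPipe converter API) of the "maybe copy before
// sending" pattern: the function returns std::nullopt when the buffer can be
// sent in place, or an owned staging copy that must stay alive until the send
// completes.
#include <cstring>
#include <iostream>
#include <optional>
#include <vector>

std::optional<std::vector<char>> prepareForSending(const char* data,
                                                   size_t size,
                                                   bool needs_staging) {
  if (!needs_staging) {
    return std::nullopt;  // caller sends `data` directly
  }
  std::vector<char> staged(size);
  std::memcpy(staged.data(), data, size);
  return staged;  // caller sends the staged copy instead
}

int main() {
  const char payload[] = "hello";
  auto maybe_copy = prepareForSending(payload, sizeof(payload), true);
  std::cout << (maybe_copy ? "staged copy of " : "in-place send of ")
            << sizeof(payload) << " bytes\n";
}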
- virtual c10::optional> prepareTensorForSending( + virtual std::optional> prepareTensorForSending( const c10::Storage& storage, const std::vector& streams, tensorpipe::Message& message) const = 0; diff --git a/torch/csrc/dynamo/compiled_autograd.h b/torch/csrc/dynamo/compiled_autograd.h index a92d6ade0c002..ca2fd412cf8d4 100644 --- a/torch/csrc/dynamo/compiled_autograd.h +++ b/torch/csrc/dynamo/compiled_autograd.h @@ -232,7 +232,7 @@ class CompiledNodeArgs { collect(t.list); } template - void collect(const c10::optional& t) { + void collect(const std::optional& t) { if (cond(t.has_value())) { collect(*t); } @@ -520,20 +520,20 @@ class CompiledNodeArgs { struct TraceState { TraceState( - const std::vector>& ss, + const std::vector>& ss, size_t num_outputs) : sym_sizes(ss), outputs(num_outputs) {} void debug_asserts() { TORCH_INTERNAL_ASSERT(sym_sizes_index == sym_sizes.size()); } - c10::optional next_sym_size() { + std::optional next_sym_size() { TORCH_INTERNAL_ASSERT(sym_sizes_index < sym_sizes.size()); return sym_sizes[sym_sizes_index++]; } size_t sym_sizes_index{0}; - std::vector> sym_sizes; + std::vector> sym_sizes; variable_list outputs; }; @@ -664,13 +664,13 @@ class SwapSavedVariables { } template - void before(c10::optional& t) { + void before(std::optional& t) { if (t.has_value()) { before(*t); } } template - void after(c10::optional& t) { + void after(std::optional& t) { if (t.has_value()) { after(*t); } diff --git a/torch/csrc/dynamo/cpython_defs.c b/torch/csrc/dynamo/cpython_defs.c index bf710b9ff7e9f..c301da9829f50 100644 --- a/torch/csrc/dynamo/cpython_defs.c +++ b/torch/csrc/dynamo/cpython_defs.c @@ -13,6 +13,17 @@ } else { \ } +#if IS_PYTHON_3_13_PLUS +// Gave up after fixing a few of these +// pycore_opcode.h is gone (new is pycore_opcode_metadata.h ?) +// f_code is gone (new is f_executable?) 
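// [Editor's illustrative sketch, not part of the diff.] A standalone example
// (illustrative macro and symbol names, not the real IS_PYTHON_3_13_PLUS
// plumbing) of the version-gating approach taken above: on an unsupported
// interpreter version the real implementation is compiled out and replaced by
// inert stub definitions so the rest of the code still links.
#include <cstdint>
#include <cstdio>

#define SKETCH_PY_3_13_PLUS 1  // assume a 3.13+ build for this example

#if SKETCH_PY_3_13_PLUS
// Stubs: no opcode cache table is available on this version.
const uint8_t* kOpcodeCaches = nullptr;
const int kOpcodeCachesSize = 0;
#else
// The real table would be defined here for older versions.
const uint8_t kRealCaches[] = {1, 2, 3};
const uint8_t* kOpcodeCaches = kRealCaches;
const int kOpcodeCachesSize = sizeof(kRealCaches);
#endif

int main() {
  std::printf("opcode cache entries: %d (table %s)\n",
              kOpcodeCachesSize,
              kOpcodeCaches ? "present" : "absent");
}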
+ +// Fake definitions for what we removed +const uint8_t* THP_PyOpcode_Caches = NULL; +const int THP_PyOpcode_Caches_size = 0; + +#else + // NOTE: all `assert`s below are converted to `CHECK`s #if IS_PYTHON_3_11_PLUS @@ -29,8 +40,8 @@ #define NEED_OPCODE_TABLES // To get _PyOpcode_Deopt, _PyOpcode_Caches #include #undef NEED_OPCODE_TABLES -#undef Py_BUILD_CORE #include +#undef Py_BUILD_CORE // As a simple way to reduce the impact of ABI changes on the CPython side, this check forces // us to manually re-check that the function didn't change on the next major version @@ -677,3 +688,5 @@ const uint8_t* THP_PyOpcode_Caches = NULL; const int THP_PyOpcode_Caches_size = 0; #endif + +#endif // CPython 3.13 \ No newline at end of file diff --git a/torch/csrc/dynamo/cpython_defs.h b/torch/csrc/dynamo/cpython_defs.h index b762f87d69df3..d4432b8bb43d4 100644 --- a/torch/csrc/dynamo/cpython_defs.h +++ b/torch/csrc/dynamo/cpython_defs.h @@ -8,7 +8,9 @@ #if IS_PYTHON_3_11_PLUS +#define Py_BUILD_CORE #include +#undef Py_BUILD_CORE int THP_PyFrame_FastToLocalsWithError( _PyInterpreterFrame* frame, diff --git a/torch/csrc/dynamo/eval_frame.c b/torch/csrc/dynamo/eval_frame.c index b6a26f635ec4c..cbe9ab37a5dd6 100644 --- a/torch/csrc/dynamo/eval_frame.c +++ b/torch/csrc/dynamo/eval_frame.c @@ -8,6 +8,31 @@ #include #include + + +PyObject* guard_error_hook = NULL; +const char* cache_lookup_profiler_str = "TorchDynamo Cache Lookup"; + +static int active_dynamo_threads = 0; + +static Py_tss_t eval_frame_callback_key = Py_tss_NEEDS_INIT; + +inline static PyObject* eval_frame_callback_get(void) { + void* result = PyThread_tss_get(&eval_frame_callback_key); + if (unlikely(result == NULL)) { + return (PyObject*)Py_None; + } else { + return (PyObject*)result; + } +} + +inline static void eval_frame_callback_set(PyObject* obj) { + PyThread_tss_set(&eval_frame_callback_key, obj); +} + +// 3.13 Not supported at all. 
See cpython_defs.c for hints +#if !(IS_PYTHON_3_13_PLUS) + // Problem in CPython includes when mixing core and non-core build // The fix was not backported to 3.12 so this is needed here // https://github.com/python/cpython/issues/105268 @@ -138,24 +163,6 @@ THP_PyFrame_FastToLocalsWithError(THP_EVAL_API_FRAME_OBJECT *frame, int *free_va } #endif -PyObject* guard_error_hook = NULL; -const char* cache_lookup_profiler_str = "TorchDynamo Cache Lookup"; - -static Py_tss_t eval_frame_callback_key = Py_tss_NEEDS_INIT; - -inline static PyObject* eval_frame_callback_get(void) { - void* result = PyThread_tss_get(&eval_frame_callback_key); - if (unlikely(result == NULL)) { - return (PyObject*)Py_None; - } else { - return (PyObject*)result; - } -} - -inline static void eval_frame_callback_set(PyObject* obj) { - PyThread_tss_set(&eval_frame_callback_key, obj); -} - static PyObject* _custom_eval_frame_shim( PyThreadState* tstate, THP_EVAL_API_FRAME_OBJECT* frame, @@ -674,7 +681,29 @@ static PyObject* _custom_eval_frame( } } -static int active_dynamo_threads = 0; +#else // IS_PYTHON_3_13_PLUS + +// Fake definitions for everything we removed + +typedef struct THPPyInterpreterFrame { + PyObject_HEAD + _PyInterpreterFrame* frame; // Borrowed reference +} THPPyInterpreterFrame; + +inline static void enable_eval_frame_shim(PyThreadState* tstate) {} +inline static void enable_eval_frame_default(PyThreadState* tstate) {} + +static struct PyGetSetDef THPPyInterpreterFrame_properties[] = {NULL}; + +static PyTypeObject THPPyInterpreterFrameType = { + PyVarObject_HEAD_INIT(NULL, 0) + .tp_name = "torch._C.dynamo.eval_frame._PyInterpreterFrame", + .tp_basicsize = sizeof(THPPyInterpreterFrame), + .tp_flags = Py_TPFLAGS_DEFAULT, + .tp_getset = THPPyInterpreterFrame_properties, +}; + +#endif // CPython 3.13 static PyObject* increment_working_threads(PyThreadState* tstate) { active_dynamo_threads = active_dynamo_threads + 1; diff --git a/torch/csrc/dynamo/python_compiled_autograd.cpp b/torch/csrc/dynamo/python_compiled_autograd.cpp index dd5ea7cbd094f..fb27b39b28e6a 100644 --- a/torch/csrc/dynamo/python_compiled_autograd.cpp +++ b/torch/csrc/dynamo/python_compiled_autograd.cpp @@ -203,12 +203,12 @@ struct CacheNode { return pyinput; } - std::vector> unwrap_dynamic_inputs( + std::vector> unwrap_dynamic_inputs( PyObject* pyresult) const { TORCH_INTERNAL_ASSERT(PyList_CheckExact(pyresult)); size_t idx = 0; size_t result_len = PyList_GET_SIZE(pyresult); - std::vector> result; + std::vector> result; result.reserve(expected_sizes.size()); for (const auto& i : expected_sizes) { if (i.dyn_type == SizeInput::DYNAMIC) { diff --git a/torch/csrc/functorch/init.cpp b/torch/csrc/functorch/init.cpp index c2996fe5278a7..6bce80ad27766 100644 --- a/torch/csrc/functorch/init.cpp +++ b/torch/csrc/functorch/init.cpp @@ -375,7 +375,7 @@ static int64_t currentLevel() { return current_level; } -static c10::optional maybe_current_level() { +static std::optional maybe_current_level() { auto maybe_layer = maybeCurrentDynamicLayer(); if (maybe_layer.has_value()) { int current_level = maybe_layer->layerId(); @@ -438,7 +438,7 @@ struct PreserveDynamicLayerStack { } // anonymous namespace -static std::tuple> unwrapBatched( +static std::tuple> unwrapBatched( const Tensor& tensor, int64_t level) { auto* batched = maybeGetBatchedImpl(tensor); @@ -534,7 +534,7 @@ void initFuncTorchBindings(PyObject* module) { return maybe_get_level(tensor) != -1; }); m.def( - "get_interpreter_stack", []() -> c10::optional> { + "get_interpreter_stack", []() -> 
std::optional> { const auto& stack = getDynamicLayerStack(); if (stack.empty()) { return c10::nullopt; @@ -545,7 +545,7 @@ void initFuncTorchBindings(PyObject* module) { } return result; }); - m.def("peek_interpreter_stack", []() -> c10::optional { + m.def("peek_interpreter_stack", []() -> std::optional { const auto& stack = getDynamicLayerStack(); if (stack.empty()) { return c10::nullopt; diff --git a/torch/csrc/inductor/aoti_eager/kernel_holder.cpp b/torch/csrc/inductor/aoti_eager/kernel_holder.cpp index 55c0d71c55f4b..238050f501223 100644 --- a/torch/csrc/inductor/aoti_eager/kernel_holder.cpp +++ b/torch/csrc/inductor/aoti_eager/kernel_holder.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -12,6 +13,11 @@ #endif #include +#include +#include +#include +#include + namespace torch::inductor { namespace { @@ -75,8 +81,8 @@ bool unpack_ivalue( // ivalue is scalar unpack_scalar_ivalue(ivalue, device, inputs); } else if ( - *argument.real_type() == *c10::getTypePtr>()) { - // ivalue is c10::optional + *argument.real_type() == *c10::getTypePtr>()) { + // ivalue is std::optional unpack_optional_tensor_ivalue(ivalue, device, inputs); } else { // Unsupport IValue type. @@ -115,14 +121,16 @@ AOTIPythonKernelHolder::AOTIPythonKernelHolder( (device_.type() == c10::DeviceType::CPU) || (device_.type() == c10::DeviceType::CUDA), "Unsupported device type"); + init_aoti_kernel_cache(); } void AOTIPythonKernelHolder::operator()( const c10::OperatorHandle& op, c10::DispatchKeySet keyset, torch::jit::Stack* stack) { - if (cache_lookup(op, keyset, stack)) { - cache_hit(op, keyset, stack); + AOTIKernelState kernel_state; + if (cache_lookup(op, keyset, stack, kernel_state)) { + cache_hit(kernel_state, op, keyset, stack); } else { cache_miss(op, keyset, stack); } @@ -130,23 +138,190 @@ void AOTIPythonKernelHolder::operator()( bool AOTIPythonKernelHolder::cache_lookup( const c10::OperatorHandle& op, - c10::DispatchKeySet keyset, - torch::jit::Stack* stack) { - // TODO: Always return false now to implement cache_miss. Later, we will add - // cache lookup and implement cache hit. 
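// [Editor's illustrative sketch, not part of the diff.] A standalone example
// (hypothetical names, not the AOTIPythonKernelHolder API) of the dispatch
// shape introduced above: cache_lookup() fills an out-parameter state and
// returns true on a hit, so the call operator either runs the cached kernel
// or falls back to a compile-and-cache path.
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>

struct KernelState {
  std::function<int(int)> run;  // stands in for the compiled kernel runner
};

class KernelCacheSketch {
 public:
  bool cache_lookup(const std::string& key, KernelState& state) const {
    auto it = cache_.find(key);
    if (it == cache_.end()) {
      return false;
    }
    state = it->second;
    return true;
  }

  int operator()(const std::string& key, int input) {
    KernelState state;
    if (cache_lookup(key, state)) {
      return cache_hit(state, input);
    }
    return cache_miss(key, input);
  }

 private:
  int cache_hit(const KernelState& state, int input) { return state.run(input); }

  int cache_miss(const std::string& key, int input) {
    // "Compile" a kernel (here: a trivial lambda), store it, then run it.
    KernelState state{[](int x) { return x * 2; }};
    cache_[key] = state;
    return state.run(input);
  }

  std::unordered_map<std::string, KernelState> cache_;
};

int main() {
  KernelCacheSketch holder;
  std::cout << holder("mul2", 21) << "\n";  // miss: compiles, prints 42
  std::cout << holder("mul2", 10) << "\n";  // hit: prints 20
}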
- return false; + const c10::DispatchKeySet& keyset, + const torch::jit::Stack* stack, + AOTIKernelState& kernel_state) { + TORCH_CHECK_NOT_IMPLEMENTED( + op.schema().returns().size() == 1, + "Not implemented for operations that return either multiple values or no value."); + TORCH_CHECK_NOT_IMPLEMENTED( + op.schema().returns()[0].type()->isSubtypeOf(c10::TensorType::get()), + "Not implemented for operations that return a non-Tensor value."); + + std::vector inputs; + auto res = unpack_tensors(op.schema().arguments(), *stack, device_, inputs); + TORCH_CHECK_NOT_IMPLEMENTED( + res && inputs.size() > 0, + "Not implemented for operations that contain a parameter which is ", + "not one of the following types: at::Tensor, at::TensorList, ", + "std::optional, std::vector>."); + + auto inputs_metadata = get_inputs_metadata(inputs); + auto aoti_kernel_state = aoti_kernel_cache_.find(inputs_metadata); + if (aoti_kernel_state == aoti_kernel_cache_.end()) { + return false; + } + + if (aoti_kernel_state->second.tensor_checks_.size() != inputs.size()) { + return false; + } + + torch::dynamo::LocalState local_state; + local_state.overrideDispatchKeySet(c10::DispatchKeySet(dispatch_key_)); + + for (size_t i = 0; i < inputs.size(); ++i) { + bool pass = aoti_kernel_state->second.tensor_checks_[i].check( + local_state, inputs[i]); + if (!pass) { + return false; + } + } + + kernel_state = aoti_kernel_state->second; + return true; } void AOTIPythonKernelHolder::cache_hit( + const AOTIKernelState& kernel_state, const c10::OperatorHandle& op, - c10::DispatchKeySet keyset, + const c10::DispatchKeySet& keyset, torch::jit::Stack* stack) { - TORCH_INTERNAL_ASSERT(false); + std::vector inputs; + unpack_tensors(op.schema().arguments(), *stack, device_, inputs); + torch::jit::drop(*stack, op.schema().arguments().size()); + + auto outputs = kernel_state.kernel_runner_->run(inputs); + for (auto& output : outputs) { + stack->push_back(output); + } +} + +AOTIKernelMetadata AOTIPythonKernelHolder::get_inputs_metadata( + const std::vector& inputs) { + AOTIKernelMetadata inputs_metadata; + for (const auto& input : inputs) { + auto device = input.device(); + if (device.is_cpu()) { + // If the device is CPU, set the device index to -1. + device = c10::Device(device.type(), -1); + } + + inputs_metadata.emplace_back( + false, // is symbloic + input.scalar_type(), + device, + input.sizes().vec(), + input.strides().vec()); + } + return inputs_metadata; +} + +void AOTIPythonKernelHolder::init_aoti_kernel_cache() { + if (device_.type() == c10::DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES) { + return; + } + + py::gil_scoped_acquire gil; + + py::handle load_aoti_eager_cache_function = + py::module::import("torch._inductor.utils").attr("load_aoti_eager_cache"); + TORCH_INTERNAL_ASSERT( + load_aoti_eager_cache_function.ptr() != nullptr, + "Failed to import - torch._inductor.utils.load_aoti_eager_cache"); + + auto result = py::reinterpret_steal(PyObject_CallFunctionObjArgs( + load_aoti_eager_cache_function.ptr(), + py::str(ns_).ptr(), + py::str(op_name_with_overload_).ptr(), + py::str(c10::DeviceTypeName(device_.type(), true)).ptr(), + nullptr)); + TORCH_INTERNAL_ASSERT( + result.ptr() != nullptr && result.ptr() != Py_None, + "Failed to load AOTI kernel. 
Operator Name is ", + op_name_with_overload_); + + auto kernel_info_list = result.cast(); + for (auto kernel_info : kernel_info_list) { + auto item_dict = kernel_info.cast(); + + // Access the kernel_path field + auto kernel_path = item_dict["kernel_path"].cast(); + + // Access the meta_info list + auto inputs_metadata = item_dict["meta_info"].cast(); + + std::vector tensor_checks; + std::vector tensor_metadata_list; + + torch::dynamo::LocalState state; + // Loop over the meta_info list + for (auto item : inputs_metadata) { + // Convert the handle to a dict + auto metadata = item.cast(); + + // Access the fields of each metadata dict + auto is_dynamic = metadata["is_dynamic"].cast(); + auto device_type = metadata["device_type"].cast(); + auto device_index = metadata["device_index"].cast(); + auto data_type_obj = metadata["dtype"].cast(); + TORCH_INTERNAL_ASSERT(THPDtype_Check(data_type_obj.ptr())); + auto data_type = + reinterpret_cast(data_type_obj.ptr())->scalar_type; + auto sizes = metadata["sizes"].cast>(); + auto strides = metadata["strides"].cast>(); + + std::vector> sym_optional_sizes; + std::vector> sym_optional_strides; + for (int64_t size : sizes) { + sym_optional_sizes.push_back(std::optional(size)); + } + for (int64_t stride : strides) { + sym_optional_strides.push_back(std::optional(stride)); + } + + // Now you can use these variables in your code + tensor_metadata_list.emplace_back( + is_dynamic, + data_type, + c10::Device(c10::Device(device_type).type(), device_index), + sizes, + strides); + tensor_checks.emplace_back( + state, + nullptr, + uint64_t(c10::DispatchKeySet(dispatch_key_).raw_repr()), + data_type, + c10::DeviceIndex(device_index), + sym_optional_sizes, + sym_optional_strides); + } + + AOTIKernelState aoti_kernel_state; + aoti_kernel_state.kernel_runner_ = load_aoti_model_runner(kernel_path); + aoti_kernel_state.tensor_checks_ = tensor_checks; + aoti_kernel_cache_[tensor_metadata_list] = aoti_kernel_state; + } +} + +std::shared_ptr AOTIPythonKernelHolder:: + load_aoti_model_runner(const std::string& so_path) { + if (device_.type() == c10::DeviceType::CUDA) { +#ifdef USE_CUDA + return std::make_shared(so_path); +#else + return nullptr; +#endif + } else if (device_.type() == c10::DeviceType::CPU) { + return std::make_shared(so_path); + } else { + TORCH_WARN("Unsupported device type"); + return nullptr; + } } void AOTIPythonKernelHolder::cache_miss( const c10::OperatorHandle& op, - c10::DispatchKeySet keyset, + const c10::DispatchKeySet& keyset, torch::jit::Stack* stack) { auto kernel_lib_path = produce_aoti_kernel_lib(op, keyset, stack); std::shared_ptr kernel = nullptr; @@ -167,41 +342,41 @@ void AOTIPythonKernelHolder::cache_miss( unpack_tensors(op.schema().arguments(), *stack, device_, inputs), "Failed to unpack tensors for the stack to run the AOTI kernel."); auto outputs = kernel->run(inputs); - if (outputs.size() > 0) { - torch::jit::drop(*stack, op.schema().arguments().size()); - // TODO: Get the output type of this operation and then convert to the - // output type. - for (auto& output : outputs) { - torch::jit::push(*stack, std::move(output)); - } + torch::jit::drop(*stack, op.schema().arguments().size()); + // TODO: Get the output type of this operation and then convert to the + // output type. 
+ for (auto& output : outputs) { + torch::jit::push(*stack, std::move(output)); } } std::string AOTIPythonKernelHolder::produce_aoti_kernel_lib( const c10::OperatorHandle& op, - c10::DispatchKeySet keyset, - torch::jit::Stack* stack) { + const c10::DispatchKeySet& keyset, + const torch::jit::Stack* stack) { auto arguments = torch::jit::last(*stack, op.schema().arguments().size()); - py::gil_scoped_acquire gil; - - // Get the corresponding python operation for the current operator and the - // python operation will pass to the AOT Inductor to generate the kernel - // library. const auto& schema = op.schema(); const auto& qualified_name = op.operator_name().name; const auto& overload_name = schema.overload_name().empty() ? "default" : schema.overload_name(); auto pos = qualified_name.find("::"); TORCH_INTERNAL_ASSERT(pos != std::string::npos, qualified_name); - // Make me some null terminated strings - std::string ns_str = qualified_name.substr(0, pos); - const char* ns = ns_str.c_str(); - const char* func_name = qualified_name.c_str() + pos + strlen("::"); + std::string ns_str(qualified_name.begin(), qualified_name.begin() + pos); + std::string func_name( + qualified_name.begin() + pos + strlen("::"), qualified_name.end()); + + py::gil_scoped_acquire gil; py::handle op_py_func = op.getPythonOp(pyinterpreter_, [&]() -> PyObject* { - py::handle torch_api_function = - py::module::import("torch").attr("ops").attr(ns).attr(func_name); - return torch_api_function.attr(overload_name.c_str()).ptr(); + py::handle torch_api_function = py::module::import("torch") + .attr("ops") + .attr(ns_str.c_str()) + .attr(func_name.c_str()); + if (overload_name.empty()) { + return torch_api_function.attr("default").ptr(); + } else { + return torch_api_function.attr(overload_name.c_str()).ptr(); + } }); TORCH_INTERNAL_ASSERT( @@ -212,17 +387,22 @@ std::string AOTIPythonKernelHolder::produce_aoti_kernel_lib( overload_name); py::handle aot_compile_function = - py::module::import("torch._export").attr("aot_compile"); + py::module::import("torch._inductor.utils") + .attr("aoti_compile_with_persistent_cache"); TORCH_INTERNAL_ASSERT( aot_compile_function.ptr() != nullptr && aot_compile_function.ptr() != Py_None, - "Failed to import - torch._export.aot_compile"); + "Failed to import - torch._inductor.utils.aoti_compile_with_persistent_cache"); // Pass the python operation to the AOT Inductor to generate the kernel // library. auto args_kwargs = parseIValuesToPyArgsKwargs(op, arguments.vec()); auto result = py::reinterpret_steal(PyObject_CallFunctionObjArgs( aot_compile_function.ptr(), + py::str(ns_str).ptr(), + py::str(op_name_with_overload_).ptr(), + py::str(c10::DeviceTypeName(device_.type(), true)).ptr(), + py::bool_(false).ptr(), op_py_func.ptr(), args_kwargs.first.ptr(), args_kwargs.second.ptr(), diff --git a/torch/csrc/inductor/aoti_eager/kernel_holder.h b/torch/csrc/inductor/aoti_eager/kernel_holder.h index f7a886eb266bd..9cbcc217d7c30 100644 --- a/torch/csrc/inductor/aoti_eager/kernel_holder.h +++ b/torch/csrc/inductor/aoti_eager/kernel_holder.h @@ -4,6 +4,8 @@ #include #include +#include +#include #include #include @@ -11,6 +13,11 @@ namespace torch::inductor { +struct AOTIKernelState { + std::shared_ptr kernel_runner_; + std::vector tensor_checks_; +}; + // The AOTIPythonKernelHolder class uses the AOT Inductor to generate a kernel // for a specified operation. To speed up this process, the generated kernel // library is cached on disk. 
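// [Editor's illustrative sketch, not part of the diff.] A standalone example
// (hypothetical types) of the cache layout added above: the key is a vector of
// per-tensor metadata and the unordered_map is declared with an explicit hash
// functor, mirroring the AOTIKernelMetadata/AOTIKernelMetadataHash pairing.
#include <cstddef>
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

struct MetaSketch {
  int dtype;
  std::vector<long long> sizes;
  bool operator==(const MetaSketch& o) const {
    return dtype == o.dtype && sizes == o.sizes;
  }
};

struct MetaVecHash {
  size_t operator()(const std::vector<MetaSketch>& key) const {
    size_t seed = 0;
    for (const auto& m : key) {
      // boost-style hash_combine; the diff itself uses c10::hash_combine.
      seed ^= std::hash<int>()(m.dtype) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
      for (long long s : m.sizes) {
        seed ^= std::hash<long long>()(s) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
      }
    }
    return seed;
  }
};

int main() {
  std::unordered_map<std::vector<MetaSketch>, std::string, MetaVecHash> cache;
  cache[{MetaSketch{6, {3, 4}}}] = "kernel_a.so";
  std::cout << cache.at({MetaSketch{6, {3, 4}}}) << "\n";
}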
Detailed information from the input tensors is @@ -31,6 +38,10 @@ class AOTIPythonKernelHolder : public c10::OperatorKernel { // op_overload_name. c10::impl::PyInterpreter* pyinterpreter_; + std:: + unordered_map + aoti_kernel_cache_; + public: AOTIPythonKernelHolder( c10::DispatchKey dispatch_key, @@ -45,20 +56,36 @@ class AOTIPythonKernelHolder : public c10::OperatorKernel { private: bool cache_lookup( const c10::OperatorHandle& op, - c10::DispatchKeySet keyset, - torch::jit::Stack* stack); + const c10::DispatchKeySet& keyset, + const torch::jit::Stack* stack, + AOTIKernelState& kernel_state); void cache_miss( const c10::OperatorHandle& op, - c10::DispatchKeySet keyset, + const c10::DispatchKeySet& keyset, torch::jit::Stack* stack); void cache_hit( + const AOTIKernelState& kernel_state, const c10::OperatorHandle& op, - c10::DispatchKeySet keyset, + const c10::DispatchKeySet& keyset, torch::jit::Stack* stack); + // Invoke python utility function on the Inductor side to produce AOTI kernel + // for the given operation. + // Inductor utility function - + // torch._inductor.utils.aoti_compile_with_persistent_cache std::string produce_aoti_kernel_lib( const c10::OperatorHandle& op, - c10::DispatchKeySet keyset, - torch::jit::Stack* stack); + const c10::DispatchKeySet& keyset, + const torch::jit::Stack* stack); + // Invoke python utility function on the Inductor side to load AOTI kernel for + // the given operation. + // Inductor utility function - torch._inductor.utils.load_aoti_eager_cache + void init_aoti_kernel_cache(); + // Abstract the meta information of each tensor for the given operation. The + // meta infomation will be used for cache lookup as the key. + AOTIKernelMetadata get_inputs_metadata(const std::vector&); + // Load the AOTIModelContainerRunner object from the given file path. 
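// [Editor's illustrative sketch, not part of the diff.] A standalone example
// (invented class names) of the runner-loading pattern sketched by
// load_aoti_model_runner above: the implementation is chosen per device at
// runtime, and the CUDA branch exists only when a build flag is defined.
#include <iostream>
#include <memory>
#include <string>

struct RunnerSketch {
  virtual ~RunnerSketch() = default;
  virtual std::string describe() const = 0;
};

struct CpuRunnerSketch : RunnerSketch {
  explicit CpuRunnerSketch(std::string path) : path_(std::move(path)) {}
  std::string describe() const override { return "cpu:" + path_; }
  std::string path_;
};

#ifdef SKETCH_USE_CUDA
struct CudaRunnerSketch : RunnerSketch {
  explicit CudaRunnerSketch(std::string path) : path_(std::move(path)) {}
  std::string describe() const override { return "cuda:" + path_; }
  std::string path_;
};
#endif

std::shared_ptr<RunnerSketch> load_runner(const std::string& so_path,
                                          bool is_cuda) {
  if (is_cuda) {
#ifdef SKETCH_USE_CUDA
    return std::make_shared<CudaRunnerSketch>(so_path);
#else
    return nullptr;  // CUDA support not compiled in
#endif
  }
  return std::make_shared<CpuRunnerSketch>(so_path);
}

int main() {
  auto runner = load_runner("/tmp/kernel.so", /*is_cuda=*/false);
  std::cout << (runner ? runner->describe() : "unavailable") << "\n";
}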
+ std::shared_ptr load_aoti_model_runner( + const std::string&); }; } // namespace torch::inductor diff --git a/torch/csrc/inductor/aoti_eager/kernel_meta_info.cpp b/torch/csrc/inductor/aoti_eager/kernel_meta_info.cpp new file mode 100644 index 0000000000000..e89c59142328f --- /dev/null +++ b/torch/csrc/inductor/aoti_eager/kernel_meta_info.cpp @@ -0,0 +1,64 @@ +#if !defined(C10_MOBILE) && !defined(ANDROID) +#include + +namespace torch::inductor { + +TensorMetadata::TensorMetadata(const at::Tensor& src_tensor) + : is_symbolic_(false), + device_(src_tensor.device()), + sizes_(src_tensor.sizes().vec()), + strides_(src_tensor.sizes().vec()) {} + +TensorMetadata::TensorMetadata( + bool is_symbolic, + c10::ScalarType dtype, + c10::Device device, + std::vector sizes, + std::vector strides) + : is_symbolic_(is_symbolic), + dtype_(dtype), + device_(device), + sizes_(sizes), + strides_(strides) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + !is_symbolic_, "Not support symbolic shape now"); +} + +bool TensorMetadata::operator==(const TensorMetadata& other) const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + !is_symbolic_, "Not support symbolic shape now"); + return this->is_symbolic_ == other.is_symbolic_ && + this->dtype_ == other.dtype_ && + this->device_.type() == other.device_.type() && + this->sizes_ == other.sizes_ && this->strides_ == other.strides_; +} + +size_t TensorMetadataHash::operator()( + const TensorMetadata& tensor_metadata) const { + auto hash = std::hash()(tensor_metadata.is_symbolic_); + hash = c10::hash_combine( + hash, std::hash()(tensor_metadata.dtype_)); + hash = c10::hash_combine( + hash, std::hash()(tensor_metadata.device_.type())); + + for (auto& e : tensor_metadata.sizes_) { + hash = c10::hash_combine(hash, std::hash()(e)); + } + + for (auto& e : tensor_metadata.strides_) { + hash = c10::hash_combine(hash, std::hash()(e)); + } + return hash; +} + +size_t AOTIKernelMetadataHash::operator()( + const AOTIKernelMetadata& aoti_kernel_metadata) const { + size_t hash = 0; + for (auto& e : aoti_kernel_metadata) { + hash = c10::hash_combine(hash, TensorMetadataHash()(e)); + } + return hash; +} + +} // namespace torch::inductor +#endif diff --git a/torch/csrc/inductor/aoti_eager/kernel_meta_info.h b/torch/csrc/inductor/aoti_eager/kernel_meta_info.h new file mode 100644 index 0000000000000..c7f8315d2707a --- /dev/null +++ b/torch/csrc/inductor/aoti_eager/kernel_meta_info.h @@ -0,0 +1,67 @@ +#if !defined(C10_MOBILE) && !defined(ANDROID) +#pragma once + +#include +#include + +#include + +namespace torch::inductor { + +// Regarding a aten operation implemented by AOTI, the metadata of the input +// tensors will be cached on the disk to acclerate next run. TensorMetada +// structure is to represent the metadata of each input tensor. it includes +// whether the tensor is symbolic, the dtype, the device, the sizes and the +// strides of the tensor. When the metadata of the input tensors is the same as +// the cached metadata, the cached kernel library will be loaded and executed. +// Otherwise, the AOT Inductor will be called again to generate the kernel +// library. +// Beyond the TensorMetadata, we build guard/TensorCheck for each input tensor +// as well to support symbolic shape. We intend to utilize TensorCheck to find +// out the proper kernel rather than TensorMetada comparison. 
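// [Editor's illustrative sketch, not part of the diff.] A standalone example
// (hypothetical helper, not the dynamo TensorCheck API) of the size-guard idea
// described in the comment above: cached sizes are held as
// std::optional<int64_t>, where std::nullopt marks a dynamic dimension that
// matches any concrete size.
#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

bool sizes_match(const std::vector<std::optional<int64_t>>& cached,
                 const std::vector<int64_t>& actual) {
  if (cached.size() != actual.size()) {
    return false;
  }
  for (size_t i = 0; i < cached.size(); ++i) {
    if (cached[i].has_value() && *cached[i] != actual[i]) {
      return false;  // static dimension mismatch
    }
  }
  return true;  // every dimension is either dynamic or equal
}

int main() {
  std::vector<std::optional<int64_t>> cached = {3, std::nullopt, 5};
  std::cout << sizes_match(cached, {3, 4, 5}) << " "
            << sizes_match(cached, {2, 4, 5}) << "\n";  // prints 1 0
}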
Suppose an +// operation with a single input tensor and two kernels: +// kernel1: TensorMetadata(is_symbolic=false, dtype=Float, device=CPU, +// sizes=[s0, s1, s2], strides=[s1 * s2, s2, 1]) kernel2: +// TensorMetadata(is_symbolic=false, dtype=Float, device=CPU, sizes=[3, s1, +// s2], strides=[s1 * s2, s2, 1]) +// If a tensor with sizes=[3, 4, 5] is passed to the operation, both kernel1 and +// kernel2 support the tensor shape. In this case, we need to use TensorCheck +// plus some heruistic rules to find out the proper kernel. +struct TensorMetadata { + // Indicate whether the tensor is symbolic and it may be concluded by sizes_ + // and strides_ in the future. + bool is_symbolic_; + // Dtype of a tensor(For scalar, we will wrap it as a scalar tensor) + c10::ScalarType dtype_; + // Device of a tensor. + c10::Device device_; + // Sizes of a tensor. Currently, we only support static shape and use int64_t + // to represent the sizes. In the future, we will create symbolic size and use + // SymInt to represent it to support symbolic shape. + std::vector sizes_; + // Strides of a tensor. For symbolic shape support, it is the same as sizes_ + std::vector strides_; + + TensorMetadata(const at::Tensor& src_tensor); + TensorMetadata( + bool is_symbolic, + c10::ScalarType dtype, + c10::Device device, + std::vector sizes, + std::vector strides); + + bool operator==(const TensorMetadata& other) const; +}; + +struct TensorMetadataHash { + size_t operator()(const TensorMetadata&) const; +}; + +using AOTIKernelMetadata = std::vector; + +struct AOTIKernelMetadataHash { + size_t operator()(const AOTIKernelMetadata&) const; +}; + +} // namespace torch::inductor +#endif diff --git a/torch/csrc/inductor/aoti_torch/generated/c_shim_cpu.h b/torch/csrc/inductor/aoti_torch/generated/c_shim_cpu.h index f31c52408aa77..8058618f97486 100644 --- a/torch/csrc/inductor/aoti_torch/generated/c_shim_cpu.h +++ b/torch/csrc/inductor/aoti_torch/generated/c_shim_cpu.h @@ -67,6 +67,8 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_geqrf(AtenTensorHandle self, Ate AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_grid_sampler_2d_backward(AtenTensorHandle grad_output, AtenTensorHandle input, AtenTensorHandle grid, int64_t interpolation_mode, int64_t padding_mode, int32_t align_corners, const int32_t* output_mask, int64_t output_mask_len_, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_histc(AtenTensorHandle self, int64_t bins, double min, double max, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_histogram_bin_ct(AtenTensorHandle self, int64_t bins, const double** range, int64_t range_len_, AtenTensorHandle* weight, int32_t density, AtenTensorHandle* ret0, AtenTensorHandle* ret1); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_index_Tensor(AtenTensorHandle self, const AtenTensorHandle** indices, int64_t indices_len_, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_index_put(AtenTensorHandle self, const AtenTensorHandle** indices, int64_t indices_len_, AtenTensorHandle values, int32_t accumulate, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_index_reduce(AtenTensorHandle self, int64_t dim, AtenTensorHandle index, AtenTensorHandle source, const char* reduce, int32_t include_self, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_kthvalue(AtenTensorHandle self, int64_t k, int64_t dim, int32_t keepdim, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT 
AOTITorchError aoti_torch_cpu_logcumsumexp(AtenTensorHandle self, int64_t dim, AtenTensorHandle* ret0); @@ -83,11 +85,17 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_median(AtenTensorHandle self, At AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mm_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle mat2); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mode(AtenTensorHandle self, int64_t dim, int32_t keepdim, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mul_Scalar(AtenTensorHandle self, double other, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mul_Tensor(AtenTensorHandle self, AtenTensorHandle other, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_nanmedian(AtenTensorHandle self, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_native_dropout(AtenTensorHandle input, double p, int32_t* train, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_nonzero(AtenTensorHandle self, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_ormqr(AtenTensorHandle self, AtenTensorHandle input2, AtenTensorHandle input3, int32_t left, int32_t transpose, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_pow_Scalar(double self, AtenTensorHandle exponent, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_pow_Tensor_Scalar(AtenTensorHandle self, double exponent, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_pow_Tensor_Tensor(AtenTensorHandle self, AtenTensorHandle exponent, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_rand(const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_randint(int64_t high, const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_randint_low_out(AtenTensorHandle out, int64_t low, int64_t high, const int64_t* size, int64_t size_len_); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_randn(const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_randperm(int64_t n, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_repeat_interleave_Tensor(AtenTensorHandle repeats, int64_t* output_size, AtenTensorHandle* ret0); @@ -96,10 +104,13 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_replication_pad2d_backward(AtenT AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_resize_(AtenTensorHandle self, const int64_t* size, int64_t size_len_, int32_t* memory_format, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_resize_as_(AtenTensorHandle self, AtenTensorHandle the_template, int32_t* memory_format, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_scatter_src_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, AtenTensorHandle index, AtenTensorHandle src); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_scatter_value_out(AtenTensorHandle out, AtenTensorHandle self, 
int64_t dim, AtenTensorHandle index, double value); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_scatter_reduce_two_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, AtenTensorHandle index, AtenTensorHandle src, const char* reduce, int32_t include_self); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_segment_reduce(AtenTensorHandle data, const char* reduce, AtenTensorHandle* lengths, AtenTensorHandle* indices, AtenTensorHandle* offsets, int64_t axis, int32_t unsafe, double* initial, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_slice_Tensor(AtenTensorHandle self, int64_t dim, int64_t* start, int64_t* end, int64_t step, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_soft_margin_loss_backward(AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle target, int64_t reduction, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_sort(AtenTensorHandle self, int64_t dim, int32_t descending, AtenTensorHandle* ret0, AtenTensorHandle* ret1); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_sort_stable(AtenTensorHandle self, int32_t* stable, int64_t dim, int32_t descending, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_topk(AtenTensorHandle self, int64_t k, int64_t dim, int32_t largest, int32_t sorted, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_triangular_solve(AtenTensorHandle self, AtenTensorHandle A, int32_t upper, int32_t transpose, int32_t unitriangular, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_upsample_bicubic2d_backward(AtenTensorHandle grad_output, const int64_t* output_size, int64_t output_size_len_, const int64_t* input_size, int64_t input_size_len_, int32_t align_corners, double* scales_h, double* scales_w, AtenTensorHandle* ret0); diff --git a/torch/csrc/inductor/aoti_torch/generated/c_shim_cuda.h b/torch/csrc/inductor/aoti_torch/generated/c_shim_cuda.h index 37e45a7030a56..1382be18573f0 100644 --- a/torch/csrc/inductor/aoti_torch/generated/c_shim_cuda.h +++ b/torch/csrc/inductor/aoti_torch/generated/c_shim_cuda.h @@ -74,6 +74,8 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_gcd(AtenTensorHandle self, Aten AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_geqrf(AtenTensorHandle self, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_grid_sampler_2d_backward(AtenTensorHandle grad_output, AtenTensorHandle input, AtenTensorHandle grid, int64_t interpolation_mode, int64_t padding_mode, int32_t align_corners, const int32_t* output_mask, int64_t output_mask_len_, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_histc(AtenTensorHandle self, int64_t bins, double min, double max, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_index_Tensor(AtenTensorHandle self, const AtenTensorHandle** indices, int64_t indices_len_, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_index_put(AtenTensorHandle self, const AtenTensorHandle** indices, int64_t indices_len_, AtenTensorHandle values, int32_t accumulate, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_index_reduce(AtenTensorHandle self, int64_t dim, AtenTensorHandle index, AtenTensorHandle source, const char* reduce, int32_t include_self, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError 
aoti_torch_cuda_kthvalue(AtenTensorHandle self, int64_t k, int64_t dim, int32_t keepdim, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_logcumsumexp(AtenTensorHandle self, int64_t dim, AtenTensorHandle* ret0); @@ -90,11 +92,17 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_median(AtenTensorHandle self, A AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_mm_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle mat2); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_mode(AtenTensorHandle self, int64_t dim, int32_t keepdim, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_mul_Scalar(AtenTensorHandle self, double other, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_mul_Tensor(AtenTensorHandle self, AtenTensorHandle other, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_nanmedian(AtenTensorHandle self, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_native_dropout(AtenTensorHandle input, double p, int32_t* train, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_nonzero(AtenTensorHandle self, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_ormqr(AtenTensorHandle self, AtenTensorHandle input2, AtenTensorHandle input3, int32_t left, int32_t transpose, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_pow_Scalar(double self, AtenTensorHandle exponent, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_pow_Tensor_Scalar(AtenTensorHandle self, double exponent, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_pow_Tensor_Tensor(AtenTensorHandle self, AtenTensorHandle exponent, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_rand(const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_randint(int64_t high, const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_randint_low_out(AtenTensorHandle out, int64_t low, int64_t high, const int64_t* size, int64_t size_len_); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_randn(const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_randperm(int64_t n, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_repeat_interleave_Tensor(AtenTensorHandle repeats, int64_t* output_size, AtenTensorHandle* ret0); @@ -103,10 +111,13 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_replication_pad2d_backward(Aten AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_resize_(AtenTensorHandle self, const int64_t* size, int64_t size_len_, int32_t* memory_format, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_resize_as_(AtenTensorHandle self, AtenTensorHandle the_template, int32_t* memory_format, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_scatter_src_out(AtenTensorHandle out, AtenTensorHandle self, 
int64_t dim, AtenTensorHandle index, AtenTensorHandle src); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_scatter_value_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, AtenTensorHandle index, double value); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_scatter_reduce_two_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, AtenTensorHandle index, AtenTensorHandle src, const char* reduce, int32_t include_self); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_segment_reduce(AtenTensorHandle data, const char* reduce, AtenTensorHandle* lengths, AtenTensorHandle* indices, AtenTensorHandle* offsets, int64_t axis, int32_t unsafe, double* initial, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_slice_Tensor(AtenTensorHandle self, int64_t dim, int64_t* start, int64_t* end, int64_t step, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_soft_margin_loss_backward(AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle target, int64_t reduction, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_sort(AtenTensorHandle self, int64_t dim, int32_t descending, AtenTensorHandle* ret0, AtenTensorHandle* ret1); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_sort_stable(AtenTensorHandle self, int32_t* stable, int64_t dim, int32_t descending, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_topk(AtenTensorHandle self, int64_t k, int64_t dim, int32_t largest, int32_t sorted, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_triangular_solve(AtenTensorHandle self, AtenTensorHandle A, int32_t upper, int32_t transpose, int32_t unitriangular, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_upsample_bicubic2d_backward(AtenTensorHandle grad_output, const int64_t* output_size, int64_t output_size_len_, const int64_t* input_size, int64_t input_size_len_, int32_t align_corners, double* scales_h, double* scales_w, AtenTensorHandle* ret0); diff --git a/torch/csrc/inductor/aoti_torch/shim_common.cpp b/torch/csrc/inductor/aoti_torch/shim_common.cpp index bd45a4a9f0f87..79cea0cb45ec8 100644 --- a/torch/csrc/inductor/aoti_torch/shim_common.cpp +++ b/torch/csrc/inductor/aoti_torch/shim_common.cpp @@ -775,7 +775,7 @@ AOTITorchError aoti_torch_index_put_out( const AtenTensorHandle values, bool accumulate) { AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({ - c10::List> indices_; + c10::List> indices_; indices_.reserve(num_indices); for (size_t i = 0; i < num_indices; i++) { indices_.emplace_back( diff --git a/torch/csrc/inductor/aoti_torch/utils.h b/torch/csrc/inductor/aoti_torch/utils.h index a0739afabd5ee..0964479caabd8 100644 --- a/torch/csrc/inductor/aoti_torch/utils.h +++ b/torch/csrc/inductor/aoti_torch/utils.h @@ -39,29 +39,29 @@ inline AtenTensorHandle new_tensor_handle(at::Tensor&& tensor) { // utility functions to convert a pointer to an optional value template -inline c10::optional pointer_to_optional(T* ptr) { +inline std::optional pointer_to_optional(T* ptr) { return ptr ? c10::make_optional(*ptr) : c10::nullopt; } template >> -inline c10::optional pointer_to_optional(U* ptr) { +inline std::optional pointer_to_optional(U* ptr) { return ptr ? c10::make_optional(T(*ptr)) : c10::nullopt; } template <> -inline c10::optional pointer_to_optional(AtenTensorHandle* ptr) { +inline std::optional pointer_to_optional(AtenTensorHandle* ptr) { return ptr ? 
c10::make_optional(*tensor_handle_to_tensor_pointer(*ptr)) : c10::nullopt; } template <> -inline c10::optional pointer_to_optional( +inline std::optional pointer_to_optional( const AtenTensorHandle* ptr) { return ptr ? c10::make_optional(*tensor_handle_to_tensor_pointer(*ptr)) : c10::nullopt; } -inline c10::optional pointer_to_optional_device( +inline std::optional pointer_to_optional_device( int32_t* device_type, int32_t device_index) { return device_type ? c10::make_optional(c10::Device( @@ -74,7 +74,7 @@ inline c10::optional pointer_to_optional_device( template struct is_optional : std::false_type {}; template -struct is_optional> : std::true_type {}; +struct is_optional> : std::true_type {}; template inline c10::ArrayRef pointer_to_list(T* ptr, int64_t len) { @@ -123,10 +123,10 @@ inline std::vector pointer_to_list( } template <> -inline std::vector> pointer_to_list( +inline std::vector> pointer_to_list( const AtenTensorHandle** ptr, int64_t len) { - std::vector> result; + std::vector> result; result.reserve(len); for (int64_t i = 0; i < len; i++) { result.emplace_back(pointer_to_optional(ptr[i])); @@ -143,7 +143,7 @@ inline std::array pointer_to_list(const int32_t* ptr) { // Utility function to convert a pointer to an optional list of values template -inline c10::optional> pointer_to_optional_list( +inline std::optional> pointer_to_optional_list( U** ptr, int64_t len) { return ptr diff --git a/torch/csrc/jit/api/compilation_unit.h b/torch/csrc/jit/api/compilation_unit.h index 6203905732667..8e28ef4717b93 100644 --- a/torch/csrc/jit/api/compilation_unit.h +++ b/torch/csrc/jit/api/compilation_unit.h @@ -86,7 +86,7 @@ struct TORCH_API CompilationUnit { // for historic reasons, these are defined in ir_emitter.cpp // Returns the list of Functions just defined. std::vector define( - const c10::optional& prefix, + const std::optional& prefix, const std::vector& properties, const std::vector& propResolvers, const std::vector& definitions, @@ -97,10 +97,10 @@ struct TORCH_API CompilationUnit { const Self* self, // see [name mangling] bool shouldMangle = false, - c10::optional operator_set_version = c10::nullopt); + std::optional operator_set_version = c10::nullopt); void define_hooks( - const c10::optional& prefix, + const std::optional& prefix, const std::vector& hookDefs, const std::vector& hookResolvers, const std::vector& preHookDefs, @@ -112,7 +112,7 @@ struct TORCH_API CompilationUnit { // Returns the list of Functions just defined. std::vector define( // prefix namespace to put all the defined functions into - const c10::optional& prefix, + const std::optional& prefix, const std::string& source, const ResolverPtr& resolver, const Self* self); @@ -286,19 +286,19 @@ struct TORCH_API CompilationUnit { private: std::unique_ptr define( - const c10::optional& prefix, + const std::optional& prefix, const Def& def, const ResolverPtr& resolver, const Self* self, const std::unordered_map& function_table, bool shouldMangle = false, FunctionType type = FunctionType::Method, - c10::optional version = c10::nullopt) const; + std::optional version = c10::nullopt) const; // Define a property on \p self. 
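// [Editor's illustrative sketch, not part of the diff.] A standalone example
// (invented helper name and separator) of the optional-prefix pattern in
// CompilationUnit::define above: when a namespace prefix is supplied the name
// is qualified with it, and when it is std::nullopt the name is used as-is.
#include <iostream>
#include <optional>
#include <string>

std::string qualify(const std::optional<std::string>& prefix,
                    const std::string& name) {
  return prefix ? *prefix + "." + name : name;
}

int main() {
  std::cout << qualify(std::string("mymodule"), "forward") << "\n";  // mymodule.forward
  std::cout << qualify(std::nullopt, "forward") << "\n";             // forward
}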
struct PropertyPair; PropertyPair define_property( - const c10::optional& prefix, + const std::optional& prefix, const Property& prop, const ResolverPtr& resolver, const Self* self, diff --git a/torch/csrc/jit/api/function_impl.h b/torch/csrc/jit/api/function_impl.h index 74663cfb41ce7..6ed8cb36199ef 100644 --- a/torch/csrc/jit/api/function_impl.h +++ b/torch/csrc/jit/api/function_impl.h @@ -12,7 +12,7 @@ struct TORCH_API GraphFunction : public Function { c10::QualifiedName name, std::shared_ptr graph, std::function function_creator, - c10::optional executor_execution_mode = + std::optional executor_execution_mode = c10::nullopt) : name_(std::move(name)), graph_(std::move(graph)), @@ -108,7 +108,7 @@ struct TORCH_API GraphFunction : public Function { using Function::call; bool call( Stack& stack, - c10::optional bailOut, + std::optional bailOut, c10::function_ref f) override { f(get_executor().getPlanFor(stack, bailOut).code); return true; @@ -139,7 +139,7 @@ struct TORCH_API GraphFunction : public Function { // allows users to specify Simple/Profiling Executor for function // TODO: add more executors - mutable c10::optional executor_execution_mode_; + mutable std::optional executor_execution_mode_; // if invoked on a graph that has already traced through amp // don't invoke amp pass @@ -159,7 +159,7 @@ struct TORCH_API GraphFunction : public Function { // executor_[1] - autocast cpu on // executor_[2] - autocast gpu on // executor_[3] - autocast cpu & gpu on - std::array, SpecializationKey::TotalCount> + std::array, SpecializationKey::TotalCount> executors_; // an optional function that actually creates the method when diff --git a/torch/csrc/jit/api/module.cpp b/torch/csrc/jit/api/module.cpp index e32d2bba34501..1b9932ed34d4d 100644 --- a/torch/csrc/jit/api/module.cpp +++ b/torch/csrc/jit/api/module.cpp @@ -167,8 +167,8 @@ void Module::to(at::Device device, bool non_blocking) { static void module_state_to( const autograd::Variable& variable, - const c10::optional& device, - const c10::optional& dtype, + const std::optional& device, + const std::optional& dtype, bool non_blocking) { // Need to access the `at::Tensor` as a `Variable` here. // Use the data's original device or dtype if not supplied here. 
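The hunks above swap the c10::optional spelling for std::optional in signatures while leaving c10::nullopt and c10::make_optional call sites in the bodies untouched. That only works if c10::optional is an alias of std::optional, which appears to be the premise of this migration. A minimal, self-contained sketch of that arrangement, with the c10 namespace below as an illustrative stand-in rather than the real header:

#include <iostream>
#include <optional>
#include <string>

// Illustrative stand-in for the real c10 headers: optional, nullopt and
// make_optional are plain aliases of the std facilities. This mirrors the
// arrangement the patch relies on; it is not the actual c10 implementation.
namespace c10 {
using std::make_optional;
using std::nullopt;
template <typename T>
using optional = std::optional<T>;
} // namespace c10

// Signature already migrated to std::optional<std::string>...
std::optional<std::string> find_name(bool present) {
  // ...while the body still uses the c10 spellings, just as the patched
  // files do; under the alias both resolve to the same type.
  return present ? c10::make_optional<std::string>("forward") : c10::nullopt;
}

int main() {
  std::cout << find_name(true).value_or("<none>") << '\n';   // forward
  std::cout << find_name(false).value_or("<none>") << '\n';  // <none>
}

Because the alias makes both spellings the same type, headers can be migrated file by file without breaking callers that still use the c10 names.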
@@ -180,8 +180,8 @@ static void module_state_to( } void Module::to_impl( - const c10::optional& device, - const c10::optional& dtype, + const std::optional& device, + const std::optional& dtype, bool non_blocking) { for (at::Tensor e : parameters()) { module_state_to(e, device, dtype, non_blocking); @@ -317,7 +317,7 @@ Module Module::copy() const { return Module(_ivalue()->copy()); } -Module Module::deepcopy(c10::optional device) const { +Module Module::deepcopy(std::optional device) const { return Module(_ivalue()->deepcopy(device)); } @@ -476,7 +476,7 @@ IValue Module::create_class(const c10::QualifiedName& name, Stack stack) const { Module freeze( const Module& module, - const c10::optional>& preserved_attrs, + const std::optional>& preserved_attrs, bool optimize_numerics) { TORCH_CHECK( !module.hasattr("training") || !module.is_training(), diff --git a/torch/csrc/jit/api/module.h b/torch/csrc/jit/api/module.h index 6c49b695cb6b5..0787210a4aefe 100644 --- a/torch/csrc/jit/api/module.h +++ b/torch/csrc/jit/api/module.h @@ -238,7 +238,7 @@ struct TORCH_API Module : public Object { Module copy() const; - Module deepcopy(c10::optional device = c10::nullopt) const; + Module deepcopy(std::optional device = c10::nullopt) const; // Clones both the underlying `ClassType` and the module instance(data), this // function creates a new `ClassType` and returns a new instance that has the @@ -315,8 +315,8 @@ struct TORCH_API Module : public Object { } void to_impl( - const c10::optional& device, - const c10::optional& dtype, + const std::optional& device, + const std::optional& dtype, bool non_blocking); // Extra handle for the module to delete when itself is deleted @@ -333,7 +333,7 @@ struct TORCH_API Module : public Object { // details. TORCH_API Module freeze( const Module& module, - const c10::optional>& preserved_attrs = + const std::optional>& preserved_attrs = c10::nullopt, bool optimize_numerics = true); @@ -566,7 +566,7 @@ struct slot_list_impl { bool return_module_; // size of this list, cached on first request // when we need to filter the slot list - mutable c10::optional size_; + mutable std::optional size_; friend struct Module; }; diff --git a/torch/csrc/jit/api/object.cpp b/torch/csrc/jit/api/object.cpp index 0593916dbbaea..b707e76772765 100644 --- a/torch/csrc/jit/api/object.cpp +++ b/torch/csrc/jit/api/object.cpp @@ -14,7 +14,7 @@ Object::Object( c10::StrongTypePtr(std::move(cu), type), type->numAttributes())) {} -c10::optional Object::find_method(const std::string& basename) const { +std::optional Object::find_method(const std::string& basename) const { for (Function* fn : type()->methods()) { if (fn->name() == basename) { return Method(_ivalue(), fn); diff --git a/torch/csrc/jit/api/object.h b/torch/csrc/jit/api/object.h index 7ccacf385be53..164f6e2ac073a 100644 --- a/torch/csrc/jit/api/object.h +++ b/torch/csrc/jit/api/object.h @@ -46,7 +46,7 @@ struct TORCH_API Object { struct Property { std::string name; Method getter_func; - c10::optional setter_func; + std::optional setter_func; }; void setattr(const std::string& name, c10::IValue v) { @@ -129,7 +129,7 @@ struct TORCH_API Object { const Property get_property(const std::string& name) const { for (const auto& prop : type()->properties()) { if (prop.name == name) { - c10::optional setter = c10::nullopt; + std::optional setter = c10::nullopt; if (prop.setter) { setter = Method(_ivalue(), prop.setter); } @@ -142,7 +142,7 @@ struct TORCH_API Object { const std::vector get_properties() const { return c10::fmap(type()->properties(), 
[&](ClassType::Property prop) { - c10::optional setter = c10::nullopt; + std::optional setter = c10::nullopt; if (prop.setter) { setter = Method(_ivalue(), prop.setter); } @@ -153,7 +153,7 @@ struct TORCH_API Object { }); } - c10::optional find_method(const std::string& basename) const; + std::optional find_method(const std::string& basename) const; /// Run a method from this module. /// diff --git a/torch/csrc/jit/backends/backend_debug_info.h b/torch/csrc/jit/backends/backend_debug_info.h index 1d07beb6bdb3c..291eb48132e8e 100644 --- a/torch/csrc/jit/backends/backend_debug_info.h +++ b/torch/csrc/jit/backends/backend_debug_info.h @@ -27,7 +27,7 @@ class TORCH_API PyTorchBackendDebugInfo : public torch::CustomClassHolder { public: PyTorchBackendDebugInfo() = default; - c10::optional& getDebugInfoMap() { + std::optional& getDebugInfoMap() { return debug_info_map_; } @@ -36,7 +36,7 @@ class TORCH_API PyTorchBackendDebugInfo : public torch::CustomClassHolder { } private: - c10::optional debug_info_map_; + std::optional debug_info_map_; }; #else diff --git a/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp b/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp index 5bdcbe63797c4..a0b59a73f46f9 100644 --- a/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp +++ b/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp @@ -233,7 +233,7 @@ void XNNGraph::defineAllTensorValues() { size_t buffer_idx = 0; size_t num_bytes = 0; if (val->node()->kind() == prim::Constant) { - c10::optional constant = val->node()->t(attr::value); + std::optional constant = val->node()->t(attr::value); auto const_val = constant->toIValue().toTensor(); // Need tensor data to be contiguous for serialization auto cont_const_val = const_val.contiguous(); diff --git a/torch/csrc/jit/codegen/fuser/codegen.cpp b/torch/csrc/jit/codegen/fuser/codegen.cpp index 10ddf2267b21d..2f9217e133697 100644 --- a/torch/csrc/jit/codegen/fuser/codegen.cpp +++ b/torch/csrc/jit/codegen/fuser/codegen.cpp @@ -364,7 +364,7 @@ static void emitCheckFor( std::string generateKernel( const std::string& name, const Graph& graph, - const std::vector>>& + const std::vector>>& inputs, const std::vector>& outputs, const bool use_cuda) { diff --git a/torch/csrc/jit/codegen/fuser/codegen.h b/torch/csrc/jit/codegen/fuser/codegen.h index fc0b34e55fe7e..e42adc1314320 100644 --- a/torch/csrc/jit/codegen/fuser/codegen.h +++ b/torch/csrc/jit/codegen/fuser/codegen.h @@ -18,7 +18,7 @@ namespace fuser { TORCH_API std::string generateKernel( const std::string& name, const Graph& graph, - const std::vector>>& + const std::vector>>& inputs, const std::vector>& outputs, const bool use_cuda); diff --git a/torch/csrc/jit/codegen/fuser/compiler.cpp b/torch/csrc/jit/codegen/fuser/compiler.cpp index 52dc3a07fe765..3c05b70e8341a 100644 --- a/torch/csrc/jit/codegen/fuser/compiler.cpp +++ b/torch/csrc/jit/codegen/fuser/compiler.cpp @@ -225,7 +225,7 @@ std::shared_ptr compileKernel( // Creates chunk and flattened input descriptions std::vector chunk_desc; - std::vector>> + std::vector>> flat_inputs; { size_t input_index = 0; diff --git a/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp b/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp index c930f3293aa56..5f692d50e6b54 100644 --- a/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp +++ b/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp @@ -59,7 +59,7 @@ static bool programExists(const std::string& program) { } #ifdef _MSC_VER -c10::optional exec(const std::wstring& cmd) { +std::optional exec(const 
std::wstring& cmd) { std::array buffer; std::wstring result; std::unique_ptr pipe( @@ -82,7 +82,7 @@ inline std::wstring& rtrim(std::wstring& s, const wchar_t* t = L" \t\n\r\f\v") { void activate() { wchar_t* root = nullptr; std::wstring cmd; - c10::optional exec_out; + std::optional exec_out; std::wstring path; std::wstring vcruntime_plat; std::wstring envvars; diff --git a/torch/csrc/jit/codegen/fuser/executor.cpp b/torch/csrc/jit/codegen/fuser/executor.cpp index fad7cfcd630da..8abb99283ffc7 100644 --- a/torch/csrc/jit/codegen/fuser/executor.cpp +++ b/torch/csrc/jit/codegen/fuser/executor.cpp @@ -26,7 +26,7 @@ namespace fuser { // Returns the "map size" for this run, which is the common size for all // intermediate tensors. -static c10::optional> getMapSize( +static std::optional> getMapSize( const KernelSpec& spec, at::TensorList args, at::IntArrayRef arg_subset) { @@ -67,7 +67,7 @@ static c10::optional> getMapSize( } // Tries to determine a map size for the instantiated kernel (see above) -static c10::optional> canRunKernel( +static std::optional> canRunKernel( const KernelSpec& spec, at::TensorList args) { // Short-circuits on size mismatch @@ -78,7 +78,7 @@ static c10::optional> canRunKernel( " arguments, but got ", args.size()); - c10::optional> map_size; + std::optional> map_size; for (const auto& broadcast_group : spec.inputBroadcastGroups()) { if (!map_size) { map_size = getMapSize(spec, args, broadcast_group); diff --git a/torch/csrc/jit/codegen/fuser/kernel_spec.h b/torch/csrc/jit/codegen/fuser/kernel_spec.h index 57806ed436311..2fc52f2d76f0f 100644 --- a/torch/csrc/jit/codegen/fuser/kernel_spec.h +++ b/torch/csrc/jit/codegen/fuser/kernel_spec.h @@ -117,7 +117,7 @@ struct TORCH_API KernelSpec { } // Cache functions - c10::optional> findKernel( + std::optional> findKernel( const ArgSpec& arg_spec) const { std::lock_guard guard{mutex_}; const auto it = kernels_.find(arg_spec); diff --git a/torch/csrc/jit/codegen/onednn/defer_size_check.cpp b/torch/csrc/jit/codegen/onednn/defer_size_check.cpp index 1dbef6643dba8..4d0f12564bd9c 100644 --- a/torch/csrc/jit/codegen/onednn/defer_size_check.cpp +++ b/torch/csrc/jit/codegen/onednn/defer_size_check.cpp @@ -41,7 +41,7 @@ class SizeCheckMover { // tensorexpr_elementwise_set that's defined in // torch/csrc/jit/runtime/symbolic_shape_registry_util.cpp OperatorMap schemaMap = get_tensorexpr_elementwise_set(); - c10::optional mapping = + std::optional mapping = schemaMap.find(u.user->getOperator()); return mapping == "unary"; }); diff --git a/torch/csrc/jit/codegen/onednn/graph_fuser.h b/torch/csrc/jit/codegen/onednn/graph_fuser.h index ee83edc68fc41..ab37ad0211b7a 100644 --- a/torch/csrc/jit/codegen/onednn/graph_fuser.h +++ b/torch/csrc/jit/codegen/onednn/graph_fuser.h @@ -39,7 +39,7 @@ class GraphRewriter { std::pair scanNode( Node* consumer, graph_node_list::iterator workblock_begin); - c10::optional tryMerge(Node* consumer, Node* producer); + std::optional tryMerge(Node* consumer, Node* producer); }; // This pass creates the subgraphs for oneDNN Graph Fusion Nodes. 
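In the defer_size_check.cpp hunk above, the OperatorMap lookup yields an optional string that is compared directly against "unary". That idiom is unaffected by the rename: std::optional defines mixed comparisons with the underlying value type, and a disengaged optional never compares equal, so the lookup-miss case needs no separate check. A small sketch of just that comparison, with lookup() as a made-up stand-in for the OperatorMap query:

#include <cassert>
#include <optional>
#include <string>

// Hypothetical stand-in for an OperatorMap-style query: returns the mapped
// category when present, an empty optional otherwise.
std::optional<std::string> lookup(bool found) {
  return found ? std::optional<std::string>{"unary"} : std::nullopt;
}

int main() {
  // Engaged optional: compares its contained string against the literal.
  assert(lookup(true) == "unary");
  // Disengaged optional: never equal to any value, nothing is dereferenced.
  assert(!(lookup(false) == "unary"));
  return 0;
}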
diff --git a/torch/csrc/jit/codegen/onednn/graph_helper.cpp b/torch/csrc/jit/codegen/onednn/graph_helper.cpp index fdd69f85c5d52..f8e54c8743216 100644 --- a/torch/csrc/jit/codegen/onednn/graph_helper.cpp +++ b/torch/csrc/jit/codegen/onednn/graph_helper.cpp @@ -22,7 +22,7 @@ static void fixConvOptionalBias(Node* node) { } } -static c10::optional getDimensions(Value* v) { +static std::optional getDimensions(Value* v) { if (v->type()->isSubtypeOf(TensorType::get())) { return v->type()->cast()->sizes().size(); } else { diff --git a/torch/csrc/jit/codegen/onednn/graph_rewriter.cpp b/torch/csrc/jit/codegen/onednn/graph_rewriter.cpp index c91ff9b3917a4..dfbfe467e9765 100644 --- a/torch/csrc/jit/codegen/onednn/graph_rewriter.cpp +++ b/torch/csrc/jit/codegen/onednn/graph_rewriter.cpp @@ -127,7 +127,7 @@ std::pair GraphRewriter::scanNode( // Try to merge `producer` into `consumer`. If successful, this destroys // `producer` and returns the `consumer` group. -c10::optional GraphRewriter::tryMerge(Node* consumer, Node* producer) { +std::optional GraphRewriter::tryMerge(Node* consumer, Node* producer) { AT_ASSERT(llgaHelper_.isLlgaSubgraph(consumer)); bool canMerge = llgaHelper_.shouldMerge(producer, consumer) && aliasDb_.moveBeforeTopologicallyValid(producer, consumer); diff --git a/torch/csrc/jit/codegen/onednn/prepare_binary.cpp b/torch/csrc/jit/codegen/onednn/prepare_binary.cpp index 795fce27e0083..a4f6d268694e3 100644 --- a/torch/csrc/jit/codegen/onednn/prepare_binary.cpp +++ b/torch/csrc/jit/codegen/onednn/prepare_binary.cpp @@ -47,7 +47,7 @@ static void handleBinaryOpInputs(Node* node) { // 42 : Scalar --> tensor(42.0) : Float([]) auto t = g->insert(aten::as_tensor, {scalar}, {{"dtype", promotedDtype}}); // add dim & stride info to IR - c10::optional t_dim = 1; + std::optional t_dim = 1; auto target_type = TensorTypePtr( TensorType::create(promotedDtype, at::kCPU, t_dim, false)); target_type = target_type->withSizes({1}); @@ -67,7 +67,7 @@ static void handleBinaryOpInputs(Node* node) { // are the same dtype, as oneDNN Graph requires both inputs to have the // same dtype. We'll follow PyTorch's type-promotion rules here. auto second_input_typeptr = node->input(1)->type()->expect(); - c10::optional second_input_type = + std::optional second_input_type = second_input_typeptr->scalarType(); if (second_input_type != c10::nullopt) { // dtype of the second tensor might not be available in the IR diff --git a/torch/csrc/jit/cuda/cuda.h b/torch/csrc/jit/cuda/cuda.h index e8a0d04aa935e..80b2e2a82f788 100644 --- a/torch/csrc/jit/cuda/cuda.h +++ b/torch/csrc/jit/cuda/cuda.h @@ -15,7 +15,7 @@ class CUDAStream final : public CustomClassHolder { public: // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) CUDAStream( - c10::optional device = c10::nullopt, + std::optional device = c10::nullopt, int64_t priority = 0) { c10::DeviceIndex device_index = device.has_value() ? 
device->index() : c10::cuda::current_device(); @@ -155,7 +155,7 @@ void CUDAEvent::wait(c10::intrusive_ptr stream) { TORCH_LIBRARY(cuda, m) { auto stream_class = m.class_("Stream").def( - torch::init, int64_t>(), + torch::init, int64_t>(), "", {torch::arg("device") = c10::nullopt, torch::arg("priority") = 0}); auto event_class = m.class_("Event").def( diff --git a/torch/csrc/jit/frontend/concrete_module_type.cpp b/torch/csrc/jit/frontend/concrete_module_type.cpp index b18917d0dc01f..c15116ac3e244 100644 --- a/torch/csrc/jit/frontend/concrete_module_type.cpp +++ b/torch/csrc/jit/frontend/concrete_module_type.cpp @@ -149,14 +149,14 @@ TypePtr ConcreteModuleType::getJitType() const { return jitType_; } -c10::optional ConcreteModuleType::getPyClass() const { +std::optional ConcreteModuleType::getPyClass() const { if (!data_.pyClass_) { return c10::nullopt; } return data_.pyClass_; } -c10::optional> ConcreteModuleType::findOverloads( +std::optional> ConcreteModuleType::findOverloads( const std::string& name) const { const auto it = data_.overloads_.find(name); if (it != data_.overloads_.end()) { @@ -165,7 +165,7 @@ c10::optional> ConcreteModuleType::findOverloads( return c10::nullopt; } -c10::optional ConcreteModuleType::findFunctionAttribute( +std::optional ConcreteModuleType::findFunctionAttribute( const std::string& name) const { const auto it = data_.functionAttributes_.find(name); if (it != data_.functionAttributes_.end()) { @@ -174,7 +174,7 @@ c10::optional ConcreteModuleType::findFunctionAttribute( return c10::nullopt; } -c10::optional ConcreteModuleType::findBuiltinFunction( +std::optional ConcreteModuleType::findBuiltinFunction( const std::string& name) const { const auto it = data_.builtinFunctions_.find(name); if (it != data_.builtinFunctions_.end()) { @@ -183,7 +183,7 @@ c10::optional ConcreteModuleType::findBuiltinFunction( return c10::nullopt; } -c10::optional ConcreteModuleType::findFailedAttribute( +std::optional ConcreteModuleType::findFailedAttribute( const std::string& name) const { const auto it = data_.failedAttributes_.find(name); if (it != data_.failedAttributes_.end()) { diff --git a/torch/csrc/jit/frontend/concrete_module_type.h b/torch/csrc/jit/frontend/concrete_module_type.h index 22349936687ce..b3c3221253563 100644 --- a/torch/csrc/jit/frontend/concrete_module_type.h +++ b/torch/csrc/jit/frontend/concrete_module_type.h @@ -195,15 +195,15 @@ class VISIBILITY_HIDDEN ConcreteModuleType { static std::shared_ptr fromJitType(TypePtr type); TypePtr getJitType() const; - c10::optional getPyClass() const; + std::optional getPyClass() const; IterableModuleKind getIterableModuleKind() const; - c10::optional> findOverloads( + std::optional> findOverloads( const std::string& name) const; - c10::optional findFunctionAttribute(const std::string& name) const; - c10::optional findBuiltinFunction(const std::string& name) const; + std::optional findFunctionAttribute(const std::string& name) const; + std::optional findBuiltinFunction(const std::string& name) const; std::shared_ptr findSubmoduleConcreteType( const std::string& name) const; - c10::optional findFailedAttribute(const std::string& name) const; + std::optional findFailedAttribute(const std::string& name) const; bool isIgnoredAttribute(const std::string& name) const; // These getters are only here to return things as types that can be diff --git a/torch/csrc/jit/frontend/function_schema_parser.cpp b/torch/csrc/jit/frontend/function_schema_parser.cpp index 4b681055bd075..a651b35786cea 100644 --- 
a/torch/csrc/jit/frontend/function_schema_parser.cpp +++ b/torch/csrc/jit/frontend/function_schema_parser.cpp @@ -149,9 +149,9 @@ struct SchemaParser { auto fake_type = std::move(std::get<0>(p)); auto real_type = std::move(std::get<1>(p)); auto alias_info = std::move(std::get<2>(p)); - c10::optional N; - c10::optional default_value; - c10::optional alias_set; + std::optional N; + std::optional default_value; + std::optional alias_set; std::string name; if (L.nextIf('[')) { // note: an array with a size hint can only occur at the Argument level @@ -162,7 +162,7 @@ struct SchemaParser { auto container = type_parser.parseAliasAnnotation(); if (alias_info) { if (!container) { - container = c10::optional(at::AliasInfo()); + container = std::optional(at::AliasInfo()); container->setIsWrite(alias_info->isWrite()); } container->addContainedType(std::move(*alias_info)); @@ -297,7 +297,7 @@ struct SchemaParser { IValue parseDefaultValue( const c10::Type& arg_type, TypeKind kind, - c10::optional arg_N) { + std::optional arg_N) { auto range = L.cur().range; switch (kind) { case TypeKind::TensorType: diff --git a/torch/csrc/jit/frontend/ir_emitter.cpp b/torch/csrc/jit/frontend/ir_emitter.cpp index 989a6eaf2dfe0..0aca3ea800623 100644 --- a/torch/csrc/jit/frontend/ir_emitter.cpp +++ b/torch/csrc/jit/frontend/ir_emitter.cpp @@ -168,7 +168,7 @@ struct CondValue { CondValue( Value* value, RefinementSet refinements, - c10::optional static_if) + std::optional static_if) : value_(value), refinements_(std::move(refinements)), static_if_(static_if) {} @@ -186,14 +186,14 @@ struct CondValue { const RefinementSet& refinements() const { return refinements_; } - c10::optional staticIf() const { + std::optional staticIf() const { return static_if_; } private: Value* value_; RefinementSet refinements_; - c10::optional + std::optional static_if_; // certain expression cause us to emit a static if statement // this value is present if this is the case. // this is not equivalent to value_ being a constant @@ -283,7 +283,7 @@ struct Environment { } // see if type error has been set for a variable - c10::optional findVariableTypeError(const std::string& name) { + std::optional findVariableTypeError(const std::string& name) { auto runner = this; while (runner->next) { runner = runner->next.get(); @@ -1200,7 +1200,7 @@ struct to_ir { } if (const auto union_type = lhs_value->type()->cast()) { std::vector to_subtract{NoneType::get()}; - c10::optional remaining = + std::optional remaining = union_type->subtractTypeSet(to_subtract); std::vector all_present; if (remaining) { @@ -1228,7 +1228,7 @@ struct to_ir { CondValue v = emitCondExpr(Expr(expr.tree()->trees()[0])); Value* result = emitBuiltinCall( expr.range(), *graph, aten::__not__, {v.value()}, {}); - c10::optional static_if; + std::optional static_if; if (v.staticIf()) { static_if = !*v.staticIf(); } @@ -1294,7 +1294,7 @@ struct to_ir { } } auto expr_out = emitToBool(expr.range(), emitExpr(expr)); - c10::optional static_if = c10::nullopt; + std::optional static_if = c10::nullopt; auto kind = expr_out->node()->kind(); if (kind == aten::is_scripting) { static_if = true; @@ -1559,7 +1559,7 @@ struct to_ir { ? refined_type_hint->cast()->getElementType() : nullptr; - c10::optional unified_elem_type = unifyTypes( + std::optional unified_elem_type = unifyTypes( list_value->type()->expect()->getElementType(), out->type(), /*default_to_union=*/true, @@ -1740,7 +1740,7 @@ struct to_ir { ? 
refined_type_hint->expect()->getValueType() : nullptr; - c10::optional unified_value_type = unifyTypes( + std::optional unified_value_type = unifyTypes( first_generated_value_type, v->type(), /*default_to_union=*/true, @@ -1832,7 +1832,7 @@ struct to_ir { // and the second expr in the false branch, if it's an AND the opposite auto get_const_expr = [&] { return graph->insertConstant(is_or, loc); }; - c10::optional rhs; + std::optional rhs; auto get_continue_expr = [&] { rhs = emitCondExpr(second_expr); return rhs->value(); @@ -1842,8 +1842,8 @@ struct to_ir { // If this is an AND, eval second expression if first expr is True // NOLINTNEXTLINE(cppcoreguidelines-init-variables) Value* new_result; - c10::optional refinements; - c10::optional static_if; + std::optional refinements; + std::optional static_if; if (is_or) { new_result = emitIfExpr(loc, lhs, get_const_expr, get_continue_expr); refinements = lhs.refinements().Or(rhs->refinements()); @@ -2320,8 +2320,8 @@ struct to_ir { const SourceRange& range, const std::function& emit_body, const SugaredValuePtr& iter_val, - c10::optional> targets, - c10::optional cond) { + std::optional> targets, + std::optional cond) { Value* max_trip_count_val = nullptr; if (iter_val != nullptr) { max_trip_count_val = iter_val->len(range, method); @@ -2968,7 +2968,7 @@ struct to_ir { auto outputs = rhs_output->asTuple( rhs_loc, method, - starred_unpack ? c10::nullopt : c10::optional{n_binders}); + starred_unpack ? c10::nullopt : std::optional{n_binders}); if (outputs.size() < n_binders) { throw ErrorReport(tl) << "need " << (starred_unpack ? "at least " : "") << n_binders @@ -3655,7 +3655,7 @@ struct to_ir { auto iterable_value = expr_sv->iter(loc, method); // range should have the same static length as the other iterable - c10::optional iter_static_len = iterable_value->staticLen(); + std::optional iter_static_len = iterable_value->staticLen(); SugaredValuePtr range_sv = std::make_shared( loc, method, range_inputs, iter_static_len); @@ -4454,7 +4454,7 @@ struct to_ir { ? 
refined_type_hint->cast()->getElementType() : nullptr; - c10::optional unified_elem_type = unifyTypeList( + std::optional unified_elem_type = unifyTypeList( types, nowhere, /*default_to_union=*/true, elem_type_hint); if (!refined_type_hint && @@ -4885,7 +4885,7 @@ struct to_ir { return graph->insertConstant(dim, loc); }; std::vector dims(subscript_exprs.size()); - std::vector> exprs( + std::vector> exprs( subscript_exprs.size(), c10::nullopt); auto handle_indexing = [&](const Expr& subscript_expr, @@ -5352,7 +5352,7 @@ struct CompilationUnit::PropertyPair }; CompilationUnit::PropertyPair CompilationUnit::define_property( - const c10::optional& prefix, + const std::optional& prefix, const Property& prop, const ResolverPtr& resolver, const Self* self, @@ -5386,14 +5386,14 @@ CompilationUnit::PropertyPair CompilationUnit::define_property( } std::unique_ptr CompilationUnit::define( - const c10::optional& prefix, + const std::optional& prefix, const Def& def, const ResolverPtr& resolver, const Self* self, const std::unordered_map& function_table, bool shouldMangle, CompilationUnit::FunctionType type, - c10::optional operator_set_version) const { + std::optional operator_set_version) const { TORCH_INTERNAL_ASSERT(resolver); auto _resolver = resolver; if (!self) { @@ -5444,14 +5444,14 @@ std::unique_ptr CompilationUnit::define( } std::vector CompilationUnit::define( - const c10::optional& prefix, + const std::optional& prefix, const std::vector& properties, const std::vector& propResolvers, const std::vector& definitions, const std::vector& defResolvers, const Self* self, bool shouldMangle, - c10::optional operator_set_version) { + std::optional operator_set_version) { TORCH_INTERNAL_ASSERT(definitions.size() == defResolvers.size()); TORCH_INTERNAL_ASSERT(properties.size() == propResolvers.size()); std::vector functions; @@ -5515,7 +5515,7 @@ std::vector CompilationUnit::define( } void CompilationUnit::define_hooks( - const c10::optional& prefix, + const std::optional& prefix, const std::vector& hookDefs, const std::vector& hookResolvers, const std::vector& preHookDefs, @@ -5620,7 +5620,7 @@ void CompilationUnit::define_hooks( } std::vector CompilationUnit::define( - const c10::optional& prefix, + const std::optional& prefix, const std::string& source, const ResolverPtr& resolver, const Self* self) { diff --git a/torch/csrc/jit/frontend/parse_string_literal.h b/torch/csrc/jit/frontend/parse_string_literal.h index 2ca1f150aacdd..5b924864bebd8 100644 --- a/torch/csrc/jit/frontend/parse_string_literal.h +++ b/torch/csrc/jit/frontend/parse_string_literal.h @@ -12,7 +12,7 @@ inline bool isCharCount(char c, const std::string& str, size_t start, int len) { std::count(str.begin() + start, str.begin() + start + len, c) == len; } -inline c10::optional parseOctal(const std::string& str, size_t pos) { +inline std::optional parseOctal(const std::string& str, size_t pos) { //\xxx where x are 0-7 if (pos + 3 >= str.size()) return c10::nullopt; diff --git a/torch/csrc/jit/frontend/parser.cpp b/torch/csrc/jit/frontend/parser.cpp index 02e22547edd44..ae2c98028e071 100644 --- a/torch/csrc/jit/frontend/parser.cpp +++ b/torch/csrc/jit/frontend/parser.cpp @@ -210,7 +210,7 @@ struct ParserImpl { } return prefix; } - c10::optional maybeParseAssignmentOp() { + std::optional maybeParseAssignmentOp() { auto r = L.cur().range; switch (L.cur().kind) { case TK_PLUS_EQ: diff --git a/torch/csrc/jit/frontend/schema_matching.cpp b/torch/csrc/jit/frontend/schema_matching.cpp index 0b4fa8ef65b2e..87ec9992141d8 100644 --- 
a/torch/csrc/jit/frontend/schema_matching.cpp +++ b/torch/csrc/jit/frontend/schema_matching.cpp @@ -247,7 +247,7 @@ static Value* tryMatchArgument( return value; } -c10::optional findInputWithName( +std::optional findInputWithName( const std::string& name, at::ArrayRef kwargs, bool is_aten) { @@ -354,13 +354,13 @@ bool isBlockListedSchema(const FunctionSchema& schema) { return false; } -static c10::optional tryMatchSchema( +static std::optional tryMatchSchema( const FunctionSchema& schema, const SourceRange& loc, Graph& graph, at::ArrayRef args, at::ArrayRef kwargs, - c10::optional self, + std::optional self, std::ostream* failure_messages, bool allow_conversions) { if (isBlockListedSchema(schema)) { @@ -389,7 +389,7 @@ static c10::optional tryMatchSchema( size_t used_args = 0; for (const auto schema_i : c10::irange(schema.arguments().size())) { const auto& arg = schema.arguments()[schema_i]; - c10::optional actual_named_value; + std::optional actual_named_value; if (arg.name() == "self" && self) { actual_named_value = self; self = c10::nullopt; @@ -540,7 +540,7 @@ MatchedSchema matchSchema( Graph& graph, at::ArrayRef args, at::ArrayRef kwargs, - const c10::optional& self) { + const std::optional& self) { std::stringstream failure_messages; if (auto result = tryMatchSchema( schema, @@ -576,7 +576,7 @@ std::pair matchSchemas( Graph& graph, at::ArrayRef args, at::ArrayRef kwargs, - const c10::optional& self, + const std::optional& self, bool render_errors) { TORCH_INTERNAL_ASSERT(!schemas.empty()); // if there is only one schema, we do not need to try without conversions @@ -645,7 +645,7 @@ static Value* emitBuiltinNode( const SourceRange& loc, Graph& graph, Symbol name, - c10::optional version) { + std::optional version) { auto n = graph.insertNode(graph.create(name, matched_schema.inputs, 0)) ->setSourceRange(loc); @@ -681,7 +681,7 @@ Value* emitBuiltinCall( Symbol name, at::ArrayRef args, at::ArrayRef kwargs, - const c10::optional& self) { + const std::optional& self) { const auto& variants = getAllOperatorsFor(name); const auto& builtin_functions = getAllBuiltinFunctionsFor(name); diff --git a/torch/csrc/jit/frontend/schema_matching.h b/torch/csrc/jit/frontend/schema_matching.h index 754ede24597e5..0c69df521df6a 100644 --- a/torch/csrc/jit/frontend/schema_matching.h +++ b/torch/csrc/jit/frontend/schema_matching.h @@ -28,7 +28,7 @@ TORCH_API MatchedSchema matchSchema( Graph& graph, at::ArrayRef args, at::ArrayRef kwargs, - const c10::optional& self = c10::nullopt); + const std::optional& self = c10::nullopt); TORCH_API std::pair matchSchemas( const std::vector& schemas, @@ -36,7 +36,7 @@ TORCH_API std::pair matchSchemas( Graph& graph, at::ArrayRef args, at::ArrayRef kwargs, - const c10::optional& self = c10::nullopt, + const std::optional& self = c10::nullopt, bool render_errors = false); TORCH_API bool convertibleToList( @@ -51,9 +51,9 @@ TORCH_API Value* emitBuiltinCall( Symbol name, at::ArrayRef args, at::ArrayRef kwargs, - const c10::optional& self = c10::nullopt); + const std::optional& self = c10::nullopt); -TORCH_API c10::optional findInputWithName( +TORCH_API std::optional findInputWithName( const std::string& name, at::ArrayRef kwargs, bool is_aten = false); diff --git a/torch/csrc/jit/frontend/schema_type_parser.cpp b/torch/csrc/jit/frontend/schema_type_parser.cpp index 7c4b8ba0cac26..89465bca3f7a3 100644 --- a/torch/csrc/jit/frontend/schema_type_parser.cpp +++ b/torch/csrc/jit/frontend/schema_type_parser.cpp @@ -98,7 +98,7 @@ TypePtr SchemaTypeParser::parseBaseType() { // 
Tensor! // shorthand for Tensor(fresh_identifier!) // Tensor(a! -> a|b) // Tensor is in set a, written to, // and after the write is in set a AND b. -c10::optional SchemaTypeParser::parseAliasAnnotation() { +std::optional SchemaTypeParser::parseAliasAnnotation() { AliasInfo alias_info; if (L.nextIf('(')) { // optional 'alias set annotation' @@ -147,7 +147,7 @@ c10::optional SchemaTypeParser::parseAliasAnnotation() { return alias_info; } -c10::optional SchemaTypeParser::parseTensorDType( +std::optional SchemaTypeParser::parseTensorDType( const std::string& dtype) { #define DEFINE_SCALAR_TYPE(_1, n) {#n, at::ScalarType::n}, @@ -161,7 +161,7 @@ c10::optional SchemaTypeParser::parseTensorDType( return c10::nullopt; } -c10::optional SchemaTypeParser::tryToParseDeviceType() { +std::optional SchemaTypeParser::tryToParseDeviceType() { L.expect('='); const std::string& dev = L.expect(TK_IDENT).text(); @@ -195,7 +195,7 @@ c10::optional SchemaTypeParser::tryToParseDeviceType() { throw ErrorReport(L.cur()) << "cannot parse device type '" << dev << "'\n"; } -c10::optional SchemaTypeParser::tryToParseRequiresGrad() { +std::optional SchemaTypeParser::tryToParseRequiresGrad() { L.expect('='); const std::string& num = L.expect(TK_NUMBER).text(); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ -218,8 +218,8 @@ TypePtr SchemaTypeParser::parseRefinedTensor() { TypePtr ptr; L.expect('('); TypePtr tensor_type; - c10::optional device; - c10::optional requires_grad; + std::optional device; + std::optional requires_grad; // Parse a type with either no ranks, known ranks with sizes, ranks with // unknown sizes, a mix of ranks with known and unknown sizes, or ranks with // known sizes and strides. The type might also have requires_grad and/or @@ -227,7 +227,7 @@ TypePtr SchemaTypeParser::parseRefinedTensor() { // Long(10, 8, 6, strides=[48, 6, 1], requires_grad=0, device=cuda:1) // Float(10, *, 20, device=cuda:1) // Float(requires_grad=1) - std::vector> dims; + std::vector> dims; bool seen_strides = false; std::vector strides; parseList(TK_NOTHING, ',', ')', [&] { @@ -339,16 +339,16 @@ TypePtr SchemaTypeParser::parseRefinedTensor() { return ptr; } -std::pair> SchemaTypeParser::parseType() { +std::pair> SchemaTypeParser::parseType() { auto r = parseFakeAndRealType(); return std::make_pair(std::move(std::get<0>(r)), std::move(std::get<2>(r))); } -std::tuple> +std::tuple> SchemaTypeParser::parseFakeAndRealType() { TypePtr fake_value; TypePtr real_value; - c10::optional alias_info; + std::optional alias_info; // Tuple type if (L.cur().kind == '(') { std::vector types; @@ -465,7 +465,7 @@ SchemaTypeParser::parseFakeAndRealType() { auto container = parseAliasAnnotation(); if (alias_info) { if (!container) { - container = c10::optional(AliasInfo()); + container = std::optional(AliasInfo()); container->setIsWrite(alias_info->isWrite()); } container->addContainedType(std::move(*alias_info)); diff --git a/torch/csrc/jit/frontend/schema_type_parser.h b/torch/csrc/jit/frontend/schema_type_parser.h index c43e4363da386..e8c830cd5ae06 100644 --- a/torch/csrc/jit/frontend/schema_type_parser.h +++ b/torch/csrc/jit/frontend/schema_type_parser.h @@ -13,19 +13,19 @@ using TypePtr = c10::TypePtr; struct TORCH_API SchemaTypeParser { TypePtr parseBaseType(); - c10::optional parseAliasAnnotation(); - std::pair> parseType(); - std::tuple> + std::optional parseAliasAnnotation(); + std::pair> parseType(); + std::tuple> parseFakeAndRealType(); - c10::optional parseTensorDType(const std::string& dtype); + std::optional 
parseTensorDType(const std::string& dtype); TypePtr parseRefinedTensor(); SchemaTypeParser(Lexer& L, bool parse_complete_tensor_types) : complete_tensor_types(parse_complete_tensor_types), L(L) {} private: - c10::optional tryToParseRequiresGrad(); - c10::optional tryToParseDeviceType(); + std::optional tryToParseRequiresGrad(); + std::optional tryToParseDeviceType(); void parseList( int begin, int sep, diff --git a/torch/csrc/jit/frontend/script_type_parser.cpp b/torch/csrc/jit/frontend/script_type_parser.cpp index 245a7496d8f36..9295a3ed4007a 100644 --- a/torch/csrc/jit/frontend/script_type_parser.cpp +++ b/torch/csrc/jit/frontend/script_type_parser.cpp @@ -118,7 +118,7 @@ TypePtr ScriptTypeParser::subscriptToType( } } -c10::optional> ScriptTypeParser::parseBroadcastList( +std::optional> ScriptTypeParser::parseBroadcastList( const Expr& expr) const { // Alias torch.nn._common_types._size_?_t to BroadcastingList?[int] if (expr.kind() == TK_VAR) { @@ -191,7 +191,7 @@ c10::optional> ScriptTypeParser::parseBroadcastList( // gets the base type name given namespaces where the types live // turns torch.Tensor -> Tensor, X -> X -c10::optional ScriptTypeParser::parseBaseTypeName( +std::optional ScriptTypeParser::parseBaseTypeName( const Expr& expr) const { switch (expr.kind()) { case TK_VAR: { @@ -407,7 +407,7 @@ std::vector ScriptTypeParser::parseArgsFromDecl( auto decl_arg = *it; TypePtr type; - c10::optional N = c10::nullopt; + std::optional N = c10::nullopt; if (!decl_arg.type().present()) { // If this param doesn't have a type, default to "tensor" type = TensorType::getInferred(); @@ -421,7 +421,7 @@ std::vector ScriptTypeParser::parseArgsFromDecl( type = parseTypeFromExpr(decl_arg.type().get()); } } - c10::optional default_value = c10::nullopt; + std::optional default_value = c10::nullopt; if (decl_arg.defaultValue().present()) { default_value = *defaults_it++; } diff --git a/torch/csrc/jit/frontend/script_type_parser.h b/torch/csrc/jit/frontend/script_type_parser.h index 3a05af9c598ab..66c963b7d6d3d 100644 --- a/torch/csrc/jit/frontend/script_type_parser.h +++ b/torch/csrc/jit/frontend/script_type_parser.h @@ -21,7 +21,7 @@ class TORCH_API ScriptTypeParser { c10::TypePtr parseTypeFromExpr(const Expr& expr) const; - c10::optional> parseBroadcastList( + std::optional> parseBroadcastList( const Expr& expr) const; c10::TypePtr parseType(const std::string& str); @@ -33,7 +33,7 @@ class TORCH_API ScriptTypeParser { private: c10::TypePtr parseTypeFromExprImpl(const Expr& expr) const; - c10::optional parseBaseTypeName(const Expr& expr) const; + std::optional parseBaseTypeName(const Expr& expr) const; at::TypePtr subscriptToType( const std::string& typeName, const Subscript& subscript) const; diff --git a/torch/csrc/jit/frontend/source_range.cpp b/torch/csrc/jit/frontend/source_range.cpp index 03c366878af99..20ffbfd4601e3 100644 --- a/torch/csrc/jit/frontend/source_range.cpp +++ b/torch/csrc/jit/frontend/source_range.cpp @@ -151,7 +151,7 @@ size_t SourceRangeHasher::operator()(const torch::jit::SourceRange& key) const { std::hash()(key.start()) ^ std::hash()(key.end())); } -c10::optional Source::findSourceRangeThatGenerated( +std::optional Source::findSourceRangeThatGenerated( const SourceRange& range) { if (!gen_ranges_) { return c10::nullopt; diff --git a/torch/csrc/jit/frontend/source_range.h b/torch/csrc/jit/frontend/source_range.h index 72710a94ed210..1f8715ad00969 100644 --- a/torch/csrc/jit/frontend/source_range.h +++ b/torch/csrc/jit/frontend/source_range.h @@ -190,7 +190,7 @@ struct 
TORCH_API Source { explicit Source( c10::string_view text_view, - c10::optional filename = c10::nullopt, + std::optional filename = c10::nullopt, size_t starting_line_no = 0, std::shared_ptr gen_ranges = nullptr, CopiesString copies_str = COPIES_STRING) @@ -210,7 +210,7 @@ struct TORCH_API Source { explicit Source( StringCordView str, - c10::optional filename = c10::nullopt, + std::optional filename = c10::nullopt, size_t starting_line_no = 0, std::shared_ptr gen_ranges = nullptr) : text_view_(std::move(str)), @@ -266,7 +266,7 @@ struct TORCH_API Source { return text_view_.size(); } - c10::optional& filename() { + std::optional& filename() { return filename_; } @@ -274,7 +274,7 @@ struct TORCH_API Source { return starting_line_no_; } - c10::optional findSourceRangeThatGenerated( + std::optional findSourceRangeThatGenerated( const SourceRange& range); ~Source() = default; @@ -291,7 +291,7 @@ struct TORCH_API Source { StringCordView text_view_; - c10::optional filename_; + std::optional filename_; // If filename_ is not present, starting_line_no_ is don't care size_t starting_line_no_; // Starting offsets for lines into the source. e.g. line 0 starts at @@ -358,14 +358,14 @@ struct TORCH_API SourceRange { return ss.str(); } - c10::optional> file_line_col() const { + std::optional> file_line_col() const { if (!source_view_ || !source()->filename()) { return c10::nullopt; } auto lineno = source_view_->lineno_for_offset(start_); auto col_offset = (int)start_ - (int)source_view_->offset_for_line(lineno); - // TODO: c10::optional<>::value returns an rvalue ref so can't use it here?? + // TODO: std::optional<>::value returns an rvalue ref so can't use it here?? return std::make_tuple( source_view_->filename().value_or(""), source_view_->lineno_to_source_lineno(lineno), @@ -381,7 +381,7 @@ struct TORCH_API SourceRange { return !(*this == rhs); } - c10::optional findSourceRangeThatGenerated() const { + std::optional findSourceRangeThatGenerated() const { if (!source_view_) { return c10::nullopt; } diff --git a/torch/csrc/jit/frontend/sugared_value.cpp b/torch/csrc/jit/frontend/sugared_value.cpp index 80b5d27fba079..4b65903529d23 100644 --- a/torch/csrc/jit/frontend/sugared_value.cpp +++ b/torch/csrc/jit/frontend/sugared_value.cpp @@ -283,7 +283,7 @@ std::shared_ptr SimpleValue::attr( std::vector> SimpleValue::asTuple( const SourceRange& loc, GraphFunction& m, - const c10::optional& size_hint) { + const std::optional& size_hint) { static const auto make_simple_value = [](Value* v) -> std::shared_ptr { return std::make_shared(v); @@ -525,7 +525,7 @@ RangeValue::RangeValue( const SourceRange& loc, GraphFunction& m, std::vector inputs, - c10::optional static_len) { + std::optional static_len) { for (const auto i : c10::irange(inputs.size())) { auto typ = inputs[i]->type(); if (!typ->cast()) { @@ -645,7 +645,7 @@ void IterableTree::addChild( const SourceRange& range, GraphFunction& m, const SugaredValuePtr& iter_value) { - c10::optional child_len = iter_value->staticLen(); + std::optional child_len = iter_value->staticLen(); if (children_.empty()) { unroll_length_ = child_len; } else { @@ -748,7 +748,7 @@ std::shared_ptr NamedTupleConstructor::call( std::shared_ptr BuiltinFunction::tryCreate( Symbol symbol, - c10::optional self) { + std::optional self) { for (const std::shared_ptr& op : getAllOperatorsFor(symbol)) { if (!self) { return std::make_shared(symbol, nullptr); diff --git a/torch/csrc/jit/frontend/sugared_value.h b/torch/csrc/jit/frontend/sugared_value.h index 9bf09f4a56e17..97b092cad3ce7 100644 
--- a/torch/csrc/jit/frontend/sugared_value.h +++ b/torch/csrc/jit/frontend/sugared_value.h @@ -67,7 +67,7 @@ struct TORCH_API SugaredValue virtual std::vector> asTuple( const SourceRange& loc, GraphFunction& m, - const c10::optional& size_hint = {}) { + const std::optional& size_hint = {}) { throw ErrorReport(loc) << kind() << " cannot be used as a tuple"; } @@ -121,7 +121,7 @@ struct TORCH_API SugaredValue // function, then we emit an unrolled loop over the variable. This allows us // to support containers of Heterogenous types, like Module Containers & // Tuples - virtual c10::optional staticLen() { + virtual std::optional staticLen() { return c10::nullopt; } @@ -169,7 +169,7 @@ struct TORCH_API SimpleValue : public SugaredValue { std::vector> asTuple( const SourceRange& loc, GraphFunction& m, - const c10::optional& size_hint = {}) override; + const std::optional& size_hint = {}) override; std::shared_ptr attr( const SourceRange& loc, GraphFunction& m, @@ -213,14 +213,14 @@ struct TORCH_API SimpleValue : public SugaredValue { }; struct TORCH_API BuiltinFunction : public SugaredValue { - BuiltinFunction(Symbol symbol, c10::optional self) + BuiltinFunction(Symbol symbol, std::optional self) : symbol(symbol), self(std::move(self)) {} // The symbol of the function (e.g. `aten::relu`). Symbol symbol; // if this is method, then this is the self argument. - c10::optional self; + std::optional self; std::string kind() const override { return "builtin"; } @@ -236,7 +236,7 @@ struct TORCH_API BuiltinFunction : public SugaredValue { // not clear if it is a valid builtin static std::shared_ptr tryCreate( Symbol symbol, - c10::optional self); + std::optional self); }; struct TORCH_API SugaredTupleValue : public SugaredValue { @@ -246,7 +246,7 @@ struct TORCH_API SugaredTupleValue : public SugaredValue { std::vector> asTuple( const SourceRange& loc, GraphFunction& m, - const c10::optional& size_hint = {}) override { + const std::optional& size_hint = {}) override { return tup_; }; @@ -297,7 +297,7 @@ struct TORCH_API SugaredTupleValue : public SugaredValue { // Because this is used to contain SugaredValues of Heterogenous types, // we define staticLen() so that when this is iterated over it is emitted // as an unrolled loop. - c10::optional staticLen() override { + std::optional staticLen() override { return static_cast(tup_.size()); } @@ -305,7 +305,7 @@ struct TORCH_API SugaredTupleValue : public SugaredValue { }; struct TORCH_API BuiltinModule : public SugaredValue { - BuiltinModule(std::string name, c10::optional version = at::nullopt) + BuiltinModule(std::string name, std::optional version = at::nullopt) : name(std::move(name)), version(version) {} std::string kind() const override { @@ -330,7 +330,7 @@ struct TORCH_API BuiltinModule : public SugaredValue { std::string name; // when we add operator versioning, emit this op as it exising at 'version' // if not set, use the latest version - c10::optional version; + std::optional version; }; // Represents a class, analagous to `int` or `dict`. 
Instances of classes, @@ -638,7 +638,7 @@ struct TORCH_API RangeValue : SugaredValue { const SourceRange& loc, GraphFunction& m, std::vector input, - c10::optional static_len = c10::nullopt); + std::optional static_len = c10::nullopt); std::string kind() const override { return "range"; @@ -654,7 +654,7 @@ struct TORCH_API RangeValue : SugaredValue { // When Range is instantiated via enumerate(iterable_with_static_len), // then it takes the static length of the iterable - c10::optional staticLen() override { + std::optional staticLen() override { return static_len_; } @@ -667,7 +667,7 @@ struct TORCH_API RangeValue : SugaredValue { // derivation nodes to simplify the graph and enable more possible // optimizations bool has_only_end_{}; - c10::optional static_len_; + std::optional static_len_; }; // Specialized Tree structure to matched against for special handling @@ -712,7 +712,7 @@ struct TORCH_API IterableTree : SugaredValue { // If this iterable contains a ModuleList or Tuple, then it will have a // static length, and we will emit it as an unrolled for loop. - c10::optional staticLen() override { + std::optional staticLen() override { return unroll_length_; } @@ -730,7 +730,7 @@ struct TORCH_API IterableTree : SugaredValue { TypePtr type_hint = nullptr) override; private: - c10::optional unroll_length_ = c10::nullopt; + std::optional unroll_length_ = c10::nullopt; std::vector children_; }; diff --git a/torch/csrc/jit/frontend/tracer.cpp b/torch/csrc/jit/frontend/tracer.cpp index 823b27f30fcb1..9616e0f83dfbe 100644 --- a/torch/csrc/jit/frontend/tracer.cpp +++ b/torch/csrc/jit/frontend/tracer.cpp @@ -44,7 +44,7 @@ template void genericAddOptionalInput( Node* n, const char* name, - const c10::optional& value) { + const std::optional& value) { if (value) { jit::tracer::addInputs(n, name, *value); } else { @@ -110,7 +110,7 @@ void TracingState::delValue(const IValue& var) { Value* getValueTrace(const IValue& var) { return getTracingState()->getValue(var); } -static Value* getOptTensorValueTrace(const c10::optional& var) { +static Value* getOptTensorValueTrace(const std::optional& var) { return getValueTrace(IValue(var)); } Value* TracingState::getValue(const IValue& var) { @@ -617,7 +617,7 @@ void addInputs(Node* n, const char* name, c10::SymInt value) { addInputs(n, name, value.guard_int(__FILE__, __LINE__)); } -void addInputs(Node* n, const char* name, c10::optional value) { +void addInputs(Node* n, const char* name, std::optional value) { using ArgumentStash = jit::tracer::ArgumentStash; if (ArgumentStash::hasValue(name)) { Value* v = ArgumentStash::popValue(name); @@ -633,13 +633,13 @@ void addInputs(Node* n, const char* name, c10::optional value) { void addInputs(Node* n, const char* name, bool value) { detail::genericAddInput(n, value); } -void addInputs(Node* n, const char* name, const c10::optional& value) { +void addInputs(Node* n, const char* name, const std::optional& value) { detail::genericAddOptionalInput(n, name, value); } void addInputs(Node* n, const char* name, double value) { detail::genericAddInput(n, value); } -void addInputs(Node* n, const char* name, const c10::optional& value) { +void addInputs(Node* n, const char* name, const std::optional& value) { detail::genericAddOptionalInput(n, name, value); } void addInputs(Node* n, const char* name, const at::Scalar& value) { @@ -654,7 +654,7 @@ void addInputs(Node* n, const char* name, const at::Scalar& value) { void addInputs( Node* n, const char* name, - const c10::optional& value) { + const std::optional& value) { 
detail::genericAddOptionalInput(n, name, value); } void addInputs(Node* n, const char* name, const c10::string_view value) { @@ -663,7 +663,7 @@ void addInputs(Node* n, const char* name, const c10::string_view value) { void addInputs( Node* n, const char* name, - const c10::optional& value) { + const std::optional& value) { detail::genericAddOptionalInput(n, name, value); } void addInputs(Node* n, const char* name, const at::Tensor& value) { @@ -672,13 +672,13 @@ void addInputs(Node* n, const char* name, const at::Tensor& value) { void addInputs( Node* n, const char* name, - const c10::optional& value) { + const std::optional& value) { detail::genericAddOptionalInput(n, name, value); } void addInputs( Node* n, const char* name, - const c10::optional& value) { + const std::optional& value) { Graph* g = n->owningGraph(); if (value.has_value() && value->defined()) { @@ -706,31 +706,31 @@ void addInputs(Node* n, const char* name, at::MemoryFormat value) { void addInputs( Node* n, const char* name, - const c10::optional& value) { + const std::optional& value) { detail::genericAddOptionalInput(n, name, value); } void addInputs( Node* n, const char* name, - const c10::optional& value) { + const std::optional& value) { detail::genericAddOptionalInput(n, name, value); } void addInputs( Node* n, const char* name, - const c10::optional& value) { + const std::optional& value) { detail::genericAddOptionalInput(n, name, value); } void addInputs( Node* n, const char* name, - c10::optional value) { + std::optional value) { TORCH_CHECK(false, "NYI: Named tensors are not supported with the tracer"); } void addInputs( Node* n, const char* name, - const c10::optional& value) { + const std::optional& value) { detail::genericAddOptionalInput(n, name, value); } void addInputs( @@ -767,7 +767,7 @@ void addInputs( TORCH_API void addInputs( Node* n, const char* name, - const List>& value) { + const List>& value) { Graph* g = n->owningGraph(); Node* list_node = nullptr; list_node = g->insertNode(g->createList( @@ -813,7 +813,7 @@ void addInputs(Node* n, const char* name, c10::SymIntArrayRef value) { addInputs(n, name, C10_AS_INTARRAYREF_SLOW(value)); } -void addInputs(Node* n, const char* name, c10::optional value) { +void addInputs(Node* n, const char* name, std::optional value) { addInputs( n, name, @@ -825,7 +825,7 @@ void addInputs(Node* n, const char* name, c10::optional value) { void addInputs( Node* n, const char* name, - const c10::optional& opt_value) { + const std::optional& opt_value) { detail::genericAddOptionalInput(n, name, opt_value); } @@ -869,7 +869,7 @@ void addInputs(Node* n, const char* name, ArrayRef value) { void addInputs( Node* n, const char* name, - const c10::optional>& opt_value) { + const std::optional>& opt_value) { detail::genericAddOptionalInput(n, name, opt_value); } @@ -995,7 +995,7 @@ void ensureUniqueIfOutOfPlaced(const char* name, const at::Tensor& tensor) { } void ensureUniqueIfOutOfPlaced( const char* name, - const c10::optional& tensor) { + const std::optional& tensor) { ensureUniqueIfOutOfPlaced(name, tensor.has_value() ? 
*tensor : at::Tensor()); } diff --git a/torch/csrc/jit/frontend/tracer.h b/torch/csrc/jit/frontend/tracer.h index f265d57b649dd..a1cc856a22e19 100644 --- a/torch/csrc/jit/frontend/tracer.h +++ b/torch/csrc/jit/frontend/tracer.h @@ -236,37 +236,37 @@ TORCH_API void addInputs(Node* n, const char* name, c10::SymInt value); TORCH_API void addInputs( Node* n, const char* name, - c10::optional value); + std::optional value); TORCH_API void addInputs(Node* n, const char* name, bool value); TORCH_API void addInputs( Node* n, const char* name, - const c10::optional& value); + const std::optional& value); TORCH_API void addInputs(Node* n, const char* name, double value); TORCH_API void addInputs( Node* n, const char* name, - const c10::optional& value); + const std::optional& value); TORCH_API void addInputs(Node* n, const char* name, const at::Scalar& value); TORCH_API void addInputs( Node* n, const char* name, - const c10::optional& value); + const std::optional& value); TORCH_API void addInputs(Node* n, const char* name, const at::Tensor& value); TORCH_API void addInputs( Node* n, const char* name, - const c10::optional& value); + const std::optional& value); TORCH_API void addInputs(Node* n, const char* name, ArrayRef value); TORCH_API void addInputs(Node* n, const char* name, c10::SymIntArrayRef value); TORCH_API void addInputs( Node* n, const char* name, - c10::optional value); + std::optional value); TORCH_API void addInputs( Node* n, const char* name, - const c10::optional>& value); + const std::optional>& value); TORCH_API void addInputs( Node* n, const char* name, @@ -293,7 +293,7 @@ TORCH_API void addInputs( TORCH_API void addInputs( Node* n, const char* name, - const List>& value); + const List>& value); TORCH_API void addInputs( Node* n, const char* name, @@ -303,7 +303,7 @@ TORCH_API void addInputs(Node* n, const char* name, ArrayRef value); TORCH_API void addInputs( Node* n, const char* name, - const c10::optional>& value); + const std::optional>& value); TORCH_API void addInputs( Node* n, const char* name, @@ -311,7 +311,7 @@ TORCH_API void addInputs( TORCH_API void addInputs( Node* n, const char* name, - const c10::optional& value); + const std::optional& value); TORCH_API void addInputs(Node* n, const char* name, at::Device value); TORCH_API void addInputs(Node* n, const char* name, c10::Stream stream); TORCH_API void addInputs(Node* n, const char* name, at::Layout value); @@ -319,28 +319,28 @@ TORCH_API void addInputs(Node* n, const char* name, at::ScalarType value); TORCH_API void addInputs( Node* n, const char* name, - const c10::optional& value); + const std::optional& value); TORCH_API void addInputs( Node* n, const char* name, - const c10::optional& value); + const std::optional& value); TORCH_API void addInputs( Node* n, const char* name, - const c10::optional& value); + const std::optional& value); TORCH_API void addInputs(Node* n, const char* name, at::MemoryFormat value); TORCH_API void addInputs( Node* n, const char* name, - c10::optional value); + std::optional value); TORCH_API void addInputs( Node* n, const char* name, - const c10::optional& value); + const std::optional& value); TORCH_API void addInputs( Node* n, const char* name, - const c10::optional& value); + const std::optional& value); inline void addInputs( Node* n, @@ -377,7 +377,7 @@ TORCH_API void ensureUniqueIfOutOfPlaced( const at::Tensor& tensor); TORCH_API void ensureUniqueIfOutOfPlaced( const char* name, - const c10::optional& tensor); + const std::optional& tensor); template < typename T, diff --git 
a/torch/csrc/jit/ir/alias_analysis.cpp b/torch/csrc/jit/ir/alias_analysis.cpp index 29953ecd19a3e..f9b2ed5dd7ce9 100644 --- a/torch/csrc/jit/ir/alias_analysis.cpp +++ b/torch/csrc/jit/ir/alias_analysis.cpp @@ -54,7 +54,7 @@ class MutableTypePtrHelper { // of dimension 4 would map to the same type as a Tensor of // dimension 1. This allows us to treat all subclasses of Tensor // as a single, homogenous "Tensor" type. - c10::optional mapTypeToAliasTypeSet(const TypePtr& type) { + std::optional mapTypeToAliasTypeSet(const TypePtr& type) { if (mutable_type_cache_) { const AliasTypeSet* result = mapTypeToBorrowedAliasTypeSet(type); if (result) { @@ -82,7 +82,7 @@ class MutableTypePtrHelper { } private: - c10::optional mapTypeToAliasTypeSetImpl(const TypePtr& type) { + std::optional mapTypeToAliasTypeSetImpl(const TypePtr& type) { switch (type->kind()) { case TypeKind::ListType: case TypeKind::DictType: @@ -1097,7 +1097,7 @@ void AliasDb::analyzeRpcAsync(Node* node) { } namespace { -c10::optional getConstantBooleanInput( +std::optional getConstantBooleanInput( Node* node, const std::string& inputName) { TORCH_INTERNAL_ASSERT( @@ -1893,7 +1893,7 @@ bool AliasDb::mayAliasWildcard(const at::ArrayRef vs) const { vs.begin(), vs.end(), [&](Value* v) { return mayAliasWildcard(v); }); } -c10::optional AliasDb::tryGetOrCreateWildcard(const TypePtr& type) { +std::optional AliasDb::tryGetOrCreateWildcard(const TypePtr& type) { auto maybe_mut_types = mapTypeToAliasTypeSetPtr(type); if (!maybe_mut_types) { return c10::nullopt; @@ -1966,8 +1966,8 @@ Element* AliasDb::getWildcard(const TypePtr& type) const { } // Register `v` as a wildcard value. -c10::optional AliasDb::setWildcard(const Value* v) { - c10::optional maybe_wildcardElement = +std::optional AliasDb::setWildcard(const Value* v) { + std::optional maybe_wildcardElement = tryGetOrCreateWildcard(v->type()); if (!maybe_wildcardElement) { return c10::nullopt; diff --git a/torch/csrc/jit/ir/alias_analysis.h b/torch/csrc/jit/ir/alias_analysis.h index 380943635ea35..c06a4a88080b4 100644 --- a/torch/csrc/jit/ir/alias_analysis.h +++ b/torch/csrc/jit/ir/alias_analysis.h @@ -203,7 +203,7 @@ class AliasDb { * Wildcard methods */ // Register `v` as a wildcard value. - c10::optional setWildcard(const Value* v); + std::optional setWildcard(const Value* v); // Is this a value which will not alias? bool nonAliasingValue(const Value* elem) const; @@ -274,7 +274,7 @@ class AliasDb { // All wildcard Elements (one for each unique mutable type) ska::flat_hash_map wildcardIndex_; Element* getWildcard(const TypePtr& type) const; - c10::optional tryGetOrCreateWildcard(const TypePtr& type); + std::optional tryGetOrCreateWildcard(const TypePtr& type); void addContainedTypesToFreshElement( Element* container_elem, const AliasTypeSet& mut_types); @@ -301,9 +301,9 @@ class AliasDb { // Map of nodes to the memory locations that they write to using TWriteIndex = ska::flat_hash_map; - c10::optional writeIndex_; + std::optional writeIndex_; // Collection of all memory locations that are written to. 
- c10::optional writtenToLocationsIndex_; + std::optional writtenToLocationsIndex_; void buildWrittenToLocationsIndex(); std::unordered_set wildcards_; diff --git a/torch/csrc/jit/ir/constants.cpp b/torch/csrc/jit/ir/constants.cpp index 905088a20d1e2..ef697a5af7680 100644 --- a/torch/csrc/jit/ir/constants.cpp +++ b/torch/csrc/jit/ir/constants.cpp @@ -48,8 +48,8 @@ static bool insertableIValue(const IValue& ivalue) { Value* insertConstant( Graph& g, const IValue& val, - c10::optional loc, - c10::optional scope) { + std::optional loc, + std::optional scope) { auto value = tryInsertConstant(g, val, std::move(loc), std::move(scope)); if (value) { return *value; @@ -59,11 +59,11 @@ Value* insertConstant( } // IValue -> Constant node -c10::optional tryInsertConstant( +std::optional tryInsertConstant( Graph& g, const IValue& val, - c10::optional loc, - c10::optional scope) { + std::optional loc, + std::optional scope) { Node* n = g.create(prim::Constant); if (val.isTensor()) { at::Tensor ref = val.toTensor(); @@ -153,7 +153,7 @@ c10::optional tryInsertConstant( return g.insertNode(n)->output(); } -c10::optional toIValue(const Value* v) { +std::optional toIValue(const Value* v) { if (v->node()->kind() != prim::Constant || v->type()->cast()) { return c10::nullopt; } diff --git a/torch/csrc/jit/ir/constants.h b/torch/csrc/jit/ir/constants.h index d9d11075dd204..118da1e932d9c 100644 --- a/torch/csrc/jit/ir/constants.h +++ b/torch/csrc/jit/ir/constants.h @@ -25,8 +25,8 @@ struct TORCH_API constant_not_supported_error : public std::runtime_error { TORCH_API Value* insertConstant( Graph& g, const IValue& val, - c10::optional loc = c10::nullopt, - c10::optional scope = c10::nullopt); + std::optional loc = c10::nullopt, + std::optional scope = c10::nullopt); // note: prefer g.insertConsant(val, loc) which does exactly the same thing // this function is only declared/defined here because its implementation is @@ -34,11 +34,11 @@ TORCH_API Value* insertConstant( // constants.cpp. // // returns a c10::nullopt if the IValue kind cannot be inserted as a constant -TORCH_API c10::optional tryInsertConstant( +TORCH_API std::optional tryInsertConstant( Graph& g, const IValue& val, - c10::optional loc = c10::nullopt, - c10::optional scope = c10::nullopt); + std::optional loc = c10::nullopt, + std::optional scope = c10::nullopt); //////////////////////////////////////////////////////////////////////////////// // Helper for retrieving constants @@ -46,12 +46,12 @@ TORCH_API c10::optional tryInsertConstant( // attempt to convert a (possibly constant) Value* into an interpreter value // (IValue). 
returns c10::nullopt if the Value* was not constant -TORCH_API c10::optional toIValue(const Value* v); +TORCH_API std::optional toIValue(const Value* v); // if a value is a constant then try to turn into type T using the // same rules as the interpreter template -c10::optional constant_as(const Value* v) { +std::optional constant_as(const Value* v) { if (auto ivalue = toIValue(v)) { return ivalue->to(); } diff --git a/torch/csrc/jit/ir/ir.cpp b/torch/csrc/jit/ir/ir.cpp index a320570de5ca9..e288f78875c62 100644 --- a/torch/csrc/jit/ir/ir.cpp +++ b/torch/csrc/jit/ir/ir.cpp @@ -418,7 +418,7 @@ std::ostream& operator<<(std::ostream& out, const Graph& g) { static void checkSameDevice(const Node* node) { bool has_device = false; - c10::optional device = c10::nullopt; + std::optional device = c10::nullopt; auto checkValue = [&](const Value* v) { if (TensorTypePtr type = v->type()->cast()) { if (type->device() && !has_device) { @@ -984,7 +984,7 @@ static size_t findArgument(const FunctionSchema& the_schema, Symbol name) { return findArgument(the_schema, unqualName); } -c10::optional Node::get(Symbol name) const { +std::optional Node::get(Symbol name) const { return toIValue(namedInput(name)); } @@ -1686,7 +1686,7 @@ Value* Graph::insert( Symbol opname, at::ArrayRef args, at::ArrayRef kwargs, - const c10::optional& range) { + const std::optional& range) { return emitBuiltinCall( range.value_or(fakeRange()), *this, opname, args, kwargs); } @@ -1993,8 +1993,8 @@ Node* Graph::createClone( Value* Graph::insertConstant( const IValue& val, - c10::optional loc, - c10::optional scope) { + std::optional loc, + std::optional scope) { return jit::insertConstant(*this, val, std::move(loc), std::move(scope)); } @@ -2051,14 +2051,14 @@ void inlineCallStackOfNode( std::unordered_map& new_cs_entries, Function* callee, Node* to_replace, - c10::optional m_info); + std::optional m_info); static void inlineCallStackOfBlock( Block* b, std::unordered_map& new_cs_entries, Function* callee, Node* to_replace, - c10::optional m_info) { + std::optional m_info) { for (auto n : b->nodes()) { inlineCallStackOfNode(n, new_cs_entries, callee, to_replace, m_info); } @@ -2069,7 +2069,7 @@ void inlineCallStackOfNode( std::unordered_map& new_cs_entries, Function* callee, Node* to_replace, - c10::optional m_info) { + std::optional m_info) { auto new_node_cs = new_node->callstack(); InlinedCallStack* raw_callstack_ptr = @@ -2108,7 +2108,7 @@ std::vector inlineCallTo( std::unordered_map new_callstack_entries; - c10::optional module_instance_info = c10::nullopt; + std::optional module_instance_info = c10::nullopt; if (to_replace->kind() == prim::CallMethod) { auto class_type_ptr = to_replace->input(0)->type()->cast(); if (to_replace->input(0)->node()->kind() == prim::GetAttr) { diff --git a/torch/csrc/jit/ir/ir.h b/torch/csrc/jit/ir/ir.h index 4781b15229cbb..549f4a11001f5 100644 --- a/torch/csrc/jit/ir/ir.h +++ b/torch/csrc/jit/ir/ir.h @@ -332,9 +332,9 @@ struct TORCH_API Node { std::vector blocks_; Graph* graph_; Block* owning_block_; - c10::optional source_range_; + std::optional source_range_; ScopePtr scope_; - c10::optional callstack_; + std::optional callstack_; // Assumes FunctionSchemas are persistent, so we don't manage their lifetime. 
// This field is effective a cache that's populated on attribute lookups and // invalidated every time we perform an operation that could potentially @@ -348,7 +348,7 @@ struct TORCH_API Node { // is changed, we need to rely on this name // to retrieve old schemas to successfully apply upgraders // for this operator. - c10::optional historic_schema_name_ = c10::nullopt; + std::optional historic_schema_name_ = c10::nullopt; protected: Node(Graph* graph_, NodeKind kind_); // defined after graph @@ -373,7 +373,7 @@ struct TORCH_API Node { return wrap_; } - const c10::optional getHistoricSchemaName() { + const std::optional getHistoricSchemaName() { return historic_schema_name_; } @@ -442,7 +442,7 @@ struct TORCH_API Node { return this; } - c10::optional callstack() const { + std::optional callstack() const { return callstack_; } void setCallStack(InlinedCallStackPtr cs) { @@ -527,10 +527,10 @@ struct TORCH_API Node { Value* namedInput(const std::string& unqualName) const; Value* namedInput(Symbol name) const; - c10::optional get(Symbol name) const; + std::optional get(Symbol name) const; template - c10::optional get(Symbol name) const { + std::optional get(Symbol name) const { if (auto v = get(name)) { return v->template to(); } @@ -1208,7 +1208,7 @@ struct Graph : std::enable_shared_from_this { Node* insert_before_; int64_t predicted_insert_count_ = 0; - c10::optional op_version_; + std::optional op_version_; public: Graph(ScopePtr scope_root = c10::make_intrusive()) @@ -1261,11 +1261,11 @@ struct Graph : std::enable_shared_from_this { return current_scope_; } - void set_op_version(c10::optional version) { + void set_op_version(std::optional version) { op_version_ = version; } - c10::optional get_op_version() { + std::optional get_op_version() { return op_version_; } @@ -1368,8 +1368,8 @@ struct Graph : std::enable_shared_from_this { // Insert constant IValue into the graph. 
TORCH_API Value* insertConstant( const IValue& val, - c10::optional loc = c10::nullopt, - c10::optional scope = c10::nullopt); + std::optional loc = c10::nullopt, + std::optional scope = c10::nullopt); // Schema-driven insert: // This inserts a node into the graph with inputs determined from args and @@ -1382,7 +1382,7 @@ struct Graph : std::enable_shared_from_this { Symbol opname, at::ArrayRef args, at::ArrayRef kwargs = {}, - const c10::optional& range = {}); + const std::optional& range = {}); Node* appendNode(Node* n) { return block_->appendNode(n); @@ -1591,7 +1591,7 @@ struct TORCH_API PythonOp : public Node { // recover the autograd.Function instance, if this PythonOp's function // was originally SomeFunction.apply // used in ONNX for discovering symbolics - virtual c10::optional autogradFunction() const = 0; + virtual std::optional autogradFunction() const = 0; virtual void lint_python() const = 0; }; @@ -1730,7 +1730,7 @@ struct OperatorMap { return n->maybeOperator() && contains(n->getOperator()); } - c10::optional find(const Operator& op) { + std::optional find(const Operator& op) { const auto it = map.find(Symbol::fromQualString(op.schema().name())); if (it == map.end()) { return c10::nullopt; @@ -1806,7 +1806,7 @@ struct FunctionSchemaMap { return false; } - c10::optional find(const FunctionSchema& schema) const { + std::optional find(const FunctionSchema& schema) const { const auto it = map.find(Symbol::fromQualString(schema.name())); if (it == map.end()) { return c10::nullopt; diff --git a/torch/csrc/jit/ir/irparser.cpp b/torch/csrc/jit/ir/irparser.cpp index c37988e322a8d..06e0a66fa055c 100644 --- a/torch/csrc/jit/ir/irparser.cpp +++ b/torch/csrc/jit/ir/irparser.cpp @@ -169,7 +169,7 @@ void IRParser::parseOperatorOutputs(std::vector* outs) { ParsedLiteral IRParser::parseScalarLiteral(Node* n) { auto token = L.cur(); std::string str; - std::pair> type_alias; + std::pair> type_alias; ParsedLiteral r; switch (token.kind) { case TK_STRINGLITERAL: diff --git a/torch/csrc/jit/ir/named_value.h b/torch/csrc/jit/ir/named_value.h index ead3d73e9a86b..277e7f2699695 100644 --- a/torch/csrc/jit/ir/named_value.h +++ b/torch/csrc/jit/ir/named_value.h @@ -73,8 +73,8 @@ struct NamedValue { at::TypePtr type() const; private: - c10::optional loc_; - c10::optional name_; + std::optional loc_; + std::optional name_; Value* value_{nullptr}; // only valid if value_ == nullptr; IValue ivalue_; diff --git a/torch/csrc/jit/ir/scope.cpp b/torch/csrc/jit/ir/scope.cpp index dfb1ef36f359e..3ff1c22b8d119 100644 --- a/torch/csrc/jit/ir/scope.cpp +++ b/torch/csrc/jit/ir/scope.cpp @@ -113,7 +113,7 @@ InlinedCallStack::InlinedCallStack(Function* fn, SourceRange source_range) InlinedCallStack::InlinedCallStack( Function* fn, SourceRange source_range, - c10::optional module_instance_info) + std::optional module_instance_info) : fn_(fn), fn_name_(fn_ ? 
fn_->name() : ""), source_range_(std::move(source_range)), @@ -122,7 +122,7 @@ InlinedCallStack::InlinedCallStack( InlinedCallStack::InlinedCallStack( Function* fn, SourceRange source_range, - c10::optional module_instance_info, + std::optional module_instance_info, std::string& function_name) : fn_(fn), fn_name_(std::move(function_name)), @@ -142,7 +142,7 @@ InlinedCallStack::InlinedCallStack( InlinedCallStackPtr callee, Function* fn, SourceRange source_range, - c10::optional module_instance_info, + std::optional module_instance_info, std::string& function_name) : callee_(std::move(callee)), fn_(fn), @@ -154,22 +154,22 @@ InlinedCallStack::InlinedCallStack( InlinedCallStackPtr callee, Function* fn, SourceRange source_range, - c10::optional module_instance_info) + std::optional module_instance_info) : callee_(std::move(callee)), fn_(fn), fn_name_(fn_ ? fn_->name() : ""), source_range_(std::move(source_range)), module_instance_info_(std::move(module_instance_info)) {} -c10::optional InlinedCallStack::callee() const { +std::optional InlinedCallStack::callee() const { return callee_; } -void InlinedCallStack::setCallee(c10::optional callee) { +void InlinedCallStack::setCallee(std::optional callee) { callee_ = std::move(callee); } -c10::optional InlinedCallStack::module_instance() const { +std::optional InlinedCallStack::module_instance() const { return module_instance_info_; } @@ -187,7 +187,7 @@ const std::string& InlinedCallStack::function_name() const { std::vector InlinedCallStack::vec() { std::vector r; - c10::optional current = intrusive_from_this(); + std::optional current = intrusive_from_this(); while (current) { r.emplace_back( (*current)->fn_, diff --git a/torch/csrc/jit/ir/scope.h b/torch/csrc/jit/ir/scope.h index 423bbbd3ab2e1..5449803032238 100644 --- a/torch/csrc/jit/ir/scope.h +++ b/torch/csrc/jit/ir/scope.h @@ -120,11 +120,11 @@ struct ModuleInstanceInfo { */ using InlinedCallStackPtr = c10::intrusive_ptr; using InlinedCallStackEntry = - std::tuple>; + std::tuple>; struct TORCH_API InlinedCallStack : public c10::intrusive_ptr_target { private: - c10::optional callee_; + std::optional callee_; Function* fn_; // Reason for fn_name_ even though we have fn_ // Serialized callstack is used in circustmances where InlinedCallstack @@ -137,7 +137,7 @@ struct TORCH_API InlinedCallStack : public c10::intrusive_ptr_target { const std::string fn_name_; SourceRange source_range_; InlinedCallStackPtr intrusive_from_this(); - c10::optional module_instance_info_; + std::optional module_instance_info_; public: // Constructor for a leaf callstack node. @@ -147,13 +147,13 @@ struct TORCH_API InlinedCallStack : public c10::intrusive_ptr_target { InlinedCallStack( Function* fn, SourceRange source_range, - c10::optional module_instance_info); + std::optional module_instance_info); // Constructor for a leaf callstack node. InlinedCallStack( Function* fn, SourceRange source_range, - c10::optional module_instance_info, + std::optional module_instance_info, std::string& function_name); // Constructor for an inner callstack node. @@ -166,20 +166,20 @@ struct TORCH_API InlinedCallStack : public c10::intrusive_ptr_target { InlinedCallStackPtr callee, Function* fn, SourceRange source_range, - c10::optional module_instance_info); + std::optional module_instance_info); InlinedCallStack( InlinedCallStackPtr callee, Function* fn, SourceRange source_range, - c10::optional module_instance_info, + std::optional module_instance_info, std::string& function_name); // Return next element in the callstack list. 
- c10::optional callee() const; + std::optional callee() const; // Return module instance associated with the current element. - c10::optional module_instance() const; + std::optional module_instance() const; // Returns the source range of the node SourceRange source_range() const; @@ -191,7 +191,7 @@ struct TORCH_API InlinedCallStack : public c10::intrusive_ptr_target { // Return callstack as a vector of [Function, SourceRange] pairs. std::vector vec(); - void setCallee(c10::optional); + void setCallee(std::optional); bool operator==(const InlinedCallStack& rhs) const { // No need to compare fn_, since source_range equivalence check diff --git a/torch/csrc/jit/mobile/compatibility/model_compatibility.cpp b/torch/csrc/jit/mobile/compatibility/model_compatibility.cpp index de120c8fa1e87..1980023e8fc4a 100644 --- a/torch/csrc/jit/mobile/compatibility/model_compatibility.cpp +++ b/torch/csrc/jit/mobile/compatibility/model_compatibility.cpp @@ -31,7 +31,7 @@ using caffe2::serialize::ReadAdapterInterface; c10::IValue readArchive( const std::string& archive_name, PyTorchStreamReader& stream_reader) { - c10::optional device; + std::optional device; std::shared_ptr compilation_unit = std::make_shared(); diff --git a/torch/csrc/jit/mobile/compatibility/runtime_compatibility.cpp b/torch/csrc/jit/mobile/compatibility/runtime_compatibility.cpp index b3516e5bafc80..1cda81045b81a 100644 --- a/torch/csrc/jit/mobile/compatibility/runtime_compatibility.cpp +++ b/torch/csrc/jit/mobile/compatibility/runtime_compatibility.cpp @@ -53,7 +53,7 @@ std::unordered_map _get_runtime_ops_and_info() { for (auto& op : dispatcherOperators) { // grab schema const auto op_handle = c10::Dispatcher::singleton().findOp(op); - c10::optional num_schema_args; + std::optional num_schema_args; if (op_handle->hasSchema()) { num_schema_args = op_handle->schema().arguments().size(); } diff --git a/torch/csrc/jit/mobile/compatibility/runtime_compatibility.h b/torch/csrc/jit/mobile/compatibility/runtime_compatibility.h index 13adf04c0cc9d..2e65f1f38bd8d 100644 --- a/torch/csrc/jit/mobile/compatibility/runtime_compatibility.h +++ b/torch/csrc/jit/mobile/compatibility/runtime_compatibility.h @@ -13,7 +13,7 @@ namespace jit { // Struct storing metadata of an operator that can be useful for versioning struct OperatorInfo { // The number of arguments within the schema of the op - c10::optional num_schema_args; + std::optional num_schema_args; }; struct RuntimeCompatibilityInfo { diff --git a/torch/csrc/jit/mobile/flatbuffer_loader.cpp b/torch/csrc/jit/mobile/flatbuffer_loader.cpp index f906f4e2b9eb4..239deb76d2673 100644 --- a/torch/csrc/jit/mobile/flatbuffer_loader.cpp +++ b/torch/csrc/jit/mobile/flatbuffer_loader.cpp @@ -359,7 +359,7 @@ std::unique_ptr FlatbufferLoader::parseFunction( (operator_version < caffe2::serialize::kProducedFileFormatVersion); for (const auto* op : *method->operators()) { - c10::optional num_args = c10::nullopt; + std::optional num_args = c10::nullopt; if (op->num_args_serialized() > -1) { num_args = op->num_args_serialized(); } @@ -752,7 +752,7 @@ void FlatbufferLoader::extractJitSourceAndConstants( mobile::Module parse_and_initialize_mobile_module( void* data, size_t size, - c10::optional, + std::optional, ExtraFilesMap* extra_files, bool should_copy_tensor_memory) { // TODO(T128189662): If not copying, enforce that data is aligned to @@ -781,7 +781,7 @@ mobile::Module parse_and_initialize_mobile_module( mobile::Module parse_and_initialize_mobile_module( std::shared_ptr data, size_t size, - c10::optional device, 
+ std::optional device, ExtraFilesMap* extra_files) { mobile::Module m = parse_and_initialize_mobile_module( data.get(), @@ -798,7 +798,7 @@ mobile::Module parse_and_initialize_mobile_module_for_jit( size_t size, ExtraFilesMap& jit_sources, std::vector& jit_constants, - c10::optional, + std::optional, ExtraFilesMap* extra_files) { TORCH_CHECK( mobile::serialization::ModuleBufferHasIdentifier(data), "Format error"); @@ -825,7 +825,7 @@ mobile::Module parse_and_initialize_mobile_module_for_jit( mobile::Module load_mobile_module_from_file( const std::string& filename, - c10::optional device, + std::optional device, ExtraFilesMap* extra_files) { auto [data, size] = get_file_content(filename.c_str()); return parse_and_initialize_mobile_module( @@ -885,7 +885,7 @@ mobile::ModuleInfo get_module_info_from_flatbuffer(char* flatbuffer_content) { mobile::Module load_mobile_module_from_stream_with_copy( std::istream& in, - c10::optional device, + std::optional device, ExtraFilesMap* extra_files) { auto [data, size] = get_stream_content(in); return parse_and_initialize_mobile_module( @@ -895,7 +895,7 @@ mobile::Module load_mobile_module_from_stream_with_copy( mobile::Module parse_flatbuffer_no_object( std::shared_ptr data, size_t size, - c10::optional device) { + std::optional device) { (void)device; (void)size; diff --git a/torch/csrc/jit/mobile/flatbuffer_loader.h b/torch/csrc/jit/mobile/flatbuffer_loader.h index f29fe5b2e4942..9ac9636f3f14b 100644 --- a/torch/csrc/jit/mobile/flatbuffer_loader.h +++ b/torch/csrc/jit/mobile/flatbuffer_loader.h @@ -58,7 +58,7 @@ using ExtraFilesMap = std::unordered_map; TORCH_API mobile::Module parse_and_initialize_mobile_module( void* data, size_t size, // of `data`, in bytes. - c10::optional device = c10::nullopt, + std::optional device = c10::nullopt, ExtraFilesMap* extra_files = nullptr, bool should_copy_tensor_memory = false); @@ -74,7 +74,7 @@ TORCH_API mobile::Module parse_and_initialize_mobile_module( TORCH_API mobile::Module parse_and_initialize_mobile_module( std::shared_ptr data, size_t size, // of `data`, in bytes. - c10::optional device = c10::nullopt, + std::optional device = c10::nullopt, ExtraFilesMap* extra_files = nullptr); // Parse a mobile::Module from raw bytes, also returning JIT-related metadata. @@ -87,7 +87,7 @@ TORCH_API mobile::Module parse_and_initialize_mobile_module_for_jit( size_t size, // of `data`, in bytes. ExtraFilesMap& jit_sources, std::vector& jit_constants, - c10::optional device = c10::nullopt, + std::optional device = c10::nullopt, ExtraFilesMap* extra_files = nullptr); // Load a mobile::Module from a filepath. @@ -100,7 +100,7 @@ TORCH_API mobile::Module parse_and_initialize_mobile_module_for_jit( // directly. 
TORCH_API mobile::Module load_mobile_module_from_file( const std::string& filename, - c10::optional device = c10::nullopt, + std::optional device = c10::nullopt, ExtraFilesMap* extra_files = nullptr); TORCH_API uint64_t get_bytecode_version(std::istream& in); @@ -114,18 +114,18 @@ TORCH_API mobile::ModuleInfo get_module_info_from_flatbuffer( // its entirity to a buffer TORCH_API mobile::Module load_mobile_module_from_stream_with_copy( std::istream& in, - c10::optional device = c10::nullopt, + std::optional device = c10::nullopt, ExtraFilesMap* extra_files = nullptr); TORCH_API mobile::Module parse_flatbuffer_no_object( std::shared_ptr data, size_t size, - c10::optional device); + std::optional device); TORCH_API mobile::Module parse_and_initialize_mobile_module( void* data, size_t, - c10::optional, + std::optional, ExtraFilesMap* extra_files, bool should_copy_tensor_memory); diff --git a/torch/csrc/jit/mobile/frame.h b/torch/csrc/jit/mobile/frame.h index 2db12f7d19374..45c51fef0085e 100644 --- a/torch/csrc/jit/mobile/frame.h +++ b/torch/csrc/jit/mobile/frame.h @@ -32,11 +32,11 @@ class Frame { return code_.instructions_.at(pc_); } - c10::optional getDebugHandle() const { + std::optional getDebugHandle() const { return getDebugHandle(pc_); } - c10::optional getDebugHandle(size_t pc) const { + std::optional getDebugHandle(size_t pc) const { if (pc >= code_.debug_handles_.size()) { return {}; } diff --git a/torch/csrc/jit/mobile/function.cpp b/torch/csrc/jit/mobile/function.cpp index b410bf7765cc7..36f19fb1fac41 100644 --- a/torch/csrc/jit/mobile/function.cpp +++ b/torch/csrc/jit/mobile/function.cpp @@ -47,7 +47,7 @@ void Function::append_instruction(OpCode op, int X, int N) { void Function::append_operator( const std::string& name, const std::string& overload_name, - const c10::optional& num_specified_args) { + const std::optional& num_specified_args) { // Keep the original opname in code_ code_.op_names_.emplace_back(name, overload_name); code_.operator_input_sizes_.emplace_back(num_specified_args.value_or(-1)); @@ -71,8 +71,8 @@ bool Function::initialize_operators(bool should_check_operators) { for (unsigned i = 0; i < code_.op_names_.size(); i++) { const auto& opname = code_.op_names_[i]; int num_args = code_.operator_input_sizes_[i]; - c10::optional num_specified_args = - num_args < 0 ? c10::nullopt : c10::optional(num_args); + std::optional num_specified_args = + num_args < 0 ? 
c10::nullopt : std::optional(num_args); auto func = makeOperatorFunction(opname, num_specified_args); if (!func.has_value()) { unsupported_op_names.insert(operator_str(opname)); @@ -165,9 +165,9 @@ const std::vector& Function::getExceptionDebugHandles() const { return getInterpretersExceptionDebugHandles(); } -c10::optional> makeOperatorFunction( +std::optional> makeOperatorFunction( c10::OperatorName opname, - c10::optional num_specified_args) { + std::optional num_specified_args) { std::function fn; const auto full_name = c10::toString(opname); const std::vector* pArgs = nullptr; diff --git a/torch/csrc/jit/mobile/function.h b/torch/csrc/jit/mobile/function.h index fb6f77fa64d76..42065d4a1c1b0 100644 --- a/torch/csrc/jit/mobile/function.h +++ b/torch/csrc/jit/mobile/function.h @@ -37,7 +37,7 @@ class TORCH_API Function : public torch::jit::Function { void append_operator( const std::string& name, const std::string& overload_name, - const c10::optional& num_specified_args); + const std::optional& num_specified_args); void append_constant(const c10::IValue& constant); void append_type(const c10::TypePtr& type); void append_function(mobile::Function& func); @@ -75,9 +75,9 @@ class TORCH_API Function : public torch::jit::Function { at::optional schema_; // (byte-code version 4+) }; -c10::optional> makeOperatorFunction( +std::optional> makeOperatorFunction( c10::OperatorName opname, - c10::optional num_specified_args); + std::optional num_specified_args); TORCH_API std::string operator_str(const c10::OperatorName& opname); diff --git a/torch/csrc/jit/mobile/import.cpp b/torch/csrc/jit/mobile/import.cpp index a82e7d69366ec..96ff6c88779d9 100644 --- a/torch/csrc/jit/mobile/import.cpp +++ b/torch/csrc/jit/mobile/import.cpp @@ -191,12 +191,12 @@ class BytecodeDeserializer final { explicit BytecodeDeserializer( std::unique_ptr reader, uint64_t module_load_options = 0); - mobile::Module deserialize(c10::optional device); + mobile::Module deserialize(std::optional device); mobile::Module deserialize( - c10::optional device, + std::optional device, ExtraFilesMap& extra_files); void deserialize_only_extra( - c10::optional device, + std::optional device, ExtraFilesMap& extra_files); private: @@ -204,7 +204,7 @@ class BytecodeDeserializer final { void init_upgrader(mobile::Function* function); void parseMethods( c10::ivalue::TupleElements&& vals, - c10::optional&& debug_handles, + std::optional&& debug_handles, mobile::CompilationUnit& mcu); c10::IValue readArchive( const std::string& archive_name, @@ -217,7 +217,7 @@ class BytecodeDeserializer final { std::shared_ptr compilation_unit_; std::unordered_set imported_libs_; std::unique_ptr reader_{}; - c10::optional device_; + std::optional device_; uint64_t module_load_options_; // From `version` or `.data/version` in model.ptl and it's compute // dynamically. It's used for finding the minimum required runtime to run all @@ -305,7 +305,7 @@ void BytecodeDeserializer::init_upgrader(mobile::Function* function) { void BytecodeDeserializer::parseMethods( c10::ivalue::TupleElements&& vals, - c10::optional&& debug_handles, + std::optional&& debug_handles, mobile::CompilationUnit& mcu) { TORCH_CHECK(!vals.empty(), "Bytecode has no elements. 
"); // Initialized with the version number when kProducedBytecodeVersion was @@ -417,7 +417,7 @@ void BytecodeDeserializer::parseMethods( } void BytecodeDeserializer::deserialize_only_extra( - c10::optional device, + std::optional device, ExtraFilesMap& extra_files) { device_ = device; for (const auto& kv : extra_files) { @@ -431,14 +431,14 @@ void BytecodeDeserializer::deserialize_only_extra( } mobile::Module BytecodeDeserializer::deserialize( - c10::optional device, + std::optional device, ExtraFilesMap& extra_files) { deserialize_only_extra(device, extra_files); return deserialize(device); } mobile::Module BytecodeDeserializer::deserialize( - c10::optional device) { + std::optional device) { device_ = device; auto mcu = std::make_shared(); @@ -453,7 +453,7 @@ mobile::Module BytecodeDeserializer::deserialize( // auto bvals = std::move(readArchive("bytecode", mcu).toTupleRef()).elements(); - c10::optional debug_handles; + std::optional debug_handles; bool has_debug_handles{false}; if (reader_->hasRecord("mobile_debug_handles.pkl")) { debug_handles = @@ -504,7 +504,7 @@ c10::IValue BytecodeDeserializer::readArchive( mobile::Module _load_for_mobile_impl( std::unique_ptr rai, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, uint64_t module_load_options) { auto observer = torch::observerConfig().getModuleObserver(); @@ -577,7 +577,7 @@ mobile::Module _load_for_mobile_impl( mobile::Module _load_mobile_from_bytes( const std::shared_ptr& data, size_t size, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, uint64_t module_load_options) { TORCH_CHECK(size >= kFileFormatHeaderSize, "Format error"); @@ -603,28 +603,28 @@ mobile::Module _load_mobile_from_bytes( mobile::Module _load_for_mobile( std::istream& in, - c10::optional device) { + std::optional device) { ExtraFilesMap extra_files; return _load_for_mobile(in, device, extra_files); } mobile::Module _load_for_mobile( const std::string& filename, - c10::optional device) { + std::optional device) { ExtraFilesMap extra_files; return _load_for_mobile(filename, device, extra_files); } mobile::Module _load_for_mobile( std::unique_ptr rai, - c10::optional device) { + std::optional device) { ExtraFilesMap extra_files; return _load_for_mobile(std::move(rai), device, extra_files); } mobile::Module _load_for_mobile( std::istream& in, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, uint64_t module_load_options) { if (getFileFormat(in) == FileFormat::FlatbufferFileFormat) { @@ -640,7 +640,7 @@ mobile::Module _load_for_mobile( mobile::Module _load_for_mobile( const std::string& filename, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files) { return _load_for_mobile( filename, device, extra_files, kDefaultMobileLoadOptions); @@ -648,7 +648,7 @@ mobile::Module _load_for_mobile( mobile::Module _load_for_mobile( const std::string& filename, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, uint64_t module_load_options) { auto format = getFileFormat(filename); @@ -666,7 +666,7 @@ mobile::Module _load_for_mobile( TORCH_API mobile::Module _load_for_mobile( std::unique_ptr rai, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, uint64_t module_load_options) { // TODO optimize file read for non-flatbuffer models @@ -677,7 +677,7 @@ TORCH_API mobile::Module _load_for_mobile( void _load_extra_only_for_mobile( const std::string& filename, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files) { 
auto observer = torch::observerConfig().getModuleObserver(); // NOLINTNEXTLINE(clang-analyzer-security.insecureAPI.rand) diff --git a/torch/csrc/jit/mobile/import.h b/torch/csrc/jit/mobile/import.h index 26bc112f9a760..77a801e62571d 100644 --- a/torch/csrc/jit/mobile/import.h +++ b/torch/csrc/jit/mobile/import.h @@ -22,38 +22,38 @@ constexpr const char* kArchiveNameVersion = "version"; // into a mobile::Module object. TORCH_API mobile::Module _load_for_mobile( std::istream& in, - c10::optional device, + std::optional device, ExtraFilesMap& extra_file, uint64_t module_load_options = kDefaultMobileLoadOptions); TORCH_API mobile::Module _load_for_mobile( const std::string& filename, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files); TORCH_API mobile::Module _load_for_mobile( std::unique_ptr rai, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, uint64_t module_load_options = kDefaultMobileLoadOptions); TORCH_API mobile::Module _load_for_mobile( const std::string& filename, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, uint64_t module_load_options); TORCH_API mobile::Module _load_for_mobile( std::istream& in, - c10::optional device = c10::nullopt); + std::optional device = c10::nullopt); TORCH_API mobile::Module _load_for_mobile( const std::string& filename, - c10::optional device = c10::nullopt); + std::optional device = c10::nullopt); TORCH_API mobile::Module _load_for_mobile( std::unique_ptr rai, - c10::optional device = c10::nullopt); + std::optional device = c10::nullopt); /** * Load only the contents of the "extra/" files whose names are @@ -69,7 +69,7 @@ TORCH_API mobile::Module _load_for_mobile( */ void _load_extra_only_for_mobile( const std::string& filename, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files); // Currently used by both mobile/import.cpp and model_compatibility.cpp. 
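[Editor's note, not part of the patch] The import.h hunks above only respell the optional type in the mobile loader entry points; the call pattern is unchanged. Below is a minimal, hypothetical caller sketch showing those entry points once the device parameter is std::optional<at::Device> (the template arguments are elided in the extracted diff; at::Device matches the upstream declarations). Header paths and function names are taken from the diff itself; the example assumes c10::optional remains an alias of std::optional in this tree, so c10::nullopt and std::nullopt stay interchangeable at call sites.

#include <torch/csrc/jit/mobile/import.h>
#include <torch/csrc/jit/mobile/module.h>
#include <c10/core/Device.h>

#include <optional>
#include <string>

// Load a mobile module and pin it to CPU. Passing std::nullopt instead
// leaves device resolution to the loader.
torch::jit::mobile::Module load_on_cpu(const std::string& path) {
  std::optional<at::Device> device = at::Device(at::kCPU);
  return torch::jit::_load_for_mobile(path, device);
}

Because the two optional spellings alias the same type, existing callers that still pass c10::nullopt continue to compile; only the declarations and definitions are rewritten by this patch.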
diff --git a/torch/csrc/jit/mobile/import_data.cpp b/torch/csrc/jit/mobile/import_data.cpp index 11fbcbc45e3f2..32825f1f5e17f 100644 --- a/torch/csrc/jit/mobile/import_data.cpp +++ b/torch/csrc/jit/mobile/import_data.cpp @@ -40,13 +40,13 @@ namespace { class IValueUnpickler final { public: explicit IValueUnpickler(std::unique_ptr reader); - c10::IValue deserialize(c10::optional device); + c10::IValue deserialize(std::optional device); private: c10::IValue readArchive( const std::string& archive_name, std::shared_ptr mcu, - c10::optional device); + std::optional device); std::shared_ptr compilation_unit_; std::unique_ptr reader_; @@ -56,7 +56,7 @@ IValueUnpickler::IValueUnpickler(std::unique_ptr reader) : compilation_unit_(std::make_shared()), reader_(std::move(reader)) {} -c10::IValue IValueUnpickler::deserialize(c10::optional device) { +c10::IValue IValueUnpickler::deserialize(std::optional device) { auto mcu = std::make_shared(); // NOLINTNEXTLINE(performance-move-const-arg) @@ -66,7 +66,7 @@ c10::IValue IValueUnpickler::deserialize(c10::optional device) { c10::IValue IValueUnpickler::readArchive( const std::string& archive_name, std::shared_ptr mcu, - c10::optional device) { + std::optional device) { std::stringstream picklename; picklename << archive_name << ".pkl"; at::DataPtr pickle_ptr; @@ -169,7 +169,7 @@ c10::IValue IValueUnpickler::readArchive( */ std::map load_parameters_from_zip( std::unique_ptr rai, - c10::optional device) { + std::optional device) { auto reader = std::make_unique(std::move(rai)); IValueUnpickler unpickler(std::move(reader)); auto result = unpickler.deserialize(device).toGenericDict(); @@ -241,7 +241,7 @@ std::map mobile_module_to_parameter_map( static std::map _load_parameters_bytes( std::shared_ptr data, size_t size, - c10::optional device) { + std::optional device) { TORCH_CHECK(size >= kFileFormatHeaderSize, "Unrecognized data format"); FileFormat format = getFileFormat(data.get()); // Call the appropriate parser. @@ -268,14 +268,14 @@ static std::map _load_parameters_bytes( std::map _load_parameters( std::istream& in, - c10::optional device) { + std::optional device) { auto [data, size] = get_stream_content(in); return _load_parameters_bytes(std::move(data), size, device); } std::map _load_parameters( const std::string& filename, - c10::optional device) { + std::optional device) { auto [data, size] = get_file_content(filename.c_str()); return _load_parameters_bytes(std::move(data), size, device); } diff --git a/torch/csrc/jit/mobile/import_data.h b/torch/csrc/jit/mobile/import_data.h index f3eb202b7f00a..25e1fd81341c1 100644 --- a/torch/csrc/jit/mobile/import_data.h +++ b/torch/csrc/jit/mobile/import_data.h @@ -19,7 +19,7 @@ namespace jit { */ TORCH_API std::map _load_parameters( std::istream& in, - c10::optional device = c10::nullopt); + std::optional device = c10::nullopt); /** * Loads named parameters from the serialized data in @p filename. @@ -28,7 +28,7 @@ TORCH_API std::map _load_parameters( */ TORCH_API std::map _load_parameters( const std::string& filename, - c10::optional device = c10::nullopt); + std::optional device = c10::nullopt); // NOTE: Please prefer using _load_parameters over using the function below. 
TORCH_API std::map mobile_module_to_parameter_map( diff --git a/torch/csrc/jit/mobile/model_tracer/OperatorCallTracer.cpp b/torch/csrc/jit/mobile/model_tracer/OperatorCallTracer.cpp index 0da724ade0bf8..c273b41537e40 100644 --- a/torch/csrc/jit/mobile/model_tracer/OperatorCallTracer.cpp +++ b/torch/csrc/jit/mobile/model_tracer/OperatorCallTracer.cpp @@ -10,7 +10,7 @@ OperatorCallTracer::OperatorCallTracer() { auto recorder_cb = [](const at::RecordFunction& fn) -> std::unique_ptr { - c10::optional op_name = fn.operator_name(); + std::optional op_name = fn.operator_name(); if (op_name.has_value()) { getCalledOperators().withLock( [op_name](std::set& called_operators) { diff --git a/torch/csrc/jit/mobile/module.cpp b/torch/csrc/jit/mobile/module.cpp index 55ec47d8e9387..23dfe9ff36785 100644 --- a/torch/csrc/jit/mobile/module.cpp +++ b/torch/csrc/jit/mobile/module.cpp @@ -46,7 +46,7 @@ Method Module::get_method(const std::string& name) const { bool Module::compareMethodSchemas( const std::string& name_1, const std::string& name_2) { - c10::optional schema_1, schema_2; + std::optional schema_1, schema_2; for (const auto& fn : cu_->methods()) { if (fn->name() == name_1) { schema_1 = fn->getSchema(); @@ -87,7 +87,7 @@ void Module::unsafeCopyMethod( cu_->register_function(std::move(new_fn)); } -c10::optional Module::find_method(const std::string& basename) const { +std::optional Module::find_method(const std::string& basename) const { for (const auto& fn : cu_->methods()) { if (fn->name() == basename) { return c10::make_optional(Method(this, fn.get())); @@ -316,7 +316,7 @@ c10::IValue Method::operator()(std::vector stack) const { return stack.front(); } -static c10::optional print_type(const c10::Type& t) { +static std::optional print_type(const c10::Type& t) { auto namedType = t.cast(); if (namedType && namedType->name()) { return namedType->name().value().qualifiedName(); diff --git a/torch/csrc/jit/mobile/module.h b/torch/csrc/jit/mobile/module.h index 5e5d87f946355..3d37c7dc436ad 100644 --- a/torch/csrc/jit/mobile/module.h +++ b/torch/csrc/jit/mobile/module.h @@ -76,7 +76,7 @@ class TORCH_API Module { c10::IValue forward(std::vector inputs) { return get_method("forward")(std::move(inputs)); } - c10::optional find_method(const std::string& basename) const; + std::optional find_method(const std::string& basename) const; const std::string name() const { return object_->name(); diff --git a/torch/csrc/jit/mobile/nnc/aot_compiler.cpp b/torch/csrc/jit/mobile/nnc/aot_compiler.cpp index 3b3fb8af6185a..1f7ba264048ff 100644 --- a/torch/csrc/jit/mobile/nnc/aot_compiler.cpp +++ b/torch/csrc/jit/mobile/nnc/aot_compiler.cpp @@ -328,7 +328,7 @@ static std::string getNncKernelFuncName( static std::pair, std::vector> preprocessGraphPasses( std::shared_ptr& graph, - const std::vector>& example_inputs, + const std::vector>& example_inputs, const std::vector& dynamic_sizes) { GRAPH_DEBUG("Before preprocessing graph passes: ", *graph); torch::jit::RemoveTensorMutation(graph); @@ -368,11 +368,11 @@ preprocessGraphPasses( return std::make_pair(graph, sym_val); } -static std::vector> generateExampleInputs( +static std::vector> generateExampleInputs( const std::vector>& inputShapes, const std::vector& inputTypes, const std::vector& inputMemoryFormats) { - std::vector> example_inputs; + std::vector> example_inputs; example_inputs.reserve(inputShapes.size()); for (const auto i : c10::irange(inputShapes.size())) { const auto dtype = at::dtype(inputTypes[i]); diff --git a/torch/csrc/jit/mobile/nnc/context.h 
b/torch/csrc/jit/mobile/nnc/context.h index ddc179740549e..3976d28ec8944 100644 --- a/torch/csrc/jit/mobile/nnc/context.h +++ b/torch/csrc/jit/mobile/nnc/context.h @@ -47,8 +47,8 @@ struct TORCH_API OutputSpec { std::vector sizes_; c10::ScalarType dtype_{c10::ScalarType::Undefined}; - c10::optional qscale_; - c10::optional qzero_; + std::optional qscale_; + std::optional qzero_; }; // Hold the temporary buffers / states needed during the execution. diff --git a/torch/csrc/jit/mobile/parse_operators.cpp b/torch/csrc/jit/mobile/parse_operators.cpp index 03415657c780b..c260a2e5d832a 100644 --- a/torch/csrc/jit/mobile/parse_operators.cpp +++ b/torch/csrc/jit/mobile/parse_operators.cpp @@ -16,7 +16,7 @@ void parseOperators( "There should be either two parts (name and overload name), ", "or three parts (name, overload name and number of specified args) ", "for an operator"); - c10::optional num_args; + std::optional num_args; if (op_item.size() > 2) { num_args = op_item[2].toInt(); } diff --git a/torch/csrc/jit/mobile/promoted_prim_ops.cpp b/torch/csrc/jit/mobile/promoted_prim_ops.cpp index 7ee8140b931c5..8e49749042424 100644 --- a/torch/csrc/jit/mobile/promoted_prim_ops.cpp +++ b/torch/csrc/jit/mobile/promoted_prim_ops.cpp @@ -24,7 +24,7 @@ void raiseException(Stack& stack) { void raiseExceptionWithMessage(Stack& stack) { // this kernel supports RaiseException with only two arguments: the error and // the message Please make changes only to this kernel - c10::optional qualified_class_name = + std::optional qualified_class_name = pop(stack).toOptional(); std::string message; pop(stack, message); @@ -116,9 +116,9 @@ void toPrimDType(Stack& stack) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) bool copy; pop(stack, non_blocking, copy); - c10::optional scalarType = + std::optional scalarType = pop(stack).toOptional(); - c10::optional device = c10::nullopt; + std::optional device = c10::nullopt; at::Tensor self = pop(stack).toTensor(); push(stack, to_dispatch(self, device, scalarType, non_blocking, copy)); } diff --git a/torch/csrc/jit/mobile/register_ops_common_utils.h b/torch/csrc/jit/mobile/register_ops_common_utils.h index b0ecaf055f5ee..904e8786b1611 100644 --- a/torch/csrc/jit/mobile/register_ops_common_utils.h +++ b/torch/csrc/jit/mobile/register_ops_common_utils.h @@ -17,8 +17,8 @@ int64_t normalizeIndex(int64_t idx, int64_t list_size); // reference function THPVariable_to in python_variable_methods.cpp static C10_UNUSED at::Tensor to_dispatch( at::Tensor self, - c10::optional device, - c10::optional scalarType, + std::optional device, + std::optional scalarType, bool non_blocking, bool copy) { if (device && device->is_cuda()) { diff --git a/torch/csrc/jit/mobile/upgrader_mobile.h b/torch/csrc/jit/mobile/upgrader_mobile.h index f339484214f8b..68094a62ceabb 100644 --- a/torch/csrc/jit/mobile/upgrader_mobile.h +++ b/torch/csrc/jit/mobile/upgrader_mobile.h @@ -28,7 +28,7 @@ getOperatorVersionMapForMobile(); struct OperatorString { const std::string name; const std::string overload_name; - const c10::optional num_specified_args; + const std::optional num_specified_args; }; struct ByteCodeFunctionWithOperator { diff --git a/torch/csrc/jit/operator_upgraders/utils.cpp b/torch/csrc/jit/operator_upgraders/utils.cpp index 2cfd7c0559fe0..fef7b92c83c95 100644 --- a/torch/csrc/jit/operator_upgraders/utils.cpp +++ b/torch/csrc/jit/operator_upgraders/utils.cpp @@ -10,7 +10,7 @@ namespace torch::jit { -c10::optional findUpgrader( +std::optional findUpgrader( const std::vector& upgraders_for_schema, 
size_t current_version) { // we want to find the entry which satisfies following two conditions: @@ -51,7 +51,7 @@ bool isOpSymbolCurrent(const std::string& name, size_t current_version) { std::vector loadPossibleHistoricOps( const std::string& name, - c10::optional version) { + std::optional version) { std::vector possibleSchemas; if (!version.has_value()) { diff --git a/torch/csrc/jit/operator_upgraders/utils.h b/torch/csrc/jit/operator_upgraders/utils.h index 78cb31b4bf60e..a30b8c1182b9c 100644 --- a/torch/csrc/jit/operator_upgraders/utils.h +++ b/torch/csrc/jit/operator_upgraders/utils.h @@ -16,7 +16,7 @@ struct UpgraderRange { // Given a list of upgrader entries for a single operator // and the model version for that operator, find a valid // upgrader. -TORCH_API c10::optional findUpgrader( +TORCH_API std::optional findUpgrader( const std::vector& upgraders_for_schema, size_t current_version); @@ -39,7 +39,7 @@ TORCH_API bool isOpSymbolCurrent( // can be multiple schemas for different overloads. TORCH_API std::vector loadPossibleHistoricOps( const std::string& name, - c10::optional version); + std::optional version); TORCH_API uint64_t getMaxOperatorVersion(); diff --git a/torch/csrc/jit/passes/autocast.cpp b/torch/csrc/jit/passes/autocast.cpp index 213f569f87b02..635162e049531 100644 --- a/torch/csrc/jit/passes/autocast.cpp +++ b/torch/csrc/jit/passes/autocast.cpp @@ -60,7 +60,7 @@ bool isAutocastNode(Value* value) { // 2. `prim::SetAttr` must follow `prim::CreateObject()` in the same block, // but there might be other nodes in between // -c10::optional parseAutocast( +std::optional parseAutocast( Value* value, const AutocastContext& context) { if (!isAutocastNode(value)) { @@ -71,7 +71,7 @@ c10::optional parseAutocast( AutocastScope scope; scope.instance = value; scope.context = context; - c10::optional enabled; + std::optional enabled; std::string device; c10::ScalarType dtype = c10::ScalarType::Undefined; for (Use use : value->uses()) { @@ -269,7 +269,7 @@ void updateAutocastEnabledCheck(Node* node, bool is_jit_enabled) { void handleBlock(Block* block, AutocastContext initial_state) { std::stack autocast_stack; - c10::optional incompatible_amp = c10::nullopt; + std::optional incompatible_amp = c10::nullopt; // The current autocast enabled/disabled state auto current_state = [&] { diff --git a/torch/csrc/jit/passes/canonicalize.cpp b/torch/csrc/jit/passes/canonicalize.cpp index 5a5b867a36d09..20a883a8d06fd 100644 --- a/torch/csrc/jit/passes/canonicalize.cpp +++ b/torch/csrc/jit/passes/canonicalize.cpp @@ -142,7 +142,7 @@ bool isBeforeOrAfter(const Use& a, const Use& b, bool checking_before) { return checking_before ? 
isBefore(a, b) : isAfter(a, b); } -c10::optional firstOrLastUse(Value* v, bool find_first) { +std::optional firstOrLastUse(Value* v, bool find_first) { if (v->uses().empty()) { return c10::nullopt; } @@ -157,9 +157,9 @@ c10::optional firstOrLastUse(Value* v, bool find_first) { return extreme_use; } -static std::vector> gatherFirstUses( +static std::vector> gatherFirstUses( at::ArrayRef values) { - return fmap(values, [&](Value* v) -> c10::optional { + return fmap(values, [&](Value* v) -> std::optional { return firstOrLastUse(v, true); }); } @@ -169,7 +169,7 @@ static std::vector sort_indexes(at::ArrayRef values) { std::vector idx(values.size()); std::iota(idx.begin(), idx.end(), 0); - std::vector> first_uses = gatherFirstUses(values); + std::vector> first_uses = gatherFirstUses(values); // Sort values based on canonical ordering of their first usage std::sort(idx.begin(), idx.end(), [&first_uses](size_t i1, size_t i2) { diff --git a/torch/csrc/jit/passes/canonicalize.h b/torch/csrc/jit/passes/canonicalize.h index 46d90d1a515f6..b84cdd9f6a355 100644 --- a/torch/csrc/jit/passes/canonicalize.h +++ b/torch/csrc/jit/passes/canonicalize.h @@ -11,7 +11,7 @@ TORCH_API std::shared_ptr Canonicalize( TORCH_API void CanonicalizeOutputs(std::shared_ptr& graph); -TORCH_API c10::optional firstOrLastUse(Value* v, bool find_first); +TORCH_API std::optional firstOrLastUse(Value* v, bool find_first); TORCH_API bool isBeforeOrAfter( const Use& a, diff --git a/torch/csrc/jit/passes/canonicalize_graph_fuser_ops.cpp b/torch/csrc/jit/passes/canonicalize_graph_fuser_ops.cpp index a8d7c75fbe7f3..72d419eeb9c16 100644 --- a/torch/csrc/jit/passes/canonicalize_graph_fuser_ops.cpp +++ b/torch/csrc/jit/passes/canonicalize_graph_fuser_ops.cpp @@ -12,7 +12,7 @@ struct ChunkOutput { size_t offset; }; -static c10::optional> getChunkOutputs(Node* chunk) { +static std::optional> getChunkOutputs(Node* chunk) { std::vector outputs; for (auto list_use : chunk->output()->uses()) { if (list_use.user->matches( diff --git a/torch/csrc/jit/passes/constant_propagation.cpp b/torch/csrc/jit/passes/constant_propagation.cpp index cd3fb6b1e2b06..6334cd75faa90 100644 --- a/torch/csrc/jit/passes/constant_propagation.cpp +++ b/torch/csrc/jit/passes/constant_propagation.cpp @@ -19,7 +19,7 @@ namespace torch { namespace jit { -c10::optional> runNodeIfInputsAreConstant( +std::optional> runNodeIfInputsAreConstant( const Node* n, bool ignore_custom_classes, AliasDb* db) { diff --git a/torch/csrc/jit/passes/constant_propagation.h b/torch/csrc/jit/passes/constant_propagation.h index 62293c8d7abc9..2200acfa39ede 100644 --- a/torch/csrc/jit/passes/constant_propagation.h +++ b/torch/csrc/jit/passes/constant_propagation.h @@ -23,7 +23,7 @@ TORCH_API bool ConstantPropagationImmutableTypes(std::shared_ptr& graph); // make their own determination if constant prop is appropriate - for example // non-deterministic ops or ops with side effects. If ignore_custom_classes is // specified, nodes that output user defined classes are not run. 
-TORCH_API c10::optional runNodeIfInputsAreConstant( +TORCH_API std::optional runNodeIfInputsAreConstant( const Node* node, bool ignore_custom_classes = false, AliasDb* db = nullptr); diff --git a/torch/csrc/jit/passes/create_autodiff_subgraphs.cpp b/torch/csrc/jit/passes/create_autodiff_subgraphs.cpp index 162487201da7b..c5fe65537669a 100644 --- a/torch/csrc/jit/passes/create_autodiff_subgraphs.cpp +++ b/torch/csrc/jit/passes/create_autodiff_subgraphs.cpp @@ -281,7 +281,7 @@ class SubgraphSlicer { // Try to merge `producer` into `consumer`. If successful, this destroys // `producer` and returns the `consumer` group. - c10::optional tryMerge(Node* consumer, Node* producer) { + std::optional tryMerge(Node* consumer, Node* producer) { AT_ASSERT(consumer->kind() == prim::DifferentiableGraph); bool canMerge = shouldConsiderForMerge(producer) && aliasDb_.moveBeforeTopologicallyValid(producer, consumer); @@ -302,7 +302,7 @@ class SubgraphSlicer { std::vector& diff_nodes_; }; -c10::optional getProfileNodeRequiresGrad(Node* n) { +std::optional getProfileNodeRequiresGrad(Node* n) { TORCH_INTERNAL_ASSERT(n->kind() == prim::profile); if (!n->hasAttribute(attr::profiled_type)) { return c10::nullopt; @@ -359,7 +359,7 @@ struct ContextMapping { } }; -c10::optional findRequiresGradForOutput( +std::optional findRequiresGradForOutput( Node* diff_graph, Value* output, const ContextMapping& ctx_mapping) { @@ -374,7 +374,7 @@ c10::optional findRequiresGradForOutput( } if (use.user->kind() == prim::profile) { - c10::optional req_grad_use; + std::optional req_grad_use; if ((req_grad_use = getProfileNodeRequiresGrad(use.user)).has_value()) { return req_grad_use.value(); } @@ -393,7 +393,7 @@ c10::optional findRequiresGradForOutput( } if (dg_use.user->kind() == prim::profile) { - c10::optional req_grad_use; + std::optional req_grad_use; if ((req_grad_use = getProfileNodeRequiresGrad(dg_use.user)) .has_value()) { return req_grad_use.value(); diff --git a/torch/csrc/jit/passes/decompose_ops.cpp b/torch/csrc/jit/passes/decompose_ops.cpp index 9f5b3c80b6a07..1276a1f97245a 100644 --- a/torch/csrc/jit/passes/decompose_ops.cpp +++ b/torch/csrc/jit/passes/decompose_ops.cpp @@ -22,7 +22,7 @@ c10::AliasAnalysisKind aliasAnalysisFromSchema() { // helper to determine if an optional tensor argument/value passed in is // statically defined (neither a None constant nor a Optional[Tensor] type) // return yes, no, or no value if we can't tell -static c10::optional isDefined(Value* tensor) { +static std::optional isDefined(Value* tensor) { if (tensor->type()->isSubtypeOf(*TensorType::get())) { return true; } diff --git a/torch/csrc/jit/passes/device_type_analysis.cpp b/torch/csrc/jit/passes/device_type_analysis.cpp index 590ac9e2896a8..7670292696ae6 100644 --- a/torch/csrc/jit/passes/device_type_analysis.cpp +++ b/torch/csrc/jit/passes/device_type_analysis.cpp @@ -27,7 +27,7 @@ of the Node (based on the rule itself) Returns: Bool indicating if anything was changed */ -bool setDeviceType(Value* value, c10::optional device) { +bool setDeviceType(Value* value, std::optional device) { auto tensor_type = value->type()->expect(); bool changed = tensor_type->device() != device; if (changed) { @@ -36,7 +36,7 @@ bool setDeviceType(Value* value, c10::optional device) { return changed; } -bool setReturnsToDevice(Node* n, c10::optional device) { +bool setReturnsToDevice(Node* n, std::optional device) { bool changed = false; for (Value* out : n->outputs()) { auto tensor_type = out->type()->cast(); @@ -93,7 +93,7 @@ bool propWithNoDevice(Node* 
n) { auto tensor_type = n->inputs()[input_num]->type()->expect(); bool only_seen_cpu_zerodim = isZerodimCPUTensor(tensor_type); - c10::optional device = tensor_type->device(); + std::optional device = tensor_type->device(); // Now see if all inputs have a consistent device type for (input_num++; input_num < n->inputs().size(); input_num++) { diff --git a/torch/csrc/jit/passes/dtype_analysis.cpp b/torch/csrc/jit/passes/dtype_analysis.cpp index feeb5f567cd0d..f63ea6f341948 100644 --- a/torch/csrc/jit/passes/dtype_analysis.cpp +++ b/torch/csrc/jit/passes/dtype_analysis.cpp @@ -99,7 +99,7 @@ static bool canBeInferredWithMetaTensor(Node* n) { return true; } -c10::optional inferWithMetaTensor(Node* n) { +std::optional inferWithMetaTensor(Node* n) { GRAPH_DEBUG("inferWithMetaTensor", getHeader(n)); if (!canBeInferredWithMetaTensor(n)) { return c10::nullopt; diff --git a/torch/csrc/jit/passes/fold_conv_bn.cpp b/torch/csrc/jit/passes/fold_conv_bn.cpp index 9df6887d24289..6f0c82e7bebe2 100644 --- a/torch/csrc/jit/passes/fold_conv_bn.cpp +++ b/torch/csrc/jit/passes/fold_conv_bn.cpp @@ -105,7 +105,7 @@ void addBiasForConvIfNone(Module& module, const std::string& pattern_name) { if (!t->hasAttribute("bias")) { auto optional_tensor_type = OptionalType::create(TensorType::get()); t->addAttribute("bias", std::move(optional_tensor_type), true); - auto optional_tensor = c10::optional(); + auto optional_tensor = std::optional(); module.setattr("bias", std::move(optional_tensor)); replaceConvBiasWithGetAttr(module); } diff --git a/torch/csrc/jit/passes/freeze_module.cpp b/torch/csrc/jit/passes/freeze_module.cpp index 9ebbaa4e53e0d..4d67d5d217813 100644 --- a/torch/csrc/jit/passes/freeze_module.cpp +++ b/torch/csrc/jit/passes/freeze_module.cpp @@ -167,7 +167,7 @@ class AttributePropagator { // Examples: // submodule1.submodule2.foo -> {submodule2, "foo"} // submodule1.non_existent_module.foo -> nullopt - c10::optional resolveName(const std::string& name) { + std::optional resolveName(const std::string& name) { auto sub_names = splitName(name); if (sub_names.empty()) { return c10::nullopt; @@ -225,7 +225,7 @@ class AttributePropagator { return true; } - c10::optional> getModulePath( + std::optional> getModulePath( Value* input, std::shared_ptr& graph) { bool success = _loadModulePath(input, graph); diff --git a/torch/csrc/jit/passes/frozen_ops_to_mkldnn.cpp b/torch/csrc/jit/passes/frozen_ops_to_mkldnn.cpp index f6f63de01a498..c28e99a445258 100644 --- a/torch/csrc/jit/passes/frozen_ops_to_mkldnn.cpp +++ b/torch/csrc/jit/passes/frozen_ops_to_mkldnn.cpp @@ -1099,7 +1099,7 @@ class MKLDNNSubgraphSlicer { // Try to merge `consumer` into `producer`. If successful, this destroys // `consumer` and returns the `producer` group. 
- c10::optional tryMerge(Node* producer, Node* consumer) { + std::optional tryMerge(Node* producer, Node* consumer) { AT_ASSERT(producer->kind() == prim::MKLDNNGroup); bool canMerge = shouldConsiderForMerge(consumer) && aliasDb_.moveAfterTopologicallyValid(consumer, producer); diff --git a/torch/csrc/jit/passes/graph_fuser.cpp b/torch/csrc/jit/passes/graph_fuser.cpp index 0acc6f9bd07bb..9848783072621 100644 --- a/torch/csrc/jit/passes/graph_fuser.cpp +++ b/torch/csrc/jit/passes/graph_fuser.cpp @@ -490,7 +490,7 @@ struct GraphFuser { return true; } - c10::optional findFusedChunk(Node* group, Value* input) { + std::optional findFusedChunk(Node* group, Value* input) { AT_ASSERT(group->kind() == prim::FusionGroup); auto it = std::find(group->inputs().begin(), group->inputs().end(), input); if (it == group->inputs().end()) { diff --git a/torch/csrc/jit/passes/graph_rewrite_helper.cpp b/torch/csrc/jit/passes/graph_rewrite_helper.cpp index cd06bee7fc4ab..edb9f5b9589a0 100644 --- a/torch/csrc/jit/passes/graph_rewrite_helper.cpp +++ b/torch/csrc/jit/passes/graph_rewrite_helper.cpp @@ -27,7 +27,7 @@ Value* getValue( return match_vmap.at(vmap.at(name)); } -c10::optional getIValue( +std::optional getIValue( const std::string& name, const std::unordered_map& match_vmap, const std::unordered_map& vmap) { diff --git a/torch/csrc/jit/passes/graph_rewrite_helper.h b/torch/csrc/jit/passes/graph_rewrite_helper.h index 0920830babb8b..9f8b9f0a1b8fa 100644 --- a/torch/csrc/jit/passes/graph_rewrite_helper.h +++ b/torch/csrc/jit/passes/graph_rewrite_helper.h @@ -14,7 +14,7 @@ Value* getValue( const std::string& name, const std::unordered_map& match_vmap, const std::unordered_map& vmap); -c10::optional getIValue( +std::optional getIValue( const std::string& name, const std::unordered_map& match_vmap, const std::unordered_map& vmap); diff --git a/torch/csrc/jit/passes/hoist_conv_packed_params.cpp b/torch/csrc/jit/passes/hoist_conv_packed_params.cpp index ef3b861772c31..c3db2373f2a3c 100644 --- a/torch/csrc/jit/passes/hoist_conv_packed_params.cpp +++ b/torch/csrc/jit/passes/hoist_conv_packed_params.cpp @@ -100,7 +100,7 @@ void HoistConvPackedParams(script::Module& m) { n->kind() == prim::GetAttr && n->s(attr::name) == "_packed_params"; if (isGetPackedParamsNode) { // make sure the foo in {foo}.{_packed_params} is a quantized conv - c10::optional moduleName = getModuleName(n->inputs()[0]); + std::optional moduleName = getModuleName(n->inputs()[0]); bool moduleNameIsQuantizedConv = moduleName.has_value() && (moduleName.value() == "__torch__.torch.ao.nn.quantized.modules.conv.Conv1d" || diff --git a/torch/csrc/jit/passes/integer_value_refinement.cpp b/torch/csrc/jit/passes/integer_value_refinement.cpp index e3a339efe6d7b..16a329b3b11f3 100644 --- a/torch/csrc/jit/passes/integer_value_refinement.cpp +++ b/torch/csrc/jit/passes/integer_value_refinement.cpp @@ -204,7 +204,7 @@ struct IntegerValueRefiner { return block_refinements; }; - c10::optional tryFindRefinement(Value* v) { + std::optional tryFindRefinement(Value* v) { for (const auto& ref : active_refinements_) { auto maybe_refinement = ref->find(v); if (maybe_refinement != ref->end()) { diff --git a/torch/csrc/jit/passes/loop_unrolling.cpp b/torch/csrc/jit/passes/loop_unrolling.cpp index 3df61ad8a7765..4fac1cfbe5fbf 100644 --- a/torch/csrc/jit/passes/loop_unrolling.cpp +++ b/torch/csrc/jit/passes/loop_unrolling.cpp @@ -19,7 +19,7 @@ static constexpr int64_t kMaxBodySize = 32; static constexpr int64_t kMaxBodyRepeats = 64; bool isTrueConstant(Value* val) { - 
c10::optional maybe_value = constant_as(val); + std::optional maybe_value = constant_as(val); return maybe_value && *maybe_value; } @@ -178,7 +178,7 @@ void unroll(Node* loop) { // Some optimization for constant-length loops. If we know they won't run too // many times, then we can unroll them entirely. Value* trip_count = loop->inputs().at(0); - c10::optional const_len = constant_as(trip_count); + std::optional const_len = constant_as(trip_count); if (const_len && *const_len < kMaxBodyRepeats) { Block* dest = loop->addBlock(); repeatBody(body, *const_len, dest); diff --git a/torch/csrc/jit/passes/onnx.cpp b/torch/csrc/jit/passes/onnx.cpp index e6cbd22efb014..85b49dd31e94e 100644 --- a/torch/csrc/jit/passes/onnx.cpp +++ b/torch/csrc/jit/passes/onnx.cpp @@ -218,6 +218,13 @@ py::dict BlockToONNX( } } + // Determine if all inputs are static. This is used for each node to + // determine whether or not to propagate shapes. + if (!is_sub_block) { + bool static_input_shape = AllGraphInputsStatic(ctx.block->owningGraph()); + ConstantValueMap::SetAllGraphInputsStatic(static_input_shape); + } + // Finally, visit all nodes in the graph for (auto node : old_block->nodes()) { NodeToONNX(node, ctx.block, operator_export_type, env, values_in_env); diff --git a/torch/csrc/jit/passes/onnx/constant_fold.cpp b/torch/csrc/jit/passes/onnx/constant_fold.cpp index 1d0457c65a5fb..4eeba79aae90c 100644 --- a/torch/csrc/jit/passes/onnx/constant_fold.cpp +++ b/torch/csrc/jit/passes/onnx/constant_fold.cpp @@ -64,7 +64,7 @@ void handleNegativeStartEndIndex( } } -c10::optional runTorchSlice_opset9( +std::optional runTorchSlice_opset9( const Node* node, std::vector& inputTensorValues) { assert(inputTensorValues.size() == 1); @@ -101,10 +101,10 @@ c10::optional runTorchSlice_opset9( return c10::nullopt; updated_val = at::narrow(updated_val, axis, start, length); } - return c10::optional(updated_val); + return std::optional(updated_val); } -c10::optional runTorchSlice_opset10( +std::optional runTorchSlice_opset10( const Node* node, std::vector& inputTensorValues) { const int maxSliceInputCount = 5; @@ -195,7 +195,7 @@ c10::optional runTorchSlice_opset10( return c10::nullopt; updated_val = at::narrow(updated_val, axis, start, length); } - return c10::optional(updated_val); + return std::optional(updated_val); } // Refer to AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF @@ -259,7 +259,7 @@ at::Tensor IntToTensor(int64_t value) { return at::squeeze(f_copy, 0); } -c10::optional runTorchBackendForOnnx( +std::optional runTorchBackendForOnnx( const Node* node, std::vector& inputTensorValues, int opset_version) { @@ -280,10 +280,10 @@ c10::optional runTorchBackendForOnnx( } updated_val = at::cat(at::TensorList(inputTensorValues), node->i(attr::axis)); - return c10::optional(updated_val); + return std::optional(updated_val); } else if (node->kind() == onnx::Sqrt) { updated_val = at::sqrt(inputTensorValues[0]); - return c10::optional(updated_val); + return std::optional(updated_val); } else if (node->kind() == onnx::Div) { // One example shows at::div(CPULongType, CPULongType) = CPUFloatType, // So we add a cast below. 
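The constant_fold.cpp hunks above and below keep the same control flow after the rename: each handler either produces a folded tensor wrapped in an optional or bails out with c10::nullopt, and the caller simply keeps the original node when nothing comes back. A simplified, self-contained sketch of that dispatch shape (plain integers and made-up op names instead of at::Tensor and ONNX kinds; not the real runTorchBackendForOnnx):

// Simplified sketch of an optional-returning fold dispatcher; stand-in types,
// not the real ONNX constant folder.
#include <iostream>
#include <optional>
#include <string>
#include <vector>

// Returns the folded value, or std::nullopt when the op is not foldable and
// the caller should keep the original node in the graph.
std::optional<int64_t> tryFold(const std::string& kind,
                               const std::vector<int64_t>& inputs) {
  if (kind == "Add" && inputs.size() == 2) {
    return inputs[0] + inputs[1];
  }
  if (kind == "Mul" && inputs.size() == 2) {
    return inputs[0] * inputs[1];
  }
  return std::nullopt;  // unsupported kind: nothing to fold
}

int main() {
  if (auto v = tryFold("Add", {2, 3})) {
    std::cout << "folded to " << *v << "\n";
  }
  if (!tryFold("Gather", {2, 3})) {
    std::cout << "Gather not folded, node kept\n";
  }
}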
@@ -292,16 +292,16 @@ c10::optional runTorchBackendForOnnx( inputTensorValues[1].scalar_type()) { updated_val = updated_val.to(inputTensorValues[0].scalar_type()); } - return c10::optional(updated_val); + return std::optional(updated_val); } else if (node->kind() == onnx::Mul) { updated_val = at::mul(inputTensorValues[0], inputTensorValues[1]); - return c10::optional(updated_val); + return std::optional(updated_val); } else if (node->kind() == onnx::Sub) { updated_val = at::sub(inputTensorValues[0], inputTensorValues[1]); - return c10::optional(updated_val); + return std::optional(updated_val); } else if (node->kind() == onnx::Add) { updated_val = at::add(inputTensorValues[0], inputTensorValues[1]); - return c10::optional(updated_val); + return std::optional(updated_val); } else if (node->kind() == onnx::Unsqueeze) { if (opset_version >= ONNX_OPSET_13) { assert(inputTensorValues.size() == 2); @@ -328,7 +328,7 @@ c10::optional runTorchBackendForOnnx( for (int64_t i = 0; i < inputTensorValues[1].sizes()[0]; ++i) { updated_val = at::unsqueeze(updated_val, axes[i]); } - return c10::optional(updated_val); + return std::optional(updated_val); } else if (opset_version >= ONNX_OPSET_9) { assert(inputTensorValues.size() == 1); if (!node->hasAttributeS("axes")) { @@ -340,7 +340,7 @@ c10::optional runTorchBackendForOnnx( for (auto axis : axesAttr) { updated_val = at::unsqueeze(updated_val, axis); } - return c10::optional(updated_val); + return std::optional(updated_val); } else { TORCH_WARN( "Constant folding - unsupported opset version. " @@ -373,7 +373,7 @@ c10::optional runTorchBackendForOnnx( updated_val = at::squeeze(updated_val, axes[i]); } } - return c10::optional(updated_val); + return std::optional(updated_val); } else if (opset_version >= ONNX_OPSET_9) { assert(inputTensorValues.size() == 1); updated_val = inputTensorValues[0]; @@ -384,7 +384,7 @@ c10::optional runTorchBackendForOnnx( updated_val = at::squeeze(updated_val, axis); } } - return c10::optional(updated_val); + return std::optional(updated_val); } else { TORCH_WARN( "Constant folding - unsupported opset version. " @@ -397,13 +397,13 @@ c10::optional runTorchBackendForOnnx( return c10::nullopt; } updated_val = inputTensorValues[0].permute(node->is(attr::perm)); - return c10::optional(updated_val); + return std::optional(updated_val); } else if (node->kind() == onnx::Cast) { assert(inputTensorValues.size() == 1); if (node->hasAttributeS("to") && ONNXTypeToATenType(node->i(attr::to))) { updated_val = inputTensorValues[0].to( ONNXTypeToATenType(node->i(attr::to)).value()); - return c10::optional(updated_val); + return std::optional(updated_val); } return c10::nullopt; } else if (node->kind() == onnx::Reshape) { @@ -433,11 +433,11 @@ c10::optional runTorchBackendForOnnx( shape[i] = shape_a[i]; } } - return c10::optional(at::reshape(updated_val, shape)); + return std::optional(at::reshape(updated_val, shape)); } else if (node->kind() == onnx::Shape) { TORCH_INTERNAL_ASSERT(inputTensorValues.size() == 1); updated_val = at::_shape_as_tensor(inputTensorValues[0]); - return c10::optional(updated_val); + return std::optional(updated_val); } else if (node->kind() == onnx::ReduceL1 || node->kind() == onnx::ReduceL2) { assert(inputTensorValues.size() == 1); if (!node->hasAttributeS("axes")) { @@ -449,7 +449,7 @@ c10::optional runTorchBackendForOnnx( int p = node->kind() == onnx::ReduceL1 ? 
1 : 2; updated_val = at::norm( inputTensorValues[0], p, node->is(attr::axes), node->i(attr::keepdims)); - return c10::optional(updated_val); + return std::optional(updated_val); } else if (node->kind() == onnx::ReduceProd) { int64_t rank = inputTensorValues[0].sizes().size(); std::vector axes; @@ -469,7 +469,7 @@ c10::optional runTorchBackendForOnnx( for (const auto& axis : axes) { updated_val = at::prod(updated_val, axis, keepdims); } - return c10::optional(updated_val); + return std::optional(updated_val); } else if (node->kind() == onnx::Gather) { assert(inputTensorValues.size() == 2); // default axis = 0 @@ -503,41 +503,41 @@ c10::optional runTorchBackendForOnnx( if (q < 1) { updated_val = updated_val.squeeze(axis); } - return c10::optional(updated_val); + return std::optional(updated_val); } else if (node->kind() == onnx::Range) { updated_val = runTorchArange_opset11(node, inputTensorValues); - return c10::optional(updated_val); + return std::optional(updated_val); } else if (node->kind() == onnx::Where) { updated_val = at::where( inputTensorValues[0], inputTensorValues[1], inputTensorValues[2]); - return c10::optional(updated_val); + return std::optional(updated_val); } else if (node->kind() == onnx::Equal) { updated_val = at::eq(inputTensorValues[0], inputTensorValues[1]); - return c10::optional(updated_val); + return std::optional(updated_val); } else if (node->kind() == onnx::Greater) { updated_val = at::greater(inputTensorValues[0], inputTensorValues[1]); - return c10::optional(updated_val); + return std::optional(updated_val); } else if (node->kind() == onnx::Less) { updated_val = at::less(inputTensorValues[0], inputTensorValues[1]); - return c10::optional(updated_val); + return std::optional(updated_val); } else if (node->kind() == onnx::Neg) { updated_val = at::neg(inputTensorValues[0]); - return c10::optional(updated_val); + return std::optional(updated_val); } else if (node->kind() == onnx::Not) { auto ones = at::ones(inputTensorValues[0].sizes(), inputTensorValues[0].dtype()); updated_val = at::ne(inputTensorValues[0], ones); - return c10::optional(updated_val); + return std::optional(updated_val); } else if (node->kind() == onnx::Size) { int64_t total_size = 1; for (auto size : inputTensorValues[0].sizes()) { total_size *= size; } - return c10::optional(IntToTensor(total_size)); + return std::optional(IntToTensor(total_size)); } else if (node->kind() == onnx::Softmax) { int64_t axis = node->hasAttributeS("axis") ? 
node->i(attr::axis) : -1; updated_val = at::softmax(inputTensorValues[0], axis); - return c10::optional(updated_val); + return std::optional(updated_val); } else { return c10::nullopt; } diff --git a/torch/csrc/jit/passes/onnx/constant_fold.h b/torch/csrc/jit/passes/onnx/constant_fold.h index 8bfb0dd081c39..201c3def32685 100644 --- a/torch/csrc/jit/passes/onnx/constant_fold.h +++ b/torch/csrc/jit/passes/onnx/constant_fold.h @@ -19,7 +19,7 @@ namespace onnx_constant_fold { at::Tensor IntToTensor(int64_t value); -c10::optional runTorchBackendForOnnx( +std::optional runTorchBackendForOnnx( const Node* node, std::vector& inputTensorValues, int opset_version); diff --git a/torch/csrc/jit/passes/onnx/constant_map.cpp b/torch/csrc/jit/passes/onnx/constant_map.cpp index c36440da8d811..8fd1bed0b7a1b 100644 --- a/torch/csrc/jit/passes/onnx/constant_map.cpp +++ b/torch/csrc/jit/passes/onnx/constant_map.cpp @@ -32,13 +32,22 @@ bool ConstantValueMap::HasRank(const std::string& tensorName) { ConstantValueMap::getInstance().rankMap.end(); } -c10::optional ConstantValueMap::GetRank(const std::string& tensorName) { +std::optional ConstantValueMap::GetRank(const std::string& tensorName) { if (!HasRank(tensorName)) { return c10::nullopt; } return ConstantValueMap::getInstance().rankMap[tensorName]; } +void ConstantValueMap::SetAllGraphInputsStatic(bool all_static) { + ConstantValueMap::getInstance().allGraphInputsStatic = + c10::make_optional(all_static); +} + +c10::optional ConstantValueMap::GetAllGraphInputsStatic() { + return ConstantValueMap::getInstance().allGraphInputsStatic; +} + void ConstantValueMap::SetShape( const std::string& tensorName, const c10::SymbolicShape& shapeValue) { @@ -51,7 +60,7 @@ bool ConstantValueMap::HasShape(const std::string& tensorName) { ConstantValueMap::getInstance().shapeMap.end(); } -c10::optional ConstantValueMap::GetShape( +std::optional ConstantValueMap::GetShape( const std::string& tensorName) { if (!HasShape(tensorName)) { return c10::nullopt; @@ -70,7 +79,7 @@ bool ConstantValueMap::HasValue(const std::string& tensorName) { ConstantValueMap::getInstance().tensorValueMap.end(); } -c10::optional ConstantValueMap::GetValue( +std::optional ConstantValueMap::GetValue( const std::string& tensorName) { if (!HasValue(tensorName)) { return c10::nullopt; @@ -94,7 +103,7 @@ std::vector ConstantValueMap::GetCompleteShapeInto1DInt64Vector( return shape_value; } -c10::optional> ConstantValueMap::GetShapeInto1DInt64Vector( +std::optional> ConstantValueMap::GetShapeInto1DInt64Vector( const std::string& value_name) { if (ConstantValueMap::HasShape(value_name)) { auto shape_size = ConstantValueMap::GetShape(value_name).value(); @@ -107,7 +116,7 @@ c10::optional> ConstantValueMap::GetShapeInto1DInt64Vector( return c10::nullopt; } -c10::optional> ConstantValueMap:: +std::optional> ConstantValueMap:: GetShapeInto1DInt64VectorWithOneUnknown(const std::string& value_name) { if (ConstantValueMap::HasShape(value_name)) { auto shape_size = ConstantValueMap::GetShape(value_name).value(); @@ -163,7 +172,7 @@ bool ConstantValueMap::HasTypeReliable(const std::string& tensorName) { ConstantValueMap::getInstance().typeReliableMap.end(); } -c10::optional ConstantValueMap::GetTypeReliable( +std::optional ConstantValueMap::GetTypeReliable( const std::string& tensorName) { if (!HasTypeReliable(tensorName)) { return c10::nullopt; @@ -182,7 +191,7 @@ bool ConstantValueMap::HasUseInferredType(const std::string& tensorName) { ConstantValueMap::getInstance().useInferredTypeMap.end(); } -c10::optional 
ConstantValueMap::GetUseInferredType( +std::optional ConstantValueMap::GetUseInferredType( const std::string& tensorName) { if (!HasUseInferredType(tensorName)) { return c10::nullopt; @@ -201,7 +210,7 @@ bool ConstantValueMap::HasShapeValue(const std::string& tensorName) { ConstantValueMap::getInstance().shapeValueMap.end(); } -c10::optional ConstantValueMap::GetShapeValue( +std::optional ConstantValueMap::GetShapeValue( const std::string& tensorName) { if (!HasShapeValue(tensorName)) { return c10::nullopt; @@ -218,6 +227,10 @@ SymbolDimMap& ConstantValueMap::GetSymbolDimMap() { return ConstantValueMap::getInstance().symbolDimMap; } +DimSymbolMap& ConstantValueMap::GetDimSymbolMap() { + return ConstantValueMap::getInstance().dimSymbolMap; +} + template void UpdateStrKey( Map& map, @@ -262,6 +275,8 @@ void ConstantValueMap::ClearMaps() { ConstantValueMap::getInstance().shapeValueMap.clear(); ConstantValueMap::getInstance().inferredShapeData.clear(); ConstantValueMap::getInstance().symbolDimMap.clear(); + ConstantValueMap::getInstance().dimSymbolMap.clear(); + ConstantValueMap::getInstance().allGraphInputsStatic = c10::nullopt; } // For debug only. @@ -349,6 +364,15 @@ void ConstantValueMap::PrintMaps() { std::cout << std::endl; } } + std::cout << "DimSymbol Map:" << std::endl; + count = 0; + for (const auto& x : ConstantValueMap::getInstance().dimSymbolMap) { + std::cout << "(" << x.first << ": " << x.second << "), "; + count++; + if (count % 10 == 0) { + std::cout << std::endl; + } + } } } // namespace jit diff --git a/torch/csrc/jit/passes/onnx/constant_map.h b/torch/csrc/jit/passes/onnx/constant_map.h index b7b534d730587..303d373eea56f 100644 --- a/torch/csrc/jit/passes/onnx/constant_map.h +++ b/torch/csrc/jit/passes/onnx/constant_map.h @@ -24,49 +24,53 @@ class ConstantValueMap { static ConstantValueMap& getInstance(); static void SetRank(const std::string& tensorName, size_t rankValue); static bool HasRank(const std::string& tensorName); - static c10::optional GetRank(const std::string& tensorName); + static std::optional GetRank(const std::string& tensorName); + + static void SetAllGraphInputsStatic(bool all_static); + static c10::optional GetAllGraphInputsStatic(); static void SetShape( const std::string& tensorName, const c10::SymbolicShape& shapeValue); static bool HasShape(const std::string& tensorName); - static c10::optional GetShape( + static std::optional GetShape( const std::string& tensorName); static void SetValue(const std::string& tensorName, const at::Tensor& value); static bool HasValue(const std::string& tensorName); - static c10::optional GetValue(const std::string& tensorName); + static std::optional GetValue(const std::string& tensorName); static void EraseValue(const std::string& tensorName); static std::vector GetCompleteShapeInto1DInt64Vector( const c10::SymbolicShape& shape); - static c10::optional> GetShapeInto1DInt64Vector( + static std::optional> GetShapeInto1DInt64Vector( const std::string& value_name); - static c10::optional> + static std::optional> GetShapeInto1DInt64VectorWithOneUnknown(const std::string& value_name); static std::vector GetValueInto1DInt64Vector( const std::string& value_name); static void SetTypeReliable(const std::string& tensorName, bool reliable); static bool HasTypeReliable(const std::string& tensorName); - static c10::optional GetTypeReliable(const std::string& tensorName); + static std::optional GetTypeReliable(const std::string& tensorName); static void SetUseInferredType( const std::string& tensorName, bool useInferredType); 
static bool HasUseInferredType(const std::string& tensorName); - static c10::optional GetUseInferredType(const std::string& tensorName); + static std::optional GetUseInferredType(const std::string& tensorName); static void SetShapeValue( const std::string& tensorName, const c10::SymbolicShape& shapeValue); static bool HasShapeValue(const std::string& tensorName); - static c10::optional GetShapeValue( + static std::optional GetShapeValue( const std::string& tensorName); static ShapeDataMap& GetInferredShapeData(); static SymbolDimMap& GetSymbolDimMap(); + static DimSymbolMap& GetDimSymbolMap(); static void UpdateValueName( const std::string& old_name, @@ -101,6 +105,9 @@ class ConstantValueMap { // during future node-level shape inference. ShapeDataMap inferredShapeData; SymbolDimMap symbolDimMap; + DimSymbolMap dimSymbolMap; + // Stores if all graph-level inputs have static shape + c10::optional allGraphInputsStatic; }; } // namespace jit diff --git a/torch/csrc/jit/passes/onnx/function_extraction.cpp b/torch/csrc/jit/passes/onnx/function_extraction.cpp index d6555c5c5bb70..c545c7aba823a 100644 --- a/torch/csrc/jit/passes/onnx/function_extraction.cpp +++ b/torch/csrc/jit/passes/onnx/function_extraction.cpp @@ -58,8 +58,8 @@ struct FunctionExtractor { scope_ctx_map& scope_ctxs); void DebugPrint() const; void SetAttrName(Node* ref_n, Symbol attr, const std::string& name); - c10::optional FindAttrName(Node* ref_n, Symbol attr); - c10::optional FindAttrName(Node* ref_const_n); + std::optional FindAttrName(Node* ref_n, Symbol attr); + std::optional FindAttrName(Node* ref_const_n); ScopePtr scope_key_; scope_ctx_map scope_ctxs_; @@ -76,10 +76,10 @@ struct FunctionExtractor { using func_ctx_map = std::unordered_map; static bool IsValidScope(ScopePtr s); - static c10::optional InferScope(Node* n); + static std::optional InferScope(Node* n); static bool IsAncestor(ScopePtr parent, ScopePtr child); - static c10::optional FindCommonAncestor(ScopePtr a, ScopePtr b); - static c10::optional FindCommonAncestor(const scope_list& scopes); + static std::optional FindCommonAncestor(ScopePtr a, ScopePtr b); + static std::optional FindCommonAncestor(const scope_list& scopes); std::shared_ptr ConstructFuncGraph(FunctionContext& ctx); void ConvertScopeToFunction( @@ -219,7 +219,7 @@ void FunctionExtractor::FunctionContext::SetAttrName( auto n_attr_it = node_attr_to_name_[n_in_def][attr.toUnqualString()] = name; } -c10::optional FunctionExtractor::FunctionContext::FindAttrName( +std::optional FunctionExtractor::FunctionContext::FindAttrName( Node* ref_n, Symbol attr) { auto v_it = @@ -297,7 +297,7 @@ bool FunctionExtractor::IsAncestor(ScopePtr parent, ScopePtr child) { return false; } -c10::optional FunctionExtractor::FindCommonAncestor( +std::optional FunctionExtractor::FindCommonAncestor( ScopePtr a, ScopePtr b) { if (!IsValidScope(a) || !IsValidScope(b)) { @@ -330,13 +330,13 @@ c10::optional FunctionExtractor::FindCommonAncestor( return c10::nullopt; } -c10::optional FunctionExtractor::FindCommonAncestor( +std::optional FunctionExtractor::FindCommonAncestor( const scope_list& scopes) { if (scopes.empty()) { return c10::nullopt; } - c10::optional common_ancestor = scopes.at(0); + std::optional common_ancestor = scopes.at(0); for (const auto& scope : scopes) { common_ancestor = FindCommonAncestor(common_ancestor.value(), scope); if (!common_ancestor.has_value()) { @@ -347,7 +347,7 @@ c10::optional FunctionExtractor::FindCommonAncestor( return common_ancestor; } -c10::optional FunctionExtractor::InferScope(Node* 
n) { +std::optional FunctionExtractor::InferScope(Node* n) { // The scope of node n is assigned based on the following rules. // 1. If all uses of outputs of n belongs to the same scope, // assign that scope, otherwise diff --git a/torch/csrc/jit/passes/onnx/function_substitution.cpp b/torch/csrc/jit/passes/onnx/function_substitution.cpp index a6e2f89e106ec..81bfa3fd6caf5 100644 --- a/torch/csrc/jit/passes/onnx/function_substitution.cpp +++ b/torch/csrc/jit/passes/onnx/function_substitution.cpp @@ -12,7 +12,7 @@ namespace { const std::string kTopModuleVariableName = ""; std::string TidyClassNameFromTorchScript( - const c10::optional& class_name) { + const std::optional& class_name) { if (!class_name) { return "UNKNOWN_CLASS"; } diff --git a/torch/csrc/jit/passes/onnx/helper.cpp b/torch/csrc/jit/passes/onnx/helper.cpp index d6b2a6385fab4..9d4c5061414c5 100644 --- a/torch/csrc/jit/passes/onnx/helper.cpp +++ b/torch/csrc/jit/passes/onnx/helper.cpp @@ -61,7 +61,7 @@ void buildParamsMapFromValueToParamsMap( } } -c10::optional ONNXTypeToATenType(int32_t onnx_type) { +std::optional ONNXTypeToATenType(int32_t onnx_type) { switch (onnx_type) { case ::ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED: return at::ScalarType::Undefined; @@ -104,7 +104,7 @@ c10::optional ONNXTypeToATenType(int32_t onnx_type) { onnx_type, " is an unexpected tensor scalar type"); } - return c10::optional{}; + return std::optional{}; } Node* addNodeToBlock(Block* block, Symbol kind, ArrayRef inputs) { diff --git a/torch/csrc/jit/passes/onnx/helper.h b/torch/csrc/jit/passes/onnx/helper.h index 77eb98ba8a707..9e09c638779ef 100644 --- a/torch/csrc/jit/passes/onnx/helper.h +++ b/torch/csrc/jit/passes/onnx/helper.h @@ -40,7 +40,7 @@ TORCH_API Node* addNodeToBlock( TORCH_API Value* addInputToBlock(Block* block); -TORCH_API c10::optional ONNXTypeToATenType(int32_t onnx_type); +TORCH_API std::optional ONNXTypeToATenType(int32_t onnx_type); // Use int return type as no sable way exists to forward declare protobuf enum TORCH_API int ATenTypeToOnnxType(at::ScalarType at_type); diff --git a/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_encapsulation.cpp b/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_encapsulation.cpp index 41e3ac9ecc4e8..6110954990455 100644 --- a/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_encapsulation.cpp +++ b/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_encapsulation.cpp @@ -77,7 +77,7 @@ Node* EncapsulateInplaceIndexPutForONNX(Node* index_put_node) { } // namespace -c10::optional EncapsulatePatternIntoSubblock(Node* n) { +std::optional EncapsulatePatternIntoSubblock(Node* n) { switch (n->kind()) { case aten::index_put_: case aten::index_put: { diff --git a/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_encapsulation.h b/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_encapsulation.h index cd78663cffc47..6673d4aba3a75 100644 --- a/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_encapsulation.h +++ b/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_encapsulation.h @@ -28,7 +28,7 @@ namespace jit { // the subblock of a new placeholder node. The outputs of the new placeholder // node are used in place of the original nodes instead. The category of the // pattern is stored as attr::name. 
-TORCH_API c10::optional EncapsulatePatternIntoSubblock(Node* n); +TORCH_API std::optional EncapsulatePatternIntoSubblock(Node* n); } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/passes/onnx/peephole.cpp b/torch/csrc/jit/passes/onnx/peephole.cpp index 9e1c17120f654..73c19851e569b 100644 --- a/torch/csrc/jit/passes/onnx/peephole.cpp +++ b/torch/csrc/jit/passes/onnx/peephole.cpp @@ -101,7 +101,7 @@ std::vector getBroadcastPositions(Node* node) { // Determine whether `from` can broadcast to `to`, and if so at which // position. `from` must be a suffix of `to`, except that any // occurrences of 1 in `from` are treated as wildcards. -c10::optional fusibleExpandTo( +std::optional fusibleExpandTo( at::IntArrayRef from, at::IntArrayRef to) { if (from.size() > to.size()) { @@ -156,7 +156,7 @@ void fuseBroadcast(Block* b) { } // Not all broadcasts are supported by ONNX broadcast. - c10::optional axis = fusibleExpandTo( + std::optional axis = fusibleExpandTo( unexpanded_input->type() ->expectRef() .sizes() diff --git a/torch/csrc/jit/passes/onnx/scalar_type_analysis.cpp b/torch/csrc/jit/passes/onnx/scalar_type_analysis.cpp index 638acd464adcd..427e5771a9f0f 100644 --- a/torch/csrc/jit/passes/onnx/scalar_type_analysis.cpp +++ b/torch/csrc/jit/passes/onnx/scalar_type_analysis.cpp @@ -97,7 +97,7 @@ static bool IsImplicitCastSupported(const NodeKind& nodeKind) { IsSelectorOp(nodeKind); } -static c10::optional PromoteScalarTypes( +static std::optional PromoteScalarTypes( const std::vector& types) { if (types.empty()) { return c10::nullopt; @@ -112,7 +112,7 @@ static c10::optional PromoteScalarTypes( // Type promotion between scalars and tensors // per logic here // https://pytorch.org/docs/main/tensor_attributes.html#tensor-attributes -static c10::optional PromoteScalarTypesWithCategory( +static std::optional PromoteScalarTypesWithCategory( const std::vector& typesFromTensors, const std::vector& typesFromScalars) { auto typeFromTensor = PromoteScalarTypes(typesFromTensors); @@ -146,12 +146,12 @@ static c10::optional PromoteScalarTypesWithCategory( return typeFromTensor; } -static c10::optional InferExpectedScalarType(const Node* n) { +static std::optional InferExpectedScalarType(const Node* n) { std::vector typesFromTensors; std::vector typesFromScalars; auto get_scalar_type = - [](const Value* input) -> c10::optional { + [](const Value* input) -> std::optional { if (auto* tensor_type = input->type()->castRaw()) { return tensor_type->scalarType(); } @@ -252,7 +252,7 @@ static c10::optional InferExpectedScalarType(const Node* n) { } }); - c10::optional st = c10::nullopt; + std::optional st = c10::nullopt; const auto output_st = get_scalar_type(n->output()); if (IsComparisonOp(n->kind())) { @@ -280,7 +280,7 @@ static c10::optional InferExpectedScalarType(const Node* n) { return st; } -static c10::optional LowPrecisionCastForStandardOps( +static std::optional LowPrecisionCastForStandardOps( const Node* n, const c10::ScalarType& scalar_type) { // Some of standardOps do not support uint8\int8\int16 type for ONNX diff --git a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp index 186623bf4e049..dd79754f4c016 100644 --- a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp +++ b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp @@ -87,21 +87,24 @@ namespace onnx_torch = ::torch::onnx; namespace onnx = ::ONNX_NAMESPACE; namespace diagnostics = ::torch::onnx::diagnostics; +// SymbolDimMap is a Torch-to-ONNX shape look-up. 
This is built so it can be +// returned by the export function. During the export however, when we come +// across new ONNX shapes, the reverse look-up is needed. To avoid incurring +// a linear-time look-up, we maintain DimSymbolMap in parallel. c10::ShapeSymbol ONNXDimToShapeSymbol( const onnx::TensorShapeProto_Dimension& dim, - SymbolDimMap& symbol_dim_map) { + SymbolDimMap& symbol_dim_map, + DimSymbolMap& dim_symbol_map) { if (dim.has_dim_value()) { return c10::ShapeSymbol::fromStaticSize(dim.dim_value()); } - c10::optional sym = c10::nullopt; + std::optional sym = c10::nullopt; if (dim.has_dim_param()) { // If this param is already known, assign the same Symbol. GRAPH_UPDATE("Got dim_param:", dim.dim_param()); - for (const auto& pair : symbol_dim_map) { - if (pair.second == dim.dim_param()) { - sym = pair.first; - break; - } + auto maybe_symbol = dim_symbol_map.find(dim.dim_param()); + if (maybe_symbol != dim_symbol_map.end()) { + sym = maybe_symbol->second; } } if (!sym) { @@ -109,14 +112,16 @@ c10::ShapeSymbol ONNXDimToShapeSymbol( // If dim.dim_param() is empty, no need to keep track // because there won't be duplicates. symbol_dim_map[sym.value()] = dim.dim_param(); + dim_symbol_map[dim.dim_param()] = sym.value(); } return sym.value(); } TensorTypePtr TorchTensorTypeFromONNX( const onnx::TypeProto_Tensor& onnx_tensor_type, - SymbolDimMap& symbol_dim_map) { - c10::optional scalar_type; + SymbolDimMap& symbol_dim_map, + DimSymbolMap& dim_symbol_map) { + std::optional scalar_type; if (onnx_tensor_type.has_elem_type()) { scalar_type = ONNXTypeToATenType(onnx_tensor_type.elem_type()); } @@ -132,8 +137,8 @@ TensorTypePtr TorchTensorTypeFromONNX( const auto& onnx_shape = onnx_tensor_type.shape(); for (const auto i : c10::irange(onnx_shape.dim_size())) { - sizes.emplace_back( - ONNXDimToShapeSymbol(onnx_shape.dim(i), symbol_dim_map)); + sizes.emplace_back(ONNXDimToShapeSymbol( + onnx_shape.dim(i), symbol_dim_map, dim_symbol_map)); } v_type = TensorType::create(scalar_type, at::kCPU, sizes.size(), {}); v_type = v_type->withSymbolicShapes(c10::SymbolicShape(sizes)); @@ -150,13 +155,14 @@ TensorTypePtr TorchTensorTypeFromONNX( ListTypePtr TorchListTypeFromONNX( const onnx::TypeProto_Sequence& onnx_sequence_type, - SymbolDimMap& symbol_dim_map) { + SymbolDimMap& symbol_dim_map, + DimSymbolMap& dim_symbol_map) { if (onnx_sequence_type.has_elem_type()) { const auto& onnx_seq_elem_type = onnx_sequence_type.elem_type(); if (onnx_seq_elem_type.has_tensor_type()) { const auto& onnx_tensor_type = onnx_seq_elem_type.tensor_type(); - const auto v_tensor_type = - TorchTensorTypeFromONNX(onnx_tensor_type, symbol_dim_map); + const auto v_tensor_type = TorchTensorTypeFromONNX( + onnx_tensor_type, symbol_dim_map, dim_symbol_map); auto v_type = ListType::create(v_tensor_type); return v_type; } @@ -167,21 +173,22 @@ ListTypePtr TorchListTypeFromONNX( void UpdateTorchValueByOnnxValueInfo( Value* v, const onnx::ValueInfoProto& p_info, - SymbolDimMap& symbol_dim_map) { + SymbolDimMap& symbol_dim_map, + DimSymbolMap& dim_symbol_map) { if (!p_info.has_type()) { return; } const auto& p_type = p_info.type(); if (p_type.has_tensor_type()) { - const auto torch_tensor_type = - TorchTensorTypeFromONNX(p_type.tensor_type(), symbol_dim_map); + const auto torch_tensor_type = TorchTensorTypeFromONNX( + p_type.tensor_type(), symbol_dim_map, dim_symbol_map); if (torch_tensor_type) { MergeInferredTypeAndSetMap(v, v->type(), torch_tensor_type); } } else if (p_type.has_sequence_type()) { - const auto torch_list_type = - 
TorchListTypeFromONNX(p_type.sequence_type(), symbol_dim_map); + const auto torch_list_type = TorchListTypeFromONNX( + p_type.sequence_type(), symbol_dim_map, dim_symbol_map); if (torch_list_type) { MergeInferredTypeAndSetMap(v, v->type(), torch_list_type); } @@ -260,7 +267,7 @@ Value* CloneValueFromListConstruct( // is preserved. If the elemtype is Int, insert a onnx::Concat node into // the graph. TypePtr elem = v->type()->castRaw()->getElementType(); - c10::optional scalar_type = c10::nullopt; + std::optional scalar_type = c10::nullopt; if (elem->cast()) { scalar_type = at::kLong; if (isValidToTransformToONNXConcatNode(v->node())) { @@ -325,7 +332,7 @@ Node* CloneNodeToGraph( // Try to lookup input value and insert it into the graph. // If the input value is unknown, set it to graph input in the new // graph, and copy over metadata, such as datatype and shape. - ::c10::optional val = ::c10::nullopt; + ::std::optional val = ::c10::nullopt; auto v0 = params_dict.find(v->debugName()); if (v0 != params_dict.end()) { val = v0->second.toTensor(); @@ -377,6 +384,7 @@ void ConvertGraphToONNXProto( std::shared_ptr graph, std::shared_ptr& model_proto, SymbolDimMap& symbol_dim_map, + DimSymbolMap& dim_symbol_map, int opset_version) { RawDataExportMap export_map; bool val_use_external_data_format; @@ -402,12 +410,15 @@ void ConvertGraphToONNXProto( false, std::string()); symbol_dim_map.insert(new_symbol_dim_map.begin(), new_symbol_dim_map.end()); + for (const auto& pair : new_symbol_dim_map) { + dim_symbol_map[pair.second] = pair.first; + } for (int i = 0; i < model_proto->graph().output_size(); ++i) { model_proto->mutable_graph()->mutable_output(i)->clear_type(); } } -c10::optional ComputeConstantFolding(Node* n, int opset_version) { +std::optional ComputeConstantFolding(Node* n, int opset_version) { if (n->inputs().empty()) { return c10::nullopt; } @@ -437,7 +448,7 @@ c10::optional ComputeConstantFolding(Node* n, int opset_version) { } // Similar to the function above, but for symbolic shapes. 
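The new comment introducing DimSymbolMap (in the shape_type_inference.cpp hunk above) explains the motivation: SymbolDimMap maps Torch shape symbols to ONNX dim_param strings so it can be returned from the export, but during the export the lookup runs in the opposite direction, and scanning the forward map is linear per query. Keeping a reverse map in parallel makes that lookup constant time, provided both maps are filled and cleared together, as ONNXDimToShapeSymbol and ClearMaps now do. A small sketch of the get-or-create pattern with simplified types (integer ids instead of c10::ShapeSymbol; not the real maps):

// Sketch of a forward map plus a parallel reverse map, with simplified types.
#include <cassert>
#include <cstdint>
#include <string>
#include <unordered_map>

using Symbol = int64_t;                                   // stand-in for c10::ShapeSymbol
std::unordered_map<Symbol, std::string> symbol_dim_map;   // symbol -> dim_param
std::unordered_map<std::string, Symbol> dim_symbol_map;   // dim_param -> symbol (reverse)

Symbol symbolForDimParam(const std::string& dim_param) {
  // O(1) reverse lookup instead of scanning symbol_dim_map.
  auto it = dim_symbol_map.find(dim_param);
  if (it != dim_symbol_map.end()) {
    return it->second;
  }
  static Symbol next = 0;
  Symbol fresh = next++;
  symbol_dim_map[fresh] = dim_param;   // keep both maps in sync
  dim_symbol_map[dim_param] = fresh;
  return fresh;
}

int main() {
  Symbol a = symbolForDimParam("batch");
  Symbol b = symbolForDimParam("batch");    // same dim_param -> same symbol
  Symbol c = symbolForDimParam("seq_len");
  assert(a == b && a != c);
  return 0;
}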
-c10::optional<::c10::SymbolicShape> ComputeShapeFromReshape( +std::optional<::c10::SymbolicShape> ComputeShapeFromReshape( Node* n, const c10::SymbolicShape& input_shape, const c10::SymbolicShape& shape, @@ -549,7 +560,7 @@ c10::optional<::c10::SymbolicShape> ComputeShapeFromReshape( return final_shape_0; } -c10::optional<::c10::SymbolicShape> ComputeShapeFromExpand( +std::optional<::c10::SymbolicShape> ComputeShapeFromExpand( const std::vector<::c10::ShapeSymbol>& input_shape, const std::vector& reshape) { for (const auto& it : reshape) { @@ -588,7 +599,7 @@ c10::optional<::c10::SymbolicShape> ComputeShapeFromExpand( return shape; } -c10::optional<::c10::SymbolicShape> ComputeShapeFromTile( +std::optional<::c10::SymbolicShape> ComputeShapeFromTile( const std::vector<::c10::ShapeSymbol>& input_shape, const std::vector& reshape) { TORCH_INTERNAL_ASSERT( @@ -616,7 +627,7 @@ c10::optional<::c10::SymbolicShape> ComputeShapeFromTile( void UpdateRank(Value* value, size_t rank) { ConstantValueMap::SetRank(value->debugName(), rank); if (TensorTypePtr value_type = value->type()->cast()) { - c10::optional rank_opt = rank; + std::optional rank_opt = rank; auto shape = ::c10::SymbolicShape(rank_opt); value->setType(value_type->withSymbolicShapes(shape)); } @@ -662,7 +673,7 @@ void UpdateShapeConstantValueMap( } } -c10::optional> GetValueFromListConstructNode( +std::optional> GetValueFromListConstructNode( Node* lc_node) { std::vector shape_size; for (const auto& input : lc_node->inputs()) { @@ -676,7 +687,7 @@ c10::optional> GetValueFromListConstructNode( } } return lc_node->inputs().size() == shape_size.size() - ? c10::optional>(shape_size) + ? std::optional>(shape_size) : c10::nullopt; } @@ -1548,26 +1559,19 @@ bool IsListConstructIntType(const Value* v) { return false; } -bool AllGraphInputsStatic(const Graph* g) { - for (auto n : g->inputs()) { - if (TensorTypePtr input_type = n->type()->cast()) { - if (input_type->dim()) { - auto shape = input_type->symbolic_sizes(); - if (!ConstantValueMap::HasShape(n->debugName())) { - UpdateShapeConstantValueMap(n, shape); - } - } - } - } - for (auto n : g->inputs()) { - // Some inputs can be non-Tensor type, e.g., - // __torch__.torch.classes.quantized.LinearPackedParamsBase - // so we only need check Tensor type here. - if (n->type()->cast() && !n->isCompleteTensor()) { - return false; - } +// Check if all graph inputs are static and allow a cached value to return. +// Since this traverses all inputs of the graph (including weights), it can be +// costly for large graphs. Since this is called for each node in an export, +// and the inputs remain unchanged, we can cut down export time by caching. 
+bool AllGraphInputsStaticWithCaching(const Graph* g) { + auto maybe_is_static = ConstantValueMap::GetAllGraphInputsStatic(); + if (maybe_is_static.has_value()) { + return maybe_is_static.value(); + } else { + bool ret = AllGraphInputsStatic(g); + ConstantValueMap::SetAllGraphInputsStatic(ret); + return ret; } - return true; } void ProcessConstantValueMap(Node* n, int opset_version) { @@ -1581,7 +1585,7 @@ void ProcessConstantValueMap(Node* n, int opset_version) { // shapes UpdateReliable(n); - auto static_input_shape = AllGraphInputsStatic(n->owningGraph()); + auto static_input_shape = AllGraphInputsStaticWithCaching(n->owningGraph()); for (auto i : c10::irange(n->outputs().size())) { if (TensorTypePtr output_type = n->output(i)->type()->cast()) { if (output_type->dim().has_value()) { @@ -1803,7 +1807,8 @@ void UpdateOutputTypeByONNXProto( Node* n, Node* clone_node, const onnx::ModelProto& model_proto, - SymbolDimMap& symbol_dim_map) { + SymbolDimMap& symbol_dim_map, + DimSymbolMap& dim_symbol_map) { const auto& graph_proto = model_proto.graph(); // get data from value_info and updated original graph. @@ -1812,7 +1817,7 @@ void UpdateOutputTypeByONNXProto( for (size_t i = 0; i < n->outputs().size(); ++i) { if (clone_node->output(i)->debugName() == v_info.name()) { UpdateTorchValueByOnnxValueInfo( - n->output(i), v_info, symbol_dim_map); + n->output(i), v_info, symbol_dim_map, dim_symbol_map); } } }; @@ -1914,6 +1919,28 @@ void ONNXShapeTypeInference( static std::unordered_map> non_required_shape_inference_idx_map = {{"onnx::LSTM", {4}}}; +bool AllGraphInputsStatic(const Graph* g) { + for (auto n : g->inputs()) { + if (TensorTypePtr input_type = n->type()->cast()) { + if (input_type->dim()) { + auto shape = input_type->symbolic_sizes(); + if (!ConstantValueMap::HasShape(n->debugName())) { + UpdateShapeConstantValueMap(n, shape); + } + } + } + } + for (auto n : g->inputs()) { + // Some inputs can be non-Tensor type, e.g., + // __torch__.torch.classes.quantized.LinearPackedParamsBase + // so we only need check Tensor type here. + if (n->type()->cast() && !n->isCompleteTensor()) { + return false; + } + } + return true; +} + std::pair AreInputsReliableOrStatic(Node* n) { auto reliable = true; auto complete = true; @@ -2025,6 +2052,7 @@ void ONNXShapeTypeInference( auto& original_shape_data = ConstantValueMap::GetInferredShapeData(); ShapeDataMap inferred_shape_data; auto& symbol_dim_map = ConstantValueMap::GetSymbolDimMap(); + auto& dim_symbol_map = ConstantValueMap::GetDimSymbolMap(); SetGraphInputTypeReliable(n->owningGraph()); GRAPH_UPDATE( @@ -2079,7 +2107,7 @@ void ONNXShapeTypeInference( // e.g: ListConstruct, ListUnpack, etc. std::shared_ptr model_proto; ConvertGraphToONNXProto( - n_graph, model_proto, symbol_dim_map, opset_version); + n_graph, model_proto, symbol_dim_map, dim_symbol_map, opset_version); GRAPH_DEBUG( "ONNX graph to run shape inference: ", prettyPrint(*model_proto)); @@ -2104,7 +2132,7 @@ void ONNXShapeTypeInference( } } UpdateOutputTypeByONNXProto( - n, clone_node, *model_proto, symbol_dim_map); + n, clone_node, *model_proto, symbol_dim_map, dim_symbol_map); } catch (std::runtime_error& ex) { // TODO: include this as warning once we have a more consolidated // warning system. 
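AllGraphInputsStaticWithCaching, defined just above, is plain memoization: the answer depends only on the graph inputs, which do not change during an export, yet the full check walks every input (including weights) and was previously rerun for every node. Caching the result in a std::optional<bool> inside ConstantValueMap, and resetting it in ClearMaps, reduces that to one traversal per export. A minimal sketch of the same shape, with a stand-in for the expensive graph walk:

// Minimal memoization sketch mirroring AllGraphInputsStaticWithCaching;
// the expensive check is a stand-in for walking all graph inputs.
#include <iostream>
#include <optional>

struct InputsStaticCache {
  std::optional<bool> all_inputs_static;             // nullopt = not computed yet
  void clear() { all_inputs_static = std::nullopt; } // analogous to ClearMaps()
};

bool expensiveAllInputsStaticCheck() {
  std::cout << "(running full input scan)\n";
  return true;  // pretend every input had a complete static shape
}

bool allInputsStaticWithCaching(InputsStaticCache& cache) {
  if (cache.all_inputs_static.has_value()) {
    return *cache.all_inputs_static;                 // cache hit: no rescan
  }
  bool ret = expensiveAllInputsStaticCheck();
  cache.all_inputs_static = ret;
  return ret;
}

int main() {
  InputsStaticCache cache;
  // Called once per node during export; the scan runs only the first time.
  for (int node = 0; node < 3; ++node) {
    std::cout << "node " << node << ": " << allInputsStaticWithCaching(cache) << "\n";
  }
}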
@@ -2146,8 +2174,8 @@ void ONNXShapeTypeInference( int rank = inferred_shape.dim_size(); std::vector<::c10::ShapeSymbol> final_shape(rank); for (int i = 0; i < rank; ++i) { - final_shape[i] = - ONNXDimToShapeSymbol(inferred_shape.dim(i), symbol_dim_map); + final_shape[i] = ONNXDimToShapeSymbol( + inferred_shape.dim(i), symbol_dim_map, dim_symbol_map); } c10::SymbolicShape shape_value(final_shape); // Store data propagation result into shapeValueMap diff --git a/torch/csrc/jit/passes/onnx/shape_type_inference.h b/torch/csrc/jit/passes/onnx/shape_type_inference.h index 03e927a01bff4..685ca39c16dec 100644 --- a/torch/csrc/jit/passes/onnx/shape_type_inference.h +++ b/torch/csrc/jit/passes/onnx/shape_type_inference.h @@ -86,6 +86,7 @@ TORCH_API void ONNXShapeTypeInference( const ParamMap& params_dict, int opset_version); +bool AllGraphInputsStatic(const Graph* g); std::pair AreInputsReliableOrStatic(Node* n); void UpdateReliable( torch::jit::Value* output, diff --git a/torch/csrc/jit/passes/onnx/unpack_quantized_weights.cpp b/torch/csrc/jit/passes/onnx/unpack_quantized_weights.cpp index 9270028b98808..7390bea56e77b 100644 --- a/torch/csrc/jit/passes/onnx/unpack_quantized_weights.cpp +++ b/torch/csrc/jit/passes/onnx/unpack_quantized_weights.cpp @@ -30,7 +30,7 @@ using namespace ::c10::onnx; // we traverse up the graph to get the scale from its input until we hit a node // where scale is explicitly specified. double getScaleFromInput(Node* input_node) { - c10::optional scale; + std::optional scale; std::string input_name = input_node->kind().toQualString(); std::unordered_set noscale_ops = { "quantized::max_pool2d", @@ -332,7 +332,7 @@ void unpackQuantizedWeightsHelper( "getValues: Quantized weight value not found amongst constant parameters."); } at::Tensor unpacked_weight; - c10::optional bias; + std::optional bias; constexpr int64_t stride_idx = 2; constexpr int64_t padding_idx = 3; int64_t output_padding_idx; @@ -346,10 +346,10 @@ void unpackQuantizedWeightsHelper( dilation_idx = 4; groups_idx = 5; } - c10::optional> stride, padding, dilation, + std::optional> stride, padding, dilation, output_padding; - c10::optional groups; - c10::optional transpose; + std::optional groups; + std::optional transpose; torch::List stride_int, padding_int, dilation_int, output_padding_int; @@ -371,9 +371,9 @@ void unpackQuantizedWeightsHelper( TORCH_INTERNAL_ASSERT(elements.size() == 3, "Wrong tuple size."); auto config_vals = elements[1].to>(); - auto tensors = elements[2].to>>(); + auto tensors = elements[2].to>>(); - c10::optional weight = tensors[1]; + std::optional weight = tensors[1]; TORCH_INTERNAL_ASSERT( weight, "Weight should always be present in serialized qconv."); unpacked_weight = *weight; @@ -534,7 +534,7 @@ void unpackQuantizedWeightsHelper( at::Tensor packed_weight = itr->second.toTensor(); auto op = Dispatcher::singleton() .findSchemaOrThrow(unpack_fn.c_str(), "") - .typed>( + .typed>( at::Tensor)>(); std::tie(unpacked_weight, bias) = op.call(packed_weight); } @@ -598,7 +598,7 @@ void unpackQuantizedWeightsHelper( if (stride.has_value() && padding.has_value() && dilation.has_value() && groups.has_value() && (!expect_output_padding || output_padding.has_value())) { - std::vector>> conv_ints_args; + std::vector>> conv_ints_args; conv_ints_args.push_back(stride); conv_ints_args.push_back(padding); if (expect_output_padding) { diff --git a/torch/csrc/jit/passes/peephole.cpp b/torch/csrc/jit/passes/peephole.cpp index b1e38697ef59d..aa4e2176f1905 100644 --- a/torch/csrc/jit/passes/peephole.cpp +++ 
b/torch/csrc/jit/passes/peephole.cpp @@ -19,7 +19,7 @@ namespace jit { // Conservatively compare two optionals. If both are undefined, assume // they aren't equal template -static bool mustBeEqual(const c10::optional& a, const c10::optional& b) { +static bool mustBeEqual(const std::optional& a, const c10::optional& b) { return a == b && a.has_value(); } diff --git a/torch/csrc/jit/passes/peephole_dict_idioms.cpp b/torch/csrc/jit/passes/peephole_dict_idioms.cpp index 4e2a56a9d06bd..d3a5cfa36261b 100644 --- a/torch/csrc/jit/passes/peephole_dict_idioms.cpp +++ b/torch/csrc/jit/passes/peephole_dict_idioms.cpp @@ -125,7 +125,7 @@ class DictNode { return 0; } - c10::optional getOrNullopt(const IValue& key) const { + std::optional getOrNullopt(const IValue& key) const { if (impl_ && impl_->contains(key)) { return impl_->get(key); } @@ -181,7 +181,7 @@ class PeepholeOptimizeDictIdiomsImpl { return cached->second; } - c10::optional getValueFromDict(Node* dict_creation_node, Value* key) { + std::optional getValueFromDict(Node* dict_creation_node, Value* key) { const DictNode& dict_node = getDictNode(dict_creation_node); auto key_opt = toIValue(key); // Key is not constant if we cannot convert to IValue @@ -195,7 +195,7 @@ class PeepholeOptimizeDictIdiomsImpl { return c10::nullopt; } - c10::optional computeLen(Node* dict_creation_node) { + std::optional computeLen(Node* dict_creation_node) { const DictNode& dict_node = getDictNode(dict_creation_node); if (dict_node.canOptimize()) { return static_cast(dict_node.size()); diff --git a/torch/csrc/jit/passes/peephole_list_idioms.cpp b/torch/csrc/jit/passes/peephole_list_idioms.cpp index 15f4c807335fd..9c106e13edf1f 100644 --- a/torch/csrc/jit/passes/peephole_list_idioms.cpp +++ b/torch/csrc/jit/passes/peephole_list_idioms.cpp @@ -14,7 +14,7 @@ namespace torch { namespace jit { -static c10::optional normalizeIndex(int64_t index, size_t len) { +static std::optional normalizeIndex(int64_t index, size_t len) { if (index < 0) { index = index + len; } @@ -129,7 +129,7 @@ struct ListLenRefiner { return block_refinements; }; - c10::optional tryFindRefinement(Value* v) { + std::optional tryFindRefinement(Value* v) { for (const auto& ref : active_refinements_) { auto maybe_refinement = ref->find(v); if (maybe_refinement != ref->end()) { diff --git a/torch/csrc/jit/passes/peephole_non_tensor.cpp b/torch/csrc/jit/passes/peephole_non_tensor.cpp index 5cd2b6c2ee65d..5fa9c89b1fb0e 100644 --- a/torch/csrc/jit/passes/peephole_non_tensor.cpp +++ b/torch/csrc/jit/passes/peephole_non_tensor.cpp @@ -19,7 +19,7 @@ namespace { * @post if there's one constant in two operands, then the second operand is * constant. 
*/ -c10::optional checkArithNode(Node& node) { +std::optional checkArithNode(Node& node) { if (node.inputs().size() != 2 || node.input(0)->type() != IntType::get() || node.input(1)->type() != IntType::get()) { return {}; diff --git a/torch/csrc/jit/passes/quantization/helper.cpp b/torch/csrc/jit/passes/quantization/helper.cpp index a4ac1f6fe4be9..8a74ec01086a5 100644 --- a/torch/csrc/jit/passes/quantization/helper.cpp +++ b/torch/csrc/jit/passes/quantization/helper.cpp @@ -235,7 +235,7 @@ std::vector _propagate_quant_binary_ops = { bool matchAtenFuncToUse( const Use& use, const std::string& func_name, - c10::optional n) { + std::optional n) { Node* node = use.user; return node->kind() == Symbol::aten(func_name) && (!n.has_value() || static_cast(n.value()) == use.offset); @@ -244,7 +244,7 @@ bool matchAtenFuncToUse( bool matchCallFuncToUse( const Use& use, const std::string& func_name, - c10::optional n) { + std::optional n) { Node* node = use.user; return node->kind() == prim::CallFunction && getFuncName(node->inputs()[0]) == func_name && @@ -316,7 +316,7 @@ bool isEmbeddingBagNonInput(Value* v) { return result; } -c10::optional getClampScalarInputUse(Value* v) { +std::optional getClampScalarInputUse(Value* v) { for (const auto& use : v->uses()) { for (const auto& aten_func : _clamp_funcs) { if (matchAtenFuncToUse(use, aten_func, 1) || @@ -493,7 +493,7 @@ bool isBinaryOpWithScalarInput(Node* n) { return isPropagateQuantBinaryOp(n) && isScalar(n->input(1)); } -c10::optional> getFixedQParams(Node* n) { +std::optional> getFixedQParams(Node* n) { static std::vector fixed_qparam_funcs; std::transform( _fixed_qparams_map.begin(), @@ -642,7 +642,7 @@ Module getInvokedModule(Module& module, Node* n, Value* self) { return findChildModule(module, path); } -c10::optional getInvokedModuleOpt( +std::optional getInvokedModuleOpt( const Module& module, Node* n, Value* self) { @@ -686,7 +686,7 @@ std::string removeTorchMangle(const std::string& orig_name) { return qualified_name; } -c10::optional getModuleName(Value* value) { +std::optional getModuleName(Value* value) { auto type = value->type()->cast(); if (type && type->name()) { return removeTorchMangle(type->name()->qualifiedName()); diff --git a/torch/csrc/jit/passes/quantization/helper.h b/torch/csrc/jit/passes/quantization/helper.h index b5a5adf40b65c..680e3c7ca43d5 100644 --- a/torch/csrc/jit/passes/quantization/helper.h +++ b/torch/csrc/jit/passes/quantization/helper.h @@ -32,7 +32,7 @@ TORCH_API bool isBiasOfConvOrLinear(Value* v); TORCH_API bool isEmbeddingBagNonInput(Value* v); // Get the use as scalar input of clamp ops for the input value -c10::optional getClampScalarInputUse(Value* v); +std::optional getClampScalarInputUse(Value* v); // For a given value `v`, get the list of values that we need to check // if they are observed/quantized or not, if so, we can say the @@ -59,7 +59,7 @@ TORCH_API bool hitGraphInput(Value* value); TORCH_API std::string removeTorchMangle(const std::string& orig_name); // Return the module name that corresponds to the value. 
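In the quantization/helper.cpp hunks above, matchAtenFuncToUse and matchCallFuncToUse take an optional argument-position parameter: passing a value pins the use to a specific argument slot, while passing nullopt accepts any slot. A compact sketch of that optional-filter pattern with simplified stand-in types (not the real Use/Node classes):

// Sketch of an optional position filter, as in matchAtenFuncToUse's nth_arg:
// nullopt means "match any argument position", a value pins a specific slot.
#include <cstddef>
#include <iostream>
#include <optional>
#include <string>

struct Use {
  std::string func_name;
  size_t offset;  // which argument slot of the call this use occupies
};

bool matchFuncToUse(const Use& use,
                    const std::string& func_name,
                    std::optional<size_t> nth_arg) {
  return use.func_name == func_name &&
         (!nth_arg.has_value() || nth_arg.value() == use.offset);
}

int main() {
  Use u{"clamp", 1};
  std::cout << matchFuncToUse(u, "clamp", std::nullopt) << "\n";  // 1: any slot
  std::cout << matchFuncToUse(u, "clamp", 1) << "\n";             // 1: slot matches
  std::cout << matchFuncToUse(u, "clamp", 0) << "\n";             // 0: wrong slot
}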
-TORCH_API c10::optional getModuleName(Value* value); +TORCH_API std::optional getModuleName(Value* value); // =========== helper functions for Node ========= TORCH_API bool isSingleInputGeneralShapeAtenFunction(Node* n); @@ -91,7 +91,7 @@ TORCH_API bool isPropagateQuantOp(Node* n); // quantized::{op}_scalar TORCH_API bool isBinaryOpWithScalarInput(Node* n); -TORCH_API c10::optional> getFixedQParams( +TORCH_API std::optional> getFixedQParams( Node* n); // We don't want to analyze the graph for some `builtin` CallFunctions @@ -121,14 +121,14 @@ TORCH_API std::shared_ptr getCallFunctionGraph(Node* n); bool matchCallFuncToUse( const Use& use, const std::string& func_name, - c10::optional nth_arg); + std::optional nth_arg); // Check if `use` is a AtenFunction of name `func_name` and if value // `v` is the nth argument (if provided) of the function bool matchAtenFuncToUse( const Use& use, const std::string& func_name, - c10::optional nth_arg); + std::optional nth_arg); // =========== helper functions for Block ========= // checks if a block will always raise an Exception @@ -151,7 +151,7 @@ TORCH_API Module getInvokedModule(Module& module, Node* n, Value* self); // Given an CallMethod node, get the module instance corresponding // to the instance Value if the instance is a module, otherwise return // c10::nullopt -c10::optional getInvokedModuleOpt( +std::optional getInvokedModuleOpt( const Module& module, Node* n, Value* self); diff --git a/torch/csrc/jit/passes/quantization/insert_observers.cpp b/torch/csrc/jit/passes/quantization/insert_observers.cpp index f514fbc193ddd..e5df64f1929c7 100644 --- a/torch/csrc/jit/passes/quantization/insert_observers.cpp +++ b/torch/csrc/jit/passes/quantization/insert_observers.cpp @@ -20,12 +20,12 @@ namespace torch { namespace jit { -using ModuleQConfigMap = std::unordered_map>; +using ModuleQConfigMap = std::unordered_map>; namespace { struct OptionalQConfigHash { - inline size_t operator()(const c10::optional& qconfig_opt) const { + inline size_t operator()(const std::optional& qconfig_opt) const { if (qconfig_opt.has_value()) { const auto& m1 = std::get<0>(*qconfig_opt); const auto& m2 = std::get<1>(*qconfig_opt); @@ -36,9 +36,9 @@ struct OptionalQConfigHash { } }; using QConfigTypePtrMap = - std::unordered_map, TypePtr, OptionalQConfigHash>; + std::unordered_map, TypePtr, OptionalQConfigHash>; using NameModuleVector = std::vector>; -using OptionalModuleVector = std::vector>; +using OptionalModuleVector = std::vector>; using ModuleMethodVector = std::vector>; using graph_rewrite_helper::PatternInfo; using graph_rewrite_helper::replaceConvolutionWithAtenConv; @@ -49,8 +49,8 @@ void fillQConfigMap( const QConfigDict& qconfig_dict, ModuleQConfigMap& map, const std::string& key = "", - const c10::optional& parent_qconfig = c10::nullopt) { - c10::optional qconfig; + const std::optional& parent_qconfig = c10::nullopt) { + std::optional qconfig; if (qconfig_dict.find(key) != qconfig_dict.end()) { GRAPH_DEBUG("Got module config for key:", key); qconfig = qconfig_dict.at(key); @@ -187,7 +187,7 @@ class ModuleCloneHelper { const Module& source, Module& target, const ModuleQConfigMap& module_qconfig_map, - const std::function)>& + const std::function)>& type_remap_fn) { // remap of %self will be done outside of the function // and we don't support the case when people pass in @@ -239,7 +239,7 @@ class ModuleCloneHelper { const Module& source, Module& target, const ModuleQConfigMap& module_qconfig_map, - const std::function)>& + const std::function)>& type_remap_fn) { 
remapTypes( graph->block(), @@ -257,7 +257,7 @@ class ModuleCloneHelper { const ModuleQConfigMap& module_qconfig_map, const std::unordered_map& type_remap) { auto type_remap_fn = [&](TypePtr type_ptr, - const c10::optional& qconfig) { + const std::optional& qconfig) { if (type_remap.find(type_ptr) != type_remap.end()) { const auto& qconfig_map = type_remap.at(type_ptr); if (qconfig_map.find(qconfig) != qconfig_map.end()) { @@ -401,7 +401,7 @@ class InsertObserversHelper { // Uses the state created by fillBoundaryValueMap and fillValueObserverMap // to return an observer configured for a value, if it is needed. - c10::optional getObserverFor(Value* v); + std::optional getObserverFor(Value* v); // Uses the state created by fillPassThroughValueMap to propagage observed // property which should pass through from inputs to outputs. @@ -1312,13 +1312,13 @@ void InsertObserversHelper::fillValueObserverMap( } } -c10::optional InsertObserversHelper::getObserverFor(Value* v) { +std::optional InsertObserversHelper::getObserverFor(Value* v) { if (observer_for_value_.count(v)) { auto observer = observer_for_value_.at(v); GRAPH_DEBUG("Got observer module config for:", v->debugName()); return observer; } - c10::optional result; + std::optional result; if (boundary_value_map_.count(v)) { for (Value* next : boundary_value_map_.at(v)) { GRAPH_DEBUG( @@ -1384,9 +1384,9 @@ InsertObserversHelper::insertObserversFor( // the graph itself can be shared std::unordered_set inputs_outputs; // list of observer modules for input values - std::vector> block_input_observers; + std::vector> block_input_observers; // list of observer modules for output values - std::vector> block_output_observers; + std::vector> block_output_observers; // if the current block is the block for entry point graph(the forward graph // of the top level module), we can insert observers in the block directly diff --git a/torch/csrc/jit/passes/quantization/insert_observers.h b/torch/csrc/jit/passes/quantization/insert_observers.h index 6fa7fe0449112..e8857318261c8 100644 --- a/torch/csrc/jit/passes/quantization/insert_observers.h +++ b/torch/csrc/jit/passes/quantization/insert_observers.h @@ -18,7 +18,7 @@ namespace torch { namespace jit { using QConfig = std::tuple; -using QConfigDict = std::unordered_map>; +using QConfigDict = std::unordered_map>; /** \brief Insert observer module and observer function call for * the Tensors that needs to be observed. diff --git a/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp b/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp index 93683a308dc86..02f4f10969760 100644 --- a/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp +++ b/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp @@ -59,7 +59,7 @@ bool isWeight(Module& module, Value* v) { if (isWeight(v)) { return true; } - c10::optional result; + std::optional result; auto* self = v->owningGraph()->inputs()[0]; for (const Use& u : v->uses()) { Node* n = u.user; @@ -221,7 +221,7 @@ Node* insertFP16CastOps(Graph* graph, Value* observer_out) { } // find the observer for Value `v` and return the name of the observer -c10::optional findObserverName(Value* v) { +std::optional findObserverName(Value* v) { // Note that here we just check for the name of observer, but the ideally // we should be comparing the type of observer, this is a temporary // work around until data only clone of module.clone is supported. 
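The insert_observers.cpp hunks earlier in this block key a per-type map on an optional QConfig, so OptionalQConfigHash has to cover both the engaged case (hashing the two observer modules) and the empty case. A reduced sketch of using std::optional as an unordered_map key with a custom hasher, with a pair of ints standing in for the real QConfig:

// Sketch: using std::optional as an unordered_map key with a custom hasher.
// FakeQConfig is a stand-in for the real QConfig tuple of observer modules.
#include <functional>
#include <iostream>
#include <optional>
#include <string>
#include <unordered_map>
#include <utility>

using FakeQConfig = std::pair<int, int>;  // stand-in for (activation, weight) observers

struct OptionalQConfigHash {
  size_t operator()(const std::optional<FakeQConfig>& qconfig_opt) const {
    if (qconfig_opt.has_value()) {
      // Combine the hashes of both members when a qconfig is present.
      return std::hash<int>()(qconfig_opt->first) ^
             (std::hash<int>()(qconfig_opt->second) << 1);
    }
    return 0;  // all empty qconfigs hash (and compare) the same
  }
};

int main() {
  std::unordered_map<std::optional<FakeQConfig>, std::string, OptionalQConfigHash> m;
  m[std::nullopt] = "no qconfig: type left unquantized";
  m[FakeQConfig{1, 2}] = "per-module qconfig A";
  std::cout << m[std::nullopt] << "\n";
  std::cout << m[FakeQConfig{1, 2}] << "\n";
}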
@@ -258,7 +258,7 @@ at::ScalarType getObserverDtype(Module& module, Value* v) { return at::ScalarType::Undefined; } -c10::optional getEmbeddingBagObsName( +std::optional getEmbeddingBagObsName( script::Module& module, Node* n) { Value* v = n->output(); @@ -273,7 +273,7 @@ c10::optional getEmbeddingBagObsName( bool isEmbeddingBagOp( Node* observer, - c10::optional embedding_bag_name) { + std::optional embedding_bag_name) { return embedding_bag_name && embedding_bag_name.value().find("embedding_bag_") != std::string::npos; } @@ -791,7 +791,7 @@ class InsertQuantDeQuantHelper { Value* original_output, const std::vector& inputs, bool is_scalar = false, - const c10::optional>& qparams_opt = + const std::optional>& qparams_opt = c10::nullopt); bool isQuantized(Value* v) { @@ -1125,7 +1125,7 @@ ModuleMethodVector InsertQuantDeQuantHelper::getInvokedMethods( if (n->kind() == prim::CallMethod) { auto module_instance = n->inputs()[0]; auto module_method_name = n->s(attr::name); - c10::optional m; + std::optional m; // calling method on self if (module_instance == graph->inputs()[0]) { m = module; @@ -1152,7 +1152,7 @@ void InsertQuantDeQuantHelper::propagateQParams( Value* original_output, const std::vector& inputs, bool is_scalar, - const c10::optional>& qparams_opt) { + const std::optional>& qparams_opt) { Node* n = original_output->node(); Graph* graph = n->owningGraph(); if (is_scalar) { @@ -1248,7 +1248,7 @@ void removeDequantizeFromInputs(const std::unordered_set& inputs) { // Check if we need to propagate the quantization ops from input to // output -c10::optional> getDequantizedInputs(Value* output) { +std::optional> getDequantizedInputs(Value* output) { auto inputs = getPassThroughInputs(output); if (!inputs.empty()) { // note that we don't need to recursively check for prim::If diff --git a/torch/csrc/jit/passes/remove_mutation.cpp b/torch/csrc/jit/passes/remove_mutation.cpp index 183c7894f0867..84b990f628336 100644 --- a/torch/csrc/jit/passes/remove_mutation.cpp +++ b/torch/csrc/jit/passes/remove_mutation.cpp @@ -360,7 +360,7 @@ bool RemoveListMutation(const std::shared_ptr& graph) { bool RemoveTensorMutation( const std::shared_ptr& graph, - c10::optional> mutation_filter) { + std::optional> mutation_filter) { MutationRemover mr(graph, std::move(mutation_filter)); return mr.removeTensorMutation(); } diff --git a/torch/csrc/jit/passes/remove_mutation.h b/torch/csrc/jit/passes/remove_mutation.h index eb8cf195ee4ca..be8fc12b11f3d 100644 --- a/torch/csrc/jit/passes/remove_mutation.h +++ b/torch/csrc/jit/passes/remove_mutation.h @@ -11,7 +11,7 @@ namespace jit { struct TORCH_API MutationRemover { MutationRemover( std::shared_ptr graph, - c10::optional> mutation_filter = c10::nullopt) + std::optional> mutation_filter = c10::nullopt) : mutation_filter_(mutation_filter), aliasDb_(nullptr), graph_(std::move(graph)) {} @@ -55,7 +55,7 @@ struct TORCH_API MutationRemover { return aliasDb_.get(); } - c10::optional> mutation_filter_; + std::optional> mutation_filter_; std::unique_ptr aliasDb_ = nullptr; std::shared_ptr graph_; }; @@ -71,7 +71,7 @@ TORCH_API bool RemoveListMutation(const std::shared_ptr& graph); // return true if graph is modified TORCH_API bool RemoveTensorMutation( const std::shared_ptr& graph, - c10::optional> mutation_filter = c10::nullopt); + std::optional> mutation_filter = c10::nullopt); // Replaces in-place aten activation ops with their functional equivalence TORCH_API bool InplaceToFunctionalActivation( diff --git a/torch/csrc/jit/passes/replacement_of_old_operators.cpp 
b/torch/csrc/jit/passes/replacement_of_old_operators.cpp index 430cd4f743fdc..38255ad141877 100644 --- a/torch/csrc/jit/passes/replacement_of_old_operators.cpp +++ b/torch/csrc/jit/passes/replacement_of_old_operators.cpp @@ -30,7 +30,7 @@ struct OldOpsReplacerWithUpgraders { Node* node = graph_it.next(); while (node) { // load the schema name for this op - c10::optional schema_name = c10::nullopt; + std::optional schema_name = c10::nullopt; if (auto op_schema = node->maybeSchema()) { schema_name = getFullSchemaName(*op_schema); } else { diff --git a/torch/csrc/jit/passes/shape_analysis.cpp b/torch/csrc/jit/passes/shape_analysis.cpp index 706a17bf13e02..abc7bb6411dba 100644 --- a/torch/csrc/jit/passes/shape_analysis.cpp +++ b/torch/csrc/jit/passes/shape_analysis.cpp @@ -153,7 +153,7 @@ bool containsTensorType(const TypePtr& t) { // for each node in the schema with type Tensor, extract the T type // returns c10::nullopt if any Tensor in the schema does not have a known // shape ignores non-tensor in the list of inputs -c10::optional> gatherTensorTypes( +std::optional> gatherTensorTypes( Node* node, bool complete = false) { std::vector tensor_types; @@ -209,7 +209,7 @@ c10::ScalarType unionScalarTypes( // new type promotion logic. See tensor_attributes.rst for details. // This doesn't handle the case of arithmetic ops with Scalar arguments (when // `Tensor.getUnsafeTensorImpl()->is_wrapped_number()` would return true) -c10::optional getPromotedTypeForArithmeticOp(Node* node) { +std::optional getPromotedTypeForArithmeticOp(Node* node) { c10::ScalarType dimmed = c10::ScalarType::Undefined; c10::ScalarType zerodim = c10::ScalarType::Undefined; // binary arithmetic ops, more than 2 args is alpha. @@ -741,7 +741,7 @@ class ShapePropagator : public PropertyPropBase { return setUnshapedType(node); } - static c10::optional determineListSize(Value* list) { + static std::optional determineListSize(Value* list) { AT_ASSERT(list->type()->cast()); if (auto shape = constant_as>(list)) { return shape->size(); @@ -769,7 +769,7 @@ class ShapePropagator : public PropertyPropBase { bool PropagateTensorShapeOnNode(Node* node, bool insert_expands) { static const auto broadcast = [](std::vector& tensor_types, - c10::optional t) -> TensorTypePtr { + std::optional t) -> TensorTypePtr { if (tensor_types.size() == 1) { return tensor_types[0]->dimensionedOnly()->withScalarType(t); } @@ -1244,7 +1244,7 @@ class ShapePropagator : public PropertyPropBase { static const auto reduce_op_handler = [](Node* node, int64_t num_reduced_dim = 0, bool upcast_integer = false, - c10::optional opt_dtype = + std::optional opt_dtype = c10::nullopt) -> type_vec_t { if (auto type = node->input(0)->type()->cast()) { if (!type->scalarType() || !type->dim()) { diff --git a/torch/csrc/jit/passes/symbolic_shape_analysis.cpp b/torch/csrc/jit/passes/symbolic_shape_analysis.cpp index 96aa425b291a1..1765e65d02a6e 100644 --- a/torch/csrc/jit/passes/symbolic_shape_analysis.cpp +++ b/torch/csrc/jit/passes/symbolic_shape_analysis.cpp @@ -65,7 +65,7 @@ namespace jit { struct ShapeArg : public std:: - pair, c10::optional> { + pair, c10::optional> { using pair::pair; static ShapeArg unknownInteger() { @@ -87,11 +87,11 @@ struct ShapeArg } } - c10::optional asConstantInt() const { + std::optional asConstantInt() const { return this->second; } - c10::optional asShapeSymbol() const { + std::optional asShapeSymbol() const { return this->first; } @@ -208,7 +208,7 @@ bool isListOfTensors(const TypePtr& type) { type->cast()->getElementType()->cast(); } 
-c10::optional normIndex(int64_t index, size_t len) { +std::optional normIndex(int64_t index, size_t len) { if (index < 0) { index = index + len; } @@ -255,7 +255,7 @@ c10::SymbolicShape extractListShape( return c10::SymbolicShape(); } Node* list_construct = list->node(); - std::vector> output_shape; + std::vector> output_shape; for (Value* input : list_construct->inputs()) { if (symbolic_shape_values.count(input)) { output_shape.emplace_back(symbolic_shape_values[input]); @@ -605,7 +605,7 @@ struct SymbolicShapeOpAnalyzer { shape_compute_graph_ = graph->copy(); } - c10::optional> run( + std::optional> run( std::vector& inputs) { if (!shape_compute_graph_) { return c10::nullopt; @@ -813,7 +813,7 @@ struct SymbolicShapeGraphAnalyzer { beg_->owningBlock() == end_->owningBlock() && end_->isAfter(beg_)); } - c10::optional run() { + std::optional run() { AliasDb db(graph_); std::unordered_map> partial_evaluated_graphs = propagateShapesAndGatherPartialEvalShapeGraphs(db); @@ -1120,7 +1120,7 @@ void PropagateShapesOnGraph(std::shared_ptr& graph) { PropagateShapesOnBlock(graph->block(), db); } -c10::optional +std::optional PropagateShapesAndBuildLargeShapeComputeGraph( std::shared_ptr& graph, Node* beg, @@ -1128,7 +1128,7 @@ PropagateShapesAndBuildLargeShapeComputeGraph( return SymbolicShapeGraphAnalyzer(graph, beg, end).run(); } -TORCH_API c10::optional> +TORCH_API std::optional> calculateSymbolicShapesOnOp( const FunctionSchema* schema, const std::vector& inputs) { diff --git a/torch/csrc/jit/passes/symbolic_shape_analysis.h b/torch/csrc/jit/passes/symbolic_shape_analysis.h index 824740792aaf0..f5a17f2c5e550 100644 --- a/torch/csrc/jit/passes/symbolic_shape_analysis.h +++ b/torch/csrc/jit/passes/symbolic_shape_analysis.h @@ -36,7 +36,7 @@ struct ShapeComputeGraphMapping { std::unordered_map graph_output_to_symbolic_shape_dim_; }; -TORCH_API c10::optional +TORCH_API std::optional PropagateShapesAndBuildLargeShapeComputeGraph( std::shared_ptr& graph, Node* beg, @@ -50,7 +50,7 @@ TORCH_API bool setSymbolicShapeAnalysisTestMode(bool value); TORCH_API bool symbolicShapeAnalysisTestModeEnabled(); using SSAInput = std::variant; -TORCH_API c10::optional> +TORCH_API std::optional> calculateSymbolicShapesOnOp( const FunctionSchema* schema, const std::vector& inputs); diff --git a/torch/csrc/jit/passes/symbolic_shape_cache.cpp b/torch/csrc/jit/passes/symbolic_shape_cache.cpp index be8179f18786d..4a742b3f5f635 100644 --- a/torch/csrc/jit/passes/symbolic_shape_cache.cpp +++ b/torch/csrc/jit/passes/symbolic_shape_cache.cpp @@ -109,7 +109,7 @@ TORCH_API void cache_shape_function( shapeCache.Add(std::move(cache_key), std::move(can_ret_vec)); } -TORCH_API c10::optional> +TORCH_API std::optional> get_cached_shape_function( const FunctionSchema* schema, const std::vector& arg_vec) { diff --git a/torch/csrc/jit/passes/symbolic_shape_cache.h b/torch/csrc/jit/passes/symbolic_shape_cache.h index 02e00acac08d2..b842c731c0ce4 100644 --- a/torch/csrc/jit/passes/symbolic_shape_cache.h +++ b/torch/csrc/jit/passes/symbolic_shape_cache.h @@ -31,7 +31,7 @@ struct TORCH_API CanonicalizedSymbolicShape { const CanonicalizedSymbolicShape& b); private: - c10::optional> values_; + std::optional> values_; void init( const c10::SymbolicShape& orig_shape, @@ -39,7 +39,7 @@ struct TORCH_API CanonicalizedSymbolicShape { }; // SHAPE CACHE API -TORCH_API c10::optional> +TORCH_API std::optional> get_cached_shape_function( const FunctionSchema* schema, const std::vector& arg_vec); diff --git 
a/torch/csrc/jit/passes/symbolic_shape_runtime_fusion.cpp b/torch/csrc/jit/passes/symbolic_shape_runtime_fusion.cpp index b4902a1d5a0d4..9c213f2480d51 100644 --- a/torch/csrc/jit/passes/symbolic_shape_runtime_fusion.cpp +++ b/torch/csrc/jit/passes/symbolic_shape_runtime_fusion.cpp @@ -178,7 +178,7 @@ static StrideInput summarizeOutputStrides(const TensorType& tt) { // Also summarize input striding behavior. The Size information is stored on the // type, The striding is returned. See StrideInput for description of stride // specializations -static c10::optional>> +static std::optional>> TryGeneralizeInputDimensionsToSymbolicShapes( std::shared_ptr tensorexpr_graph) { std::map shape_to_sym_shape; diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp index cd95af3424dc2..c9b9b974600dc 100644 --- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp +++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp @@ -780,7 +780,7 @@ class TensorExprFuser { } } - c10::optional tryMerge(Node* fusion_group, Node* to_merge) { + std::optional tryMerge(Node* fusion_group, Node* to_merge) { if (!canMerge(fusion_group, to_merge)) { return c10::nullopt; } diff --git a/torch/csrc/jit/passes/update_differentiable_graph_requires_grad.cpp b/torch/csrc/jit/passes/update_differentiable_graph_requires_grad.cpp index b926939910c3a..15cefadd8cc76 100644 --- a/torch/csrc/jit/passes/update_differentiable_graph_requires_grad.cpp +++ b/torch/csrc/jit/passes/update_differentiable_graph_requires_grad.cpp @@ -8,7 +8,7 @@ namespace jit { static void UpdateDifferentiableGraphRequiresGrad( Block* block, - c10::optional new_requires_grad) { + std::optional new_requires_grad) { for (Node* n : block->nodes()) { for (Value* v : n->inputs()) { auto ty = v->type()->cast(); @@ -31,7 +31,7 @@ static void UpdateDifferentiableGraphRequiresGrad( void UpdateDifferentiableGraphRequiresGrad( std::shared_ptr& diff_forward_graph, - c10::optional new_requires_grad) { + std::optional new_requires_grad) { UpdateDifferentiableGraphRequiresGrad( diff_forward_graph->block(), new_requires_grad); } diff --git a/torch/csrc/jit/passes/update_differentiable_graph_requires_grad.h b/torch/csrc/jit/passes/update_differentiable_graph_requires_grad.h index eb51ba00c4c9f..0ba8696088934 100644 --- a/torch/csrc/jit/passes/update_differentiable_graph_requires_grad.h +++ b/torch/csrc/jit/passes/update_differentiable_graph_requires_grad.h @@ -14,7 +14,7 @@ namespace jit { // the types of prim::profiles TORCH_API void UpdateDifferentiableGraphRequiresGrad( std::shared_ptr& diff_forward_graph, - c10::optional new_requires_grad); + std::optional new_requires_grad); } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/passes/utils/check_alias_annotation.cpp b/torch/csrc/jit/passes/utils/check_alias_annotation.cpp index d538e33a21359..4c081200715a7 100644 --- a/torch/csrc/jit/passes/utils/check_alias_annotation.cpp +++ b/torch/csrc/jit/passes/utils/check_alias_annotation.cpp @@ -188,7 +188,7 @@ const Node* findNodeForOp( // Handle a few special cases where we need to propagate constants // manually // TODO(suo): we should be able to move this stuff to constant prop -c10::optional toIValueProp(const Value* v) { +std::optional toIValueProp(const Value* v) { if (v->node()->kind() == prim::ListConstruct) { std::vector genericList; for (auto input : v->node()->inputs()) { diff --git a/torch/csrc/jit/passes/utils/memory_dag.h b/torch/csrc/jit/passes/utils/memory_dag.h index f3068588dae85..da5584f9d4bd3 100644 --- 
a/torch/csrc/jit/passes/utils/memory_dag.h +++ b/torch/csrc/jit/passes/utils/memory_dag.h @@ -62,9 +62,9 @@ struct Element { // We memoize the results of `getMemoryLocations` to speed up queries. // A nullopt means that this cache is not yet populated. Since `MemoryDAG` is // immutable, this cache should never need to be invalidated. - mutable c10::optional cachedMemoryLocations_; + mutable std::optional cachedMemoryLocations_; - mutable c10::optional cachedAllContainedMemoryLocations_; + mutable std::optional cachedAllContainedMemoryLocations_; }; // class MemoryDAG diff --git a/torch/csrc/jit/passes/utils/subgraph_utils.cpp b/torch/csrc/jit/passes/utils/subgraph_utils.cpp index 36515e9e849e3..1bb82432e218f 100644 --- a/torch/csrc/jit/passes/utils/subgraph_utils.cpp +++ b/torch/csrc/jit/passes/utils/subgraph_utils.cpp @@ -18,9 +18,9 @@ bool hasSubgraph(Node* n) { return n->hasAttribute(attr::Subgraph); } -std::vector> gatherLastUses( +std::vector> gatherLastUses( at::ArrayRef values) { - return fmap(values, [&](Value* v) -> c10::optional { + return fmap(values, [&](Value* v) -> std::optional { return firstOrLastUse(v, /*find_first*/ false); }); } @@ -38,7 +38,7 @@ struct ValueMapper { ValueMapper( Node* to_merge, AliasDb& db, - c10::optional existing_subgraph) { + std::optional existing_subgraph) { last_uses_ = gatherLastUses(to_merge->outputs()); if (existing_subgraph) { existing_last_uses_ = gatherLastUses((*existing_subgraph)->outputs()); @@ -91,14 +91,14 @@ struct ValueMapper { placeholder_node_->destroy(); } - std::vector> last_uses_; - std::vector> existing_last_uses_; + std::vector> last_uses_; + std::vector> existing_last_uses_; Node* placeholder_node_; }; Node* executeSubgraphMergeAndUpdateAliasing( Node* to_merge, - c10::optional existing, + std::optional existing, AliasDb& db, const std::function& merge_fn) { // When we merge a node into a subgraph, the new subgraph outputs diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp index a5e3c6059bc84..290a10d06af5a 100644 --- a/torch/csrc/jit/python/init.cpp +++ b/torch/csrc/jit/python/init.cpp @@ -151,7 +151,7 @@ static bool opAllowsNumbersAsTensors(c10::Symbol symbol) { torch::should_allow_numbers_as_tensors(symbol.toUnqualString())); } -c10::optional toTypeInferredIValueOptional(py::handle input) { +std::optional toTypeInferredIValueOptional(py::handle input) { // Errors need to be caught here because toTypeInferredIValue errors out // on various object types, but we want it to work with all types. 
try { @@ -217,7 +217,7 @@ void initJITBindings(PyObject* module) { []() { return c10::ShapeSymbol::newSymbol().value(); }) .def( "_jit_shape_compute_graph_for_node", - [](Node* n) -> c10::optional> { + [](Node* n) -> std::optional> { if (!n->maybeSchema()) { return c10::nullopt; } @@ -225,7 +225,7 @@ void initJITBindings(PyObject* module) { }) .def( "_jit_decomposition_graph_for_node", - [](Node* n) -> c10::optional> { + [](Node* n) -> std::optional> { if (!n->maybeSchema()) { return c10::nullopt; } @@ -320,7 +320,7 @@ void initJITBindings(PyObject* module) { int quant_type_int) { auto dict = py::cast>>>(qconfig_dict); + std::optional>>>(qconfig_dict); auto quant_type = static_cast(quant_type_int); return InsertObservers( module, method_name, dict, inplace, quant_type); @@ -339,7 +339,7 @@ void initJITBindings(PyObject* module) { int quant_type_int) { auto dict = py::cast>>>(qconfig_dict); + std::optional>>>(qconfig_dict); auto quant_type = static_cast(quant_type_int); return InsertObserversForOnDevicePTQ( module, method_name, dict, inplace, quant_type); @@ -1389,14 +1389,36 @@ void initJITBindings(PyObject* module) { return size; } py::gil_scoped_acquire acquire; - auto memory_view = py::memoryview::from_memory( - reinterpret_cast(data), size); - buffer.attr("write")(std::move(memory_view)); + if (!data) { + // See [Note: write_record_metadata] + buffer.attr("seek")( + size, py::module::import("os").attr("SEEK_CUR")); + } else { + auto memory_view = py::memoryview::from_memory( + reinterpret_cast(data), size); + buffer.attr("write")(std::move(memory_view)); + } return size; }; return std::make_unique(std::move(writer_func)); })) .def(py::init&>()) + // [Note: write_record_metadata] + // The write_record_metadata function is intended to write metadata (i.e. + // the zipfile header and end of central directory record) for a file + // while reserving nbytes of space for the file for the bytes of the + // actual file to be added in later. This functionality is achieved by + // defining `m_pWrite` to seek instead of write if the buffer passed is a + // nullptr. This has implications on CRC-32 which will not be written at + // write_record_metadata time, and will not be combined with the hash in + // combined_uncomp_crc32_. We define this in `m_pWrite` rather than + // extending the interface of miniz to have an `m_pSeek` since different + // versions of miniz are used in fbcode/oss. + .def( + "write_record_metadata", + [](PyTorchStreamWriter& self, const std::string& name, size_t size) { + return self.writeRecord(name, nullptr, size); + }) .def( "write_record", [](PyTorchStreamWriter& self, @@ -1652,7 +1674,7 @@ void initJITBindings(PyObject* module) { auto func_dk = py::cpp_function( [op, symbol, allow_numbers_as_tensors]( c10::DispatchKey dk_, py::args args, py::kwargs kwargs) { - c10::optional dk = + std::optional dk = c10::make_optional(dk_); ToIValueAllowNumbersAsTensors g(allow_numbers_as_tensors); return _get_operation_for_overload_or_packet( @@ -1821,7 +1843,7 @@ void initJITBindings(PyObject* module) { [](SchemaInfo& self, const std::string& name, const py::object& value) { - c10::optional i_value = toTypeInferredIValueOptional(value); + std::optional i_value = toTypeInferredIValueOptional(value); if (i_value) { // For normalization purposes there is an inconsistency within // torch.fx that turns all arguments named "self" into "input". 
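[Editorial sketch] The new write_record_metadata binding and the [Note: write_record_metadata] comment above describe writing a record's zipfile metadata while only reserving space for its payload: the writer callback seeks forward instead of writing when it is handed a null data pointer. The snippet below is a self-contained illustration of that control flow only; it is not PyTorch's serializer or miniz, and write_or_reserve / archive.bin are invented names.

// Writer callback sketch: data == nullptr means "reserve size bytes" by
// advancing the stream position; otherwise perform a normal write.
#include <cstddef>
#include <fstream>

std::size_t write_or_reserve(std::ofstream& out, const void* data, std::size_t size) {
  if (data == nullptr) {
    // Reserve room for record bytes that will be filled in later.
    out.seekp(static_cast<std::streamoff>(size), std::ios::cur);
  } else {
    out.write(static_cast<const char*>(data), static_cast<std::streamsize>(size));
  }
  return size;
}

int main() {
  std::ofstream out("archive.bin", std::ios::binary);
  const char header[4] = {'P', 'K', 0x03, 0x04};    // stand-in for record metadata
  write_or_reserve(out, header, sizeof(header));    // metadata written immediately
  write_or_reserve(out, nullptr, 16);               // payload space reserved, not written
  const char trailer[3] = {'e', 'n', 'd'};
  write_or_reserve(out, trailer, sizeof(trailer));  // later bytes land after the gap
}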
@@ -1841,7 +1863,7 @@ void initJITBindings(PyObject* module) { TORCH_INTERNAL_ASSERT( key.isString(), "Add argument value keys types should be strings."); - c10::optional value = + std::optional value = toTypeInferredIValueOptional(key_pair.second); if (value) { // For normalization purposes there is an inconsistency within @@ -2077,8 +2099,8 @@ void initJITBindings(PyObject* module) { py::call_guard()); m.def("_is_alias_of", [](const py::object& self, const py::object& other) { - c10::optional self_value = toTypeInferredIValueOptional(self); - c10::optional other_value = toTypeInferredIValueOptional(other); + std::optional self_value = toTypeInferredIValueOptional(self); + std::optional other_value = toTypeInferredIValueOptional(other); // Only return true if we are certain that self and other are aliasing. if (!self_value || !other_value) { @@ -2087,8 +2109,8 @@ void initJITBindings(PyObject* module) { return self_value->isAliasOf(*other_value); }); m.def("_overlaps", [](const py::object& self, const py::object& other) { - c10::optional self_value = toTypeInferredIValueOptional(self); - c10::optional other_value = toTypeInferredIValueOptional(other); + std::optional self_value = toTypeInferredIValueOptional(self); + std::optional other_value = toTypeInferredIValueOptional(other); // Only return true if we are certain that self and other are overlapping. if (!self_value || !other_value) { diff --git a/torch/csrc/jit/python/module_python.h b/torch/csrc/jit/python/module_python.h index 3ab34f5cd8e77..5c7fbbb42d6cf 100644 --- a/torch/csrc/jit/python/module_python.h +++ b/torch/csrc/jit/python/module_python.h @@ -8,7 +8,7 @@ namespace py = pybind11; namespace torch::jit { -inline c10::optional as_module(py::handle obj) { +inline std::optional as_module(py::handle obj) { static py::handle ScriptModule = py::module::import("torch.jit").attr("ScriptModule"); if (py::isinstance(obj, ScriptModule)) { @@ -17,7 +17,7 @@ inline c10::optional as_module(py::handle obj) { return c10::nullopt; } -inline c10::optional as_object(py::handle obj) { +inline std::optional as_object(py::handle obj) { static py::handle ScriptObject = py::module::import("torch").attr("ScriptObject"); if (py::isinstance(obj, ScriptObject)) { diff --git a/torch/csrc/jit/python/pybind_utils.cpp b/torch/csrc/jit/python/pybind_utils.cpp index 23107d91d99ac..4cfe3309a766b 100644 --- a/torch/csrc/jit/python/pybind_utils.cpp +++ b/torch/csrc/jit/python/pybind_utils.cpp @@ -55,7 +55,7 @@ IValue listToIValue(py::handle obj) { return c10::impl::toList(rs); } -IValue toIValue(py::handle obj, const TypePtr& type, c10::optional N) { +IValue toIValue(py::handle obj, const TypePtr& type, std::optional N) { switch (type->kind()) { case TypeKind::TensorType: { if (obj.ptr() == Py_None) { @@ -802,7 +802,7 @@ py::object invokeOperatorFromPython( const std::vector>& operations, py::args args, const py::kwargs& kwargs, - c10::optional dk) { + std::optional dk) { auto [found_op, stack] = getOpWithStack(operations, args, kwargs); { pybind11::gil_scoped_release no_gil_guard; @@ -881,7 +881,7 @@ py::object _get_operation_for_overload_or_packet( py::args args, const py::kwargs& kwargs, bool is_overload, - c10::optional dk) { + std::optional dk) { std::string ns = symbol.ns().toUnqualString(); std::string method_name = symbol.toUnqualString(); std::string overload_name = operations[0]->schema().overload_name(); diff --git a/torch/csrc/jit/python/pybind_utils.h b/torch/csrc/jit/python/pybind_utils.h index a78c3e0c0be34..242da11af7c04 100644 --- 
a/torch/csrc/jit/python/pybind_utils.h +++ b/torch/csrc/jit/python/pybind_utils.h @@ -62,7 +62,7 @@ void clear_registered_instances(void* ptr); TORCH_PYTHON_API IValue toIValue( py::handle obj, const TypePtr& type, - c10::optional N = c10::nullopt); + std::optional N = c10::nullopt); TORCH_PYTHON_API py::object toPyObject(IValue ivalue); @@ -111,7 +111,7 @@ struct VISIBILITY_HIDDEN PythonFutureWrapper explicit PythonFutureWrapper( c10::intrusive_ptr fut, - c10::optional unwrap_func = c10::nullopt) + std::optional unwrap_func = c10::nullopt) : fut(std::move(fut)), unwrap_func(std::move(unwrap_func)) {} explicit PythonFutureWrapper(const PythonFutureWrapper&) = delete; @@ -232,7 +232,7 @@ struct VISIBILITY_HIDDEN PythonFutureWrapper c10::intrusive_ptr fut; // unwrap_func works like a callback for the value returned by // PythonFutureWrapper::wait(). - c10::optional unwrap_func; + std::optional unwrap_func; private: std::shared_ptr getPtr() { @@ -348,7 +348,7 @@ inline TypedIValue toDictKeyIValue(py::handle key) { } } -inline c10::optional unifyOrInitializeType( +inline std::optional unifyOrInitializeType( const TypePtr& accum, const TypePtr& unify) { if (!accum) { @@ -987,7 +987,7 @@ inline Stack createStackForSchema( const FunctionSchema& schema, const tuple_slice& args, const py::kwargs& kwargs, - c10::optional self) { + std::optional self) { size_t all_arguments = (self ? 1 : 0) + args.size() + kwargs.size(); if (all_arguments > schema.arguments().size()) { throw schema_match_error(c10::str( @@ -1102,7 +1102,7 @@ inline py::object runAndInsertCall( Function& callee, const tuple_slice& args, const py::kwargs& kwargs, - c10::optional self, + std::optional self, // Lambda that tells this function how to insert `callee` into the graph if // we're tracing. 
const std::function& @@ -1158,7 +1158,7 @@ inline py::object runAndInsertCall( return toPyObject(std::move(stack.back())); } -inline c10::optional maybeTorchFunctionDispatch( +inline std::optional maybeTorchFunctionDispatch( const py::object& callee, const tuple_slice& args_no_self, const py::kwargs& kwargs, @@ -1255,7 +1255,7 @@ TORCH_PYTHON_API py::object invokeOperatorFromPython( const std::vector>& operations, py::args args, const py::kwargs& kwargs, - c10::optional dk = c10::nullopt); + std::optional dk = c10::nullopt); TORCH_PYTHON_API py::tuple _maybe_handle_torch_function( const std::string& ns, @@ -1276,6 +1276,6 @@ TORCH_PYTHON_API py::object _get_operation_for_overload_or_packet( py::args args, const py::kwargs& kwargs, bool is_overload, - c10::optional dk = c10::nullopt); + std::optional dk = c10::nullopt); } // namespace torch::jit diff --git a/torch/csrc/jit/python/python_ir.cpp b/torch/csrc/jit/python/python_ir.cpp index 7c6c5089b6d38..2442ef0573545 100644 --- a/torch/csrc/jit/python/python_ir.cpp +++ b/torch/csrc/jit/python/python_ir.cpp @@ -131,7 +131,7 @@ void ConcretePythonOp::cloneFrom(Node* other_) { // recover the autograd.Function instance, if this PythonOp's function // was originally SomeFunction.apply // used in ONNX for discovering symbolics -c10::optional ConcretePythonOp::autogradFunction() const { +std::optional ConcretePythonOp::autogradFunction() const { pybind11::gil_scoped_acquire gil; // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) py::handle obj = const_cast(pyobj.get()); @@ -865,7 +865,7 @@ void initPythonIRBindings(PyObject* module_) { }) .def( "with_sizes", - [](Type& t, c10::optional>> sizes) + [](Type& t, std::optional>> sizes) -> py::object { auto ptt = t.expect(); if (!ptt) { diff --git a/torch/csrc/jit/python/python_ir.h b/torch/csrc/jit/python/python_ir.h index 296fc3f0b1f2e..26adf8c0e4941 100644 --- a/torch/csrc/jit/python/python_ir.h +++ b/torch/csrc/jit/python/python_ir.h @@ -42,7 +42,7 @@ struct ConcretePythonOp : public PythonOp { // recover the autograd.Function instance, if this PythonOp's function // was originally SomeFunction.apply // used in ONNX for discovering symbolics - c10::optional autogradFunction() const override; + std::optional autogradFunction() const override; void writeScalars(std::ostream& out) const override; void lint_python() const override; }; diff --git a/torch/csrc/jit/python/python_ivalue.h b/torch/csrc/jit/python/python_ivalue.h index f33ceca30f2d0..4cdc8e430b9a8 100644 --- a/torch/csrc/jit/python/python_ivalue.h +++ b/torch/csrc/jit/python/python_ivalue.h @@ -31,7 +31,7 @@ struct C10_EXPORT ConcretePyObjectHolder final : PyObjectHolder { return torch::jit::tryToInferType(py_obj_); } - IValue toIValue(const TypePtr& type, c10::optional N = c10::nullopt) + IValue toIValue(const TypePtr& type, std::optional N = c10::nullopt) override { pybind11::gil_scoped_acquire ag; return torch::jit::toIValue(py_obj_, type, N); diff --git a/torch/csrc/jit/python/python_list.h b/torch/csrc/jit/python/python_list.h index d70e653043c93..b5bb88b3aeb20 100644 --- a/torch/csrc/jit/python/python_list.h +++ b/torch/csrc/jit/python/python_list.h @@ -175,7 +175,7 @@ class ScriptList final { // Remove and return the element at the specified index from the list. If no // index is passed, the last element is removed and returned. 
- IValue pop(c10::optional idx = c10::nullopt) { + IValue pop(std::optional idx = c10::nullopt) { IValue ret; if (idx) { diff --git a/torch/csrc/jit/python/python_sugared_value.cpp b/torch/csrc/jit/python/python_sugared_value.cpp index 4b854c884d026..d6f014759c05e 100644 --- a/torch/csrc/jit/python/python_sugared_value.cpp +++ b/torch/csrc/jit/python/python_sugared_value.cpp @@ -24,7 +24,7 @@ std::string typeString(py::handle h) { return py::str(h.get_type().attr("__name__")); } -c10::optional as_function(const py::object& obj) { +std::optional as_function(const py::object& obj) { if (py::isinstance(obj)) { return py::cast(obj); } @@ -169,7 +169,7 @@ std::string PythonValue::kind() const { std::vector> PythonValue::asTuple( const SourceRange& loc, GraphFunction& m, - const c10::optional& size_hint) { + const std::optional& size_hint) { const std::string type_str = typeString(self); std::stringstream ss; ss << kind() << " cannot be used as a tuple"; @@ -927,7 +927,7 @@ std::shared_ptr BooleanDispatchValue::call( at::ArrayRef args, at::ArrayRef kwargs, size_t n_binders) { - c10::optional result; + std::optional result; Graph& graph = *(caller.graph()); auto index = py::cast(dispatched_fn_["index"]); diff --git a/torch/csrc/jit/python/python_sugared_value.h b/torch/csrc/jit/python/python_sugared_value.h index 35298e30b08a6..cb397796c9f55 100644 --- a/torch/csrc/jit/python/python_sugared_value.h +++ b/torch/csrc/jit/python/python_sugared_value.h @@ -27,12 +27,12 @@ std::shared_ptr toSugaredValue( const SourceRange& loc, bool is_constant = false); -c10::optional as_function(const py::object& obj); +std::optional as_function(const py::object& obj); struct VISIBILITY_HIDDEN PythonValue : public SugaredValue { PythonValue( py::object the_self, - c10::optional rcb = c10::nullopt, + std::optional rcb = c10::nullopt, Value* module_self = nullptr) : self(std::move(the_self)), rcb(std::move(rcb)), @@ -56,7 +56,7 @@ struct VISIBILITY_HIDDEN PythonValue : public SugaredValue { std::vector> asTuple( const SourceRange& loc, GraphFunction& m, - const c10::optional& size_hint = {}) override; + const std::optional& size_hint = {}) override; std::shared_ptr attr( const SourceRange& loc, @@ -79,7 +79,7 @@ struct VISIBILITY_HIDDEN PythonValue : public SugaredValue { // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes) py::object self; // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes) - c10::optional rcb; + std::optional rcb; // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes) Value* moduleSelf_ = nullptr; }; diff --git a/torch/csrc/jit/python/python_tracer.cpp b/torch/csrc/jit/python/python_tracer.cpp index bdc62d33568de..92e6e2d3ace23 100644 --- a/torch/csrc/jit/python/python_tracer.cpp +++ b/torch/csrc/jit/python/python_tracer.cpp @@ -45,7 +45,7 @@ std::vector _pythonCallstack() { SourceRange getPythonInterpreterSourceRange() { auto cs = pythonCallstack(); - c10::optional source_filename; + std::optional source_filename; size_t source_line = 0; std::stringstream stack_trace; for (const auto& entry : cs) { diff --git a/torch/csrc/jit/python/python_tree_views.cpp b/torch/csrc/jit/python/python_tree_views.cpp index a171314099c3e..50d18b908107e 100644 --- a/torch/csrc/jit/python/python_tree_views.cpp +++ b/torch/csrc/jit/python/python_tree_views.cpp @@ -12,7 +12,7 @@ namespace py = pybind11; namespace torch::jit { -c10::optional maybeConvertToString(const py::object& obj) { +std::optional maybeConvertToString(const py::object& obj) { if 
(obj.is_none()) { return c10::nullopt; } @@ -177,10 +177,10 @@ void initTreeViewBindings(PyObject* module) { [](const Property& property) { return property.getter().name(); }) .def("setter_name", [](const Property& property) { if (property.setter().present()) { - return c10::optional(property.setter().get().name()); + return std::optional(property.setter().get().name()); } - return c10::optional(c10::nullopt); + return std::optional(c10::nullopt); }); py::class_(m, "ClassDef") diff --git a/torch/csrc/jit/python/script_init.cpp b/torch/csrc/jit/python/script_init.cpp index 22809069f8809..971b6c76ca47e 100644 --- a/torch/csrc/jit/python/script_init.cpp +++ b/torch/csrc/jit/python/script_init.cpp @@ -207,7 +207,7 @@ void checkOverloadDecl(const Decl& new_decl, const Decl& old_decl) { } } -c10::optional tryCalculateDefaultParam( +std::optional tryCalculateDefaultParam( const Argument& arg, const py::object& def_value) { auto n = arg.N(); @@ -287,7 +287,7 @@ FunctionSchema getSchemaWithNameAndDefaults( auto it = default_args.find(arg.name()); if (it != default_args.end()) { checkMutableFunctionDefault(range, arg, it->second); - c10::optional value = tryCalculateDefaultParam(arg, it->second); + std::optional value = tryCalculateDefaultParam(arg, it->second); if (!value) { ErrorReport error(range); error << "Expected a default value of type " << arg.type()->repr_str() @@ -1369,10 +1369,10 @@ void initJitScriptBindings(PyObject* module) { [](std::shared_ptr self, const std::string& name) { auto fn = self->find_function(QualifiedName(name)); if (fn) { - return c10::optional( + return std::optional( StrongFunctionPtr(std::move(self), fn)); } else { - return c10::optional(c10::nullopt); + return std::optional(c10::nullopt); } }) .def( @@ -1852,7 +1852,7 @@ void initJitScriptBindings(PyObject* module) { py::object map_location, const py::dict& extra_files, bool restore_shapes = false) { - c10::optional optional_device; + std::optional optional_device; if (!map_location.is_none()) { AT_ASSERT(THPDevice_Check(map_location.ptr())); optional_device = @@ -1877,7 +1877,7 @@ void initJitScriptBindings(PyObject* module) { storage_context, py::object map_location, std::string ts_id) { - c10::optional optional_device; + std::optional optional_device; if (!map_location.is_none()) { AT_ASSERT(THPDevice_Check(map_location.ptr())); optional_device = @@ -1898,7 +1898,7 @@ void initJitScriptBindings(PyObject* module) { const py::dict& extra_files, bool restore_shapes = false) { std::istringstream in(buffer); - c10::optional optional_device; + std::optional optional_device; if (!map_location.is_none()) { AT_ASSERT(THPDevice_Check(map_location.ptr())); optional_device = @@ -1918,7 +1918,7 @@ void initJitScriptBindings(PyObject* module) { m.def( "_load_for_lite_interpreter", [](const std::string& filename, py::object map_location) { - c10::optional optional_device; + std::optional optional_device; if (!map_location.is_none()) { AT_ASSERT(THPDevice_Check(map_location.ptr())); optional_device = @@ -1930,7 +1930,7 @@ void initJitScriptBindings(PyObject* module) { "_load_for_lite_interpreter_from_buffer", [](const std::string& buffer, py::object map_location) { std::istringstream in(buffer); - c10::optional optional_device; + std::optional optional_device; if (!map_location.is_none()) { AT_ASSERT(THPDevice_Check(map_location.ptr())); optional_device = @@ -1975,7 +1975,7 @@ void initJitScriptBindings(PyObject* module) { m.def( "_get_model_extra_files", [](const std::string& filename, const py::dict& py_extra_files) { - 
c10::optional optional_device; + std::optional optional_device; ExtraFilesMap cpp_extra_files = ExtraFilesMap(); _load_for_mobile(filename, optional_device, cpp_extra_files); extra_files_to_python(cpp_extra_files, py_extra_files); @@ -1990,7 +1990,7 @@ void initJitScriptBindings(PyObject* module) { m.def( "_get_model_extra_files_from_buffer", [](const std::string& buffer, const py::dict& py_extra_files) { - c10::optional optional_device; + std::optional optional_device; ExtraFilesMap cpp_extra_files = ExtraFilesMap(); std::istringstream in(buffer); _load_for_mobile(in, optional_device, cpp_extra_files); @@ -2124,7 +2124,7 @@ void initJitScriptBindings(PyObject* module) { m.def( "_get_graph_executor_optimize", - [](c10::optional new_setting = c10::nullopt) { + [](std::optional new_setting = c10::nullopt) { bool old_value = getGraphExecutorOptimize(); if (new_setting) { setGraphExecutorOptimize(*new_setting); diff --git a/torch/csrc/jit/runtime/argument_spec.h b/torch/csrc/jit/runtime/argument_spec.h index 06c77edca718c..7a815e815d8e9 100644 --- a/torch/csrc/jit/runtime/argument_spec.h +++ b/torch/csrc/jit/runtime/argument_spec.h @@ -47,7 +47,7 @@ struct ArgumentInfo { return TensorType::get(); return TensorType::create( - type(), device(), c10::optional(dim()), requires_grad()); + type(), device(), std::optional(dim()), requires_grad()); } operator TypePtr() const { return toType(); @@ -460,10 +460,10 @@ inline CompleteArgumentInfo CompleteArgumentSpec::at(size_t i) const { return CompleteArgumentInfo(*this, i); } -inline c10::optional convertOptional( - c10::optional const& from) { - return (from) ? c10::optional(static_cast(*from)) - : c10::optional{}; +inline std::optional convertOptional( + std::optional const& from) { + return (from) ? std::optional(static_cast(*from)) + : std::optional{}; } } // namespace torch::jit @@ -475,7 +475,7 @@ struct hash> { size_t operator()(const c10::VaryingShape& vs) const { return c10::get_hash( vs.size(), - vs.size() ? vs.sizes().value() : std::vector>()); + vs.size() ? vs.sizes().value() : std::vector>()); } }; @@ -483,10 +483,10 @@ template <> struct hash { size_t operator()(const c10::TensorType& ptt) const { return c10::get_hash< - c10::optional, + std::optional, c10::VaryingShape, c10::VaryingShape, - c10::optional>( + std::optional>( torch::jit::convertOptional(ptt.scalarType()), ptt.sizes(), ptt.strides(), diff --git a/torch/csrc/jit/runtime/autodiff.cpp b/torch/csrc/jit/runtime/autodiff.cpp index 0d33abb217ee9..3987521f658f9 100644 --- a/torch/csrc/jit/runtime/autodiff.cpp +++ b/torch/csrc/jit/runtime/autodiff.cpp @@ -128,7 +128,7 @@ bool isDifferentiable(Graph& g) { // will be cleaned up later using EliminateDeadCode(block). TupleUnPack node in // backward graph will be removed in eliminateDeadcode(ReverseDetails) defined // in this file. 
-static c10::optional> build_script_grad( +static std::optional> build_script_grad( Node* node, const ArrayRef& grads) { auto graph = node->owningGraph(); @@ -352,7 +352,7 @@ bool outputRequiresGrad(Value* output) { if (output->type()->castRaw() == nullptr) { return output->requires_grad(); } - c10::optional requiresGrad = + std::optional requiresGrad = output->type()->expectRef().requiresGrad(); if (requiresGrad.has_value()) { return *requiresGrad; diff --git a/torch/csrc/jit/runtime/custom_operator.h b/torch/csrc/jit/runtime/custom_operator.h index 64d514374f58e..faa8c90754a0e 100644 --- a/torch/csrc/jit/runtime/custom_operator.h +++ b/torch/csrc/jit/runtime/custom_operator.h @@ -18,8 +18,8 @@ struct TORCH_API RegisterOperators { /// Registers a vector of already created `Operator`s. /// The operator element is now optional to filter null ops. It's backward /// compatible and works for selective operator registration. - explicit RegisterOperators(std::vector> operators) { - for (c10::optional& o : operators) { + explicit RegisterOperators(std::vector> operators) { + for (std::optional& o : operators) { if (o) { registerOperator(std::move(o.value())); } diff --git a/torch/csrc/jit/runtime/decomposition_registry.cpp b/torch/csrc/jit/runtime/decomposition_registry.cpp index 0c5f5f0876c1b..900ee32746906 100644 --- a/torch/csrc/jit/runtime/decomposition_registry.cpp +++ b/torch/csrc/jit/runtime/decomposition_registry.cpp @@ -107,7 +107,7 @@ void RunDecompositions(std::shared_ptr g) { } } -c10::optional> GetDecomposition( +std::optional> GetDecomposition( const FunctionSchema& schema) { loadDecompositionFunctions(); GRAPH_DEBUG("Trying to find schema: ", schema); @@ -120,7 +120,7 @@ c10::optional> GetDecomposition( return c10::nullopt; } -c10::optional GetDecompositionFunction( +std::optional GetDecompositionFunction( const FunctionSchema& schema) { loadDecompositionFunctions(); auto cache_it = schema_to_function.find(&schema); diff --git a/torch/csrc/jit/runtime/decomposition_registry.h b/torch/csrc/jit/runtime/decomposition_registry.h index 8633609bcf2a8..59f5aa796f76c 100644 --- a/torch/csrc/jit/runtime/decomposition_registry.h +++ b/torch/csrc/jit/runtime/decomposition_registry.h @@ -7,7 +7,7 @@ namespace torch::jit { -TORCH_API c10::optional> GetDecomposition( +TORCH_API std::optional> GetDecomposition( const FunctionSchema& schema); TORCH_API void RegisterDecomposition( @@ -16,7 +16,7 @@ TORCH_API void RegisterDecomposition( TORCH_API void RunDecompositions(std::shared_ptr g); -TORCH_API c10::optional GetDecompositionFunction( +TORCH_API std::optional GetDecompositionFunction( const FunctionSchema& schema); // For invocation in C++, recommended is to assign to static local variable diff --git a/torch/csrc/jit/runtime/graph_executor.cpp b/torch/csrc/jit/runtime/graph_executor.cpp index b1888f6344f18..d46e9028bf0af 100644 --- a/torch/csrc/jit/runtime/graph_executor.cpp +++ b/torch/csrc/jit/runtime/graph_executor.cpp @@ -636,7 +636,7 @@ struct GraphExecutorImpl : public GraphExecutorImplBase { const ExecutionPlan& getPlanFor( Stack& stack, - c10::optional remaining_bailout_depth) override { + std::optional remaining_bailout_depth) override { return getGraphExecutorOptimize() ? 
getOrCompile(stack) : getOrCompileFallback(); } @@ -838,7 +838,7 @@ c10::intrusive_ptr GraphExecutor::runAsync( const ExecutionPlan& GraphExecutor::getPlanFor( Stack& inputs, - c10::optional remaining_bailout_depth) { + std::optional remaining_bailout_depth) { return pImpl->getPlanFor(inputs, remaining_bailout_depth); } diff --git a/torch/csrc/jit/runtime/graph_executor.h b/torch/csrc/jit/runtime/graph_executor.h index d82d69ad5dce5..fce8d4a02e66c 100644 --- a/torch/csrc/jit/runtime/graph_executor.h +++ b/torch/csrc/jit/runtime/graph_executor.h @@ -87,7 +87,7 @@ struct TORCH_API GraphExecutor { // current global fusion strategy settings. const ExecutionPlan& getPlanFor( Stack& inputs, - c10::optional remaining_bailout_depth = c10::nullopt); + std::optional remaining_bailout_depth = c10::nullopt); GraphExecutorState getDebugState(); void debugFlushCompilationCache(); diff --git a/torch/csrc/jit/runtime/graph_executor_impl.h b/torch/csrc/jit/runtime/graph_executor_impl.h index 3aae2eb852796..22a563f00be28 100644 --- a/torch/csrc/jit/runtime/graph_executor_impl.h +++ b/torch/csrc/jit/runtime/graph_executor_impl.h @@ -78,7 +78,7 @@ struct GraphExecutorImplBase { virtual const ExecutionPlan& getPlanFor( Stack& stack, - c10::optional remaining_bailout_depth = c10::nullopt) = 0; + std::optional remaining_bailout_depth = c10::nullopt) = 0; virtual GraphExecutorState getDebugState() = 0; virtual ~GraphExecutorImplBase() = default; diff --git a/torch/csrc/jit/runtime/interpreter.cpp b/torch/csrc/jit/runtime/interpreter.cpp index e5f0f69a45498..18231173dd70e 100644 --- a/torch/csrc/jit/runtime/interpreter.cpp +++ b/torch/csrc/jit/runtime/interpreter.cpp @@ -181,7 +181,7 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target { void callFunction( Function& f, Stack& stack, - c10::optional bailOut = c10::nullopt, + std::optional bailOut = c10::nullopt, bool next = true) { bool newFrame = f.call(stack, bailOut, [&](const Code& code) { enterFrame(code, stack.size() - code.num_inputs()); @@ -882,7 +882,7 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target { // Janky af. 
See https://github.com/pytorch/pytorch/issues/54612 auto* not_implemented_error = dynamic_cast(&e); - c10::optional python_class_name; + std::optional python_class_name; if (jit_exception) { python_class_name = jit_exception->getPythonClassName(); } @@ -913,7 +913,7 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target { const std::exception& e, bool is_jit_exception, c10::NotImplementedError* not_implemented_error, - c10::optional python_class_name) { + std::optional python_class_name) { ExceptionMessage msg(e); std::ostringstream ss; std::string class_name = diff --git a/torch/csrc/jit/runtime/interpreter.h b/torch/csrc/jit/runtime/interpreter.h index e47a581fd5def..a28b1eb93526b 100644 --- a/torch/csrc/jit/runtime/interpreter.h +++ b/torch/csrc/jit/runtime/interpreter.h @@ -124,7 +124,7 @@ struct InterpreterContinuation { InterpreterState state_, Stack stack_, int64_t dist_autograd_context_id = 0, - c10::optional tls_state = c10::nullopt) + std::optional tls_state = c10::nullopt) : state(std::move(state_)), stack(std::move(stack_)), tls_state_(std::move(tls_state)) @@ -140,7 +140,7 @@ struct InterpreterContinuation { private: InterpreterState state; Stack stack; - c10::optional tls_state_ = c10::nullopt; + std::optional tls_state_ = c10::nullopt; #ifdef USE_DISTRIBUTED int64_t dist_autograd_context_id_; #endif diff --git a/torch/csrc/jit/runtime/interpreter/code_impl.h b/torch/csrc/jit/runtime/interpreter/code_impl.h index 98701aa23b365..60948da5a86d6 100644 --- a/torch/csrc/jit/runtime/interpreter/code_impl.h +++ b/torch/csrc/jit/runtime/interpreter/code_impl.h @@ -111,8 +111,8 @@ struct CodeImpl { // It is also very useful for debugging interpreter problems to // keep this around. std::shared_ptr graph_; - c10::optional> grad_executors_; - c10::optional> forward_executors_; + std::optional> grad_executors_; + std::optional> forward_executors_; PreprocessGraph preprocess_; // map from unique of nodes to register in register table diff --git a/torch/csrc/jit/runtime/interpreter/frame.h b/torch/csrc/jit/runtime/interpreter/frame.h index e3de0a02ff7fa..c6873605d0deb 100644 --- a/torch/csrc/jit/runtime/interpreter/frame.h +++ b/torch/csrc/jit/runtime/interpreter/frame.h @@ -26,7 +26,7 @@ struct Frame { size_t base_pointer; // unique to every frame with prim::profile across all threads - c10::optional id; + std::optional id; // RecordFunction object associated with this frame std::unique_ptr record_function; diff --git a/torch/csrc/jit/runtime/jit_exception.cpp b/torch/csrc/jit/runtime/jit_exception.cpp index 809b1b2f5e599..2586f904c9871 100644 --- a/torch/csrc/jit/runtime/jit_exception.cpp +++ b/torch/csrc/jit/runtime/jit_exception.cpp @@ -7,8 +7,8 @@ static thread_local std::string caughtPythonClassName = ""; JITException::JITException( const std::string& msg, - c10::optional python_class_name, - c10::optional original_msg) + std::optional python_class_name, + std::optional original_msg) : std::runtime_error(msg), python_class_name_(std::move(python_class_name)), original_msg_(std::move(original_msg)) {} diff --git a/torch/csrc/jit/runtime/jit_exception.h b/torch/csrc/jit/runtime/jit_exception.h index 728675ed78418..34c3ebd6fca84 100644 --- a/torch/csrc/jit/runtime/jit_exception.h +++ b/torch/csrc/jit/runtime/jit_exception.h @@ -11,17 +11,17 @@ namespace torch::jit { struct TORCH_API JITException : public std::runtime_error { explicit JITException( const std::string& msg, - c10::optional python_class_name = c10::nullopt, - c10::optional original_msg = c10::nullopt); + std::optional 
python_class_name = c10::nullopt, + std::optional original_msg = c10::nullopt); - c10::optional getPythonClassName() const { + std::optional getPythonClassName() const { return python_class_name_; } // the original msg if this is from a python exception. The interpretor has // changed the original message by adding "The following operation failed in // the TorchScript interpreter." in front of it in the handleError function. - c10::optional getOriginalMsg() const { + std::optional getOriginalMsg() const { return original_msg_; } @@ -31,8 +31,8 @@ struct TORCH_API JITException : public std::runtime_error { static void setCaughtPythonClassName(const std::string& pythonClassName); private: - c10::optional python_class_name_; - c10::optional original_msg_; + std::optional python_class_name_; + std::optional original_msg_; }; } // namespace torch::jit diff --git a/torch/csrc/jit/runtime/operator.h b/torch/csrc/jit/runtime/operator.h index bcab476441e29..dbc2638457c05 100644 --- a/torch/csrc/jit/runtime/operator.h +++ b/torch/csrc/jit/runtime/operator.h @@ -67,7 +67,7 @@ struct TORCH_API Operator { }; struct UnparsedFunctionSchema final { std::string schema_string_; - mutable c10::optional alias_analysis_; + mutable std::optional alias_analysis_; }; // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) struct JitOnlyOperator final { @@ -298,16 +298,16 @@ TORCH_API bool aliasAnalysisHasSpecialCaseFor(c10::Symbol sym); // compile-time function for the selective op registration based on schema // string. template -c10::optional OperatorGenerator( +std::optional OperatorGenerator( const char* schema_str, Func&& op, AliasAnalysisKind alias_analysis) { - return c10::optional(Operator( + return std::optional(Operator( std::string(schema_str), std::forward(op), alias_analysis)); } template -c10::optional OperatorGenerator( +std::optional OperatorGenerator( torch::detail::SelectiveStr schema_str, Func&& op, AliasAnalysisKind alias_analysis) { @@ -318,7 +318,7 @@ c10::optional OperatorGenerator( } template -c10::optional OperatorGenerator( +std::optional OperatorGenerator( torch::detail::SelectiveStr schema_str, Func&& op, AliasAnalysisKind alias_analysis) { @@ -326,14 +326,14 @@ c10::optional OperatorGenerator( } template -c10::optional OperatorGenerator( +std::optional OperatorGenerator( const std::string name, const std::string overload_name, const std::vector arguments, const std::vector returns, Func&& op, AliasAnalysisKind alias_analysis) { - return c10::optional(Operator( + return std::optional(Operator( name, overload_name, arguments, diff --git a/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp b/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp index 58d80c48f9c87..48c7a1959ab22 100644 --- a/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp +++ b/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp @@ -118,7 +118,7 @@ static FusionStrategy getInitialStrategy() { } // defer initial value so that we can load in gflags -static c10::optional fusion_strategy = c10::nullopt; +static std::optional fusion_strategy = c10::nullopt; FusionStrategy getFusionStrategy() { std::lock_guard guard(fusion_strategy_lock); @@ -613,7 +613,7 @@ size_t ProfilingGraphExecutorImpl::getInstantiatedBailoutDepth() { const ExecutionPlan& ProfilingGraphExecutorImpl::getOptimizedPlanFor( Stack& stack, - c10::optional remaining_bailout_depth) { + std::optional remaining_bailout_depth) { GRAPH_DEBUG("Running ProfilingGraphExecutorImpl ", this); // TODO: instantiate simple executor when 
getProfilingMode() is false @@ -700,7 +700,7 @@ const ExecutionPlan& ProfilingGraphExecutorImpl::getOptimizedPlanFor( const ExecutionPlan& ProfilingGraphExecutorImpl::getPlanFor( Stack& stack, - c10::optional remaining_bailout_depth) { + std::optional remaining_bailout_depth) { std::lock_guard lock(compile_mutex); // IMPORTANT: This is a hot path of calling a torchscript function. Try not to diff --git a/torch/csrc/jit/runtime/profiling_graph_executor_impl.h b/torch/csrc/jit/runtime/profiling_graph_executor_impl.h index 45da1f030e962..a49ef18e2fa42 100644 --- a/torch/csrc/jit/runtime/profiling_graph_executor_impl.h +++ b/torch/csrc/jit/runtime/profiling_graph_executor_impl.h @@ -18,7 +18,7 @@ struct TORCH_API ProfilingGraphExecutorImpl : public GraphExecutorImplBase { const ExecutionPlan& getPlanFor( Stack& stack, - c10::optional remaining_bailout_depth) override; + std::optional remaining_bailout_depth) override; GraphExecutorState getDebugState() override; ~ProfilingGraphExecutorImpl() override = default; @@ -31,7 +31,7 @@ struct TORCH_API ProfilingGraphExecutorImpl : public GraphExecutorImplBase { private: const ExecutionPlan& getOptimizedPlanFor( Stack& stack, - c10::optional remaining_bailout_depth); + std::optional remaining_bailout_depth); void runProfilingInsensitiveOptimizations(std::shared_ptr& graph); void runProfilingOptimizations( std::shared_ptr& graph, @@ -47,13 +47,13 @@ struct TORCH_API ProfilingGraphExecutorImpl : public GraphExecutorImplBase { void clearTheGraphCompilationIntermediateGraphs(); std::unique_ptr pr_; - c10::optional + std::optional profiling_plan_; // plan to run in order to profiling the code - c10::optional optimized_plan_; + std::optional optimized_plan_; FusionStrategy fusion_strategy_; // this plan is used if getGraphExecutorOptimize is unset - c10::optional fallback_plan_; + std::optional fallback_plan_; // fallback functions are inserted for tensorexpr fusion groups // and by specialize_autogradzero. Whenever, at runtime, input // tensor don't match profiled properties, fallback functions are called @@ -63,7 +63,7 @@ struct TORCH_API ProfilingGraphExecutorImpl : public GraphExecutorImplBase { // They only exist in the optimized graph which is a private property // of the GraphExecutor and only shared with InterpreterState std::vector> fallback_functions_; - c10::optional remaining_bailout_depth_; + std::optional remaining_bailout_depth_; // The time the optimized_plan_ is created. int32_t time_optimized_plan_created_ = 0; // Has the extra memory used by the graph for profiling is released? 
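[Editorial sketch] Among the profiling-executor hunks above, fusion_strategy keeps its "defer initial value so that we can load in gflags" comment: the setting lives in an optional that starts out empty and is filled on first use under a lock. Below is a minimal sketch of that lazy-initialization pattern with invented names; the real code consults the JIT's fusion-strategy flags rather than the stub shown here.

#include <iostream>
#include <mutex>
#include <optional>
#include <string>

namespace {
std::mutex strategy_lock;
std::optional<std::string> cached_strategy;  // nullopt = not decided yet

std::string loadStrategyFromFlags() {
  // Stand-in for consulting flags/environment that may only be parsed
  // after static initialization has finished.
  return "STATIC,2;DYNAMIC,10";
}
} // namespace

std::string getStrategy() {
  std::lock_guard<std::mutex> guard(strategy_lock);
  if (!cached_strategy.has_value()) {
    cached_strategy = loadStrategyFromFlags();  // materialize on first use
  }
  return *cached_strategy;
}

int main() {
  std::cout << getStrategy() << '\n';
}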
diff --git a/torch/csrc/jit/runtime/register_ops_utils.cpp b/torch/csrc/jit/runtime/register_ops_utils.cpp index b926c59e75dee..7335f132dfbf5 100644 --- a/torch/csrc/jit/runtime/register_ops_utils.cpp +++ b/torch/csrc/jit/runtime/register_ops_utils.cpp @@ -403,7 +403,7 @@ void listSetItem(Stack& stack) { at::Generator make_generator_for_device( c10::Device device, - c10::optional seed) { + std::optional seed) { if (device.is_cpu()) { if (seed.has_value()) { return at::detail::createCPUGenerator(seed.value()); diff --git a/torch/csrc/jit/runtime/register_ops_utils.h b/torch/csrc/jit/runtime/register_ops_utils.h index de70cea3a1d50..15e59acb9fe6e 100644 --- a/torch/csrc/jit/runtime/register_ops_utils.h +++ b/torch/csrc/jit/runtime/register_ops_utils.h @@ -879,6 +879,6 @@ struct OperatorGeneratorArgs { TORCH_API at::Generator make_generator_for_device( c10::Device device, - c10::optional seed = c10::nullopt); + std::optional seed = c10::nullopt); } // namespace torch::jit diff --git a/torch/csrc/jit/runtime/register_prim_ops.cpp b/torch/csrc/jit/runtime/register_prim_ops.cpp index ee1c0c9e29ef8..bb9c08465c0ae 100644 --- a/torch/csrc/jit/runtime/register_prim_ops.cpp +++ b/torch/csrc/jit/runtime/register_prim_ops.cpp @@ -34,8 +34,8 @@ namespace { std::string stringSlice( std::string string, - c10::optional start, - c10::optional end, + std::optional start, + std::optional end, int64_t step) { int64_t start_val = start.has_value() ? start.value() : INT64_MAX; int64_t end_val = end.has_value() ? end.value() : INT64_MAX; @@ -1167,7 +1167,7 @@ static const std::vector opGenArgs{ "aten::index.Tensor_hacked_twin(Tensor self, Tensor[] indices) -> Tensor"), [](Stack& stack) { auto indices = pop(stack).to>(); - c10::List> opt_list_indices; + c10::List> opt_list_indices; opt_list_indices.reserve(indices.size()); for (const auto& ten : indices) { opt_list_indices.push_back(ten); @@ -1182,7 +1182,7 @@ static const std::vector opGenArgs{ "aten::_unsafe_index.Tensor_hacked_twin(Tensor self, Tensor[] indices) -> Tensor"), [](Stack& stack) { auto indices = pop(stack).to>(); - c10::List> opt_list_indices; + c10::List> opt_list_indices; opt_list_indices.reserve(indices.size()); for (const auto& ten : indices) { opt_list_indices.push_back(ten); @@ -1200,7 +1200,7 @@ static const std::vector opGenArgs{ auto accumulate = pop(stack).toBool(); auto values = pop(stack).toTensor(); auto indices = pop(stack).to>(); - c10::List> opt_list_indices; + c10::List> opt_list_indices; opt_list_indices.reserve(indices.size()); for (const auto& ten : indices) { opt_list_indices.push_back(ten); @@ -1218,7 +1218,7 @@ static const std::vector opGenArgs{ auto accumulate = pop(stack).toBool(); auto values = pop(stack).toTensor(); auto indices = pop(stack).to>(); - c10::List> opt_list_indices; + c10::List> opt_list_indices; opt_list_indices.reserve(indices.size()); for (const auto& ten : indices) { opt_list_indices.push_back(ten); @@ -1236,7 +1236,7 @@ static const std::vector opGenArgs{ auto accumulate = pop(stack).toBool(); auto values = pop(stack).toTensor(); auto indices = pop(stack).to>(); - c10::List> opt_list_indices; + c10::List> opt_list_indices; opt_list_indices.reserve(indices.size()); for (const auto& ten : indices) { opt_list_indices.push_back(ten); @@ -1254,7 +1254,7 @@ static const std::vector opGenArgs{ auto accumulate = pop(stack).toBool(); auto values = pop(stack).toTensor(); auto indices = pop(stack).to>(); - c10::List> opt_list_indices; + c10::List> opt_list_indices; opt_list_indices.reserve(indices.size()); for 
(const auto& ten : indices) { opt_list_indices.push_back(ten); @@ -1275,9 +1275,9 @@ static const std::vector opGenArgs{ // NOLINTNEXTLINE(cppcoreguidelines-init-variables) bool copy; pop(stack, non_blocking, copy); - c10::optional scalarType = + std::optional scalarType = pop(stack).toOptional(); - c10::optional device = + std::optional device = pop(stack).toOptional(); at::Tensor self = pop(stack).toTensor(); push( @@ -1404,9 +1404,9 @@ static const std::vector opGenArgs{ } }))}; -static std::vector> createOperators( +static std::vector> createOperators( const std::vector& args) { - std::vector> result; + std::vector> result; result.reserve(args.size()); for (const auto& arg : args) { if (arg.schema_str) { @@ -1769,8 +1769,8 @@ static const std::vector stringOpGenArgs{ "aten::slice.str(str string, int? start=None, int? end=None, int step=1) -> str"), [](Stack& stack) { int64_t step = pop(stack).toInt(); - c10::optional end = pop(stack).toOptional(); - c10::optional start = pop(stack).toOptional(); + std::optional end = pop(stack).toOptional(); + std::optional start = pop(stack).toOptional(); std::string string = pop(stack).toStringRef(); push(stack, stringSlice(string, start, end, step)); }, @@ -2397,7 +2397,7 @@ static const std::vector stringOpGenArgs{ for (const auto& v : ivalues) { values.emplace_back(v.toStringRef()); } - c10::optional opt_string = + std::optional opt_string = pop(stack).toOptional(); const std::string& string = opt_string.value_or(""); std::stringstream ss; @@ -2463,8 +2463,8 @@ static const std::vector opGenArgs1{ // NOLINTNEXTLINE(cppcoreguidelines-init-variables) bool copy; pop(stack, self, non_blocking, copy); - c10::optional device = c10::nullopt; - c10::optional scalarType = c10::nullopt; + std::optional device = c10::nullopt; + std::optional scalarType = c10::nullopt; push( stack, to_dispatch(self, device, scalarType, non_blocking, copy)); }, diff --git a/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp b/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp index d48a981666c83..4359b852b6a38 100644 --- a/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp +++ b/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp @@ -427,8 +427,8 @@ at::Tensor interpolate( const IValue& size, const IValue& scale_factors, const std::string& mode, - c10::optional align_corners, - c10::optional recompute_scale_factor) { + std::optional align_corners, + std::optional recompute_scale_factor) { if ((mode == "nearest" || mode == "area")) { if (align_corners != c10::nullopt) { throw std::runtime_error( diff --git a/torch/csrc/jit/runtime/register_special_ops.cpp b/torch/csrc/jit/runtime/register_special_ops.cpp index 5e33d8cf27d39..5b8c70c404ae9 100644 --- a/torch/csrc/jit/runtime/register_special_ops.cpp +++ b/torch/csrc/jit/runtime/register_special_ops.cpp @@ -406,7 +406,7 @@ RegisterOperators reg({ double a; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) double b; - c10::optional generator = + std::optional generator = pop(stack).toOptional(); pop(stack, tensor, a, b); @@ -425,7 +425,7 @@ RegisterOperators reg({ double mean; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) double std; - c10::optional generator = + std::optional generator = pop(stack).toOptional(); pop(stack, tensor, mean, std); diff --git a/torch/csrc/jit/runtime/simple_graph_executor_impl.cpp b/torch/csrc/jit/runtime/simple_graph_executor_impl.cpp index 742915995469e..c1dbbddc6d337 100644 --- a/torch/csrc/jit/runtime/simple_graph_executor_impl.cpp +++ 
b/torch/csrc/jit/runtime/simple_graph_executor_impl.cpp @@ -13,7 +13,7 @@ SimpleGraphExecutorImpl::SimpleGraphExecutorImpl( const ExecutionPlan& SimpleGraphExecutorImpl::getPlanFor( Stack& stack, - c10::optional remaining_bailout_depth) { + std::optional remaining_bailout_depth) { std::lock_guard lock(compile_mutex); // IMPORTANT: This is a hot path of calling a torchscript function. Try not to diff --git a/torch/csrc/jit/runtime/simple_graph_executor_impl.h b/torch/csrc/jit/runtime/simple_graph_executor_impl.h index 34272000f0d1a..e1ebed46ede80 100644 --- a/torch/csrc/jit/runtime/simple_graph_executor_impl.h +++ b/torch/csrc/jit/runtime/simple_graph_executor_impl.h @@ -12,12 +12,12 @@ struct TORCH_API SimpleGraphExecutorImpl : public GraphExecutorImplBase { const ExecutionPlan& getPlanFor( Stack& stack, - c10::optional remaining_bailout_depth) override; + std::optional remaining_bailout_depth) override; GraphExecutorState getDebugState() override; ~SimpleGraphExecutorImpl() override = default; private: - c10::optional execution_plan_; + std::optional execution_plan_; }; } // namespace torch::jit diff --git a/torch/csrc/jit/runtime/static/fusion.cpp b/torch/csrc/jit/runtime/static/fusion.cpp index 5ba3b1a0268f2..ffac37efc9b76 100644 --- a/torch/csrc/jit/runtime/static/fusion.cpp +++ b/torch/csrc/jit/runtime/static/fusion.cpp @@ -168,7 +168,7 @@ static void debugDumpFusionGroup(const std::string& msg, Node* n) { } } -static c10::optional tryMerge( +static std::optional tryMerge( Node* fusion_group, Node* to_merge, AliasDb* aliasDb) { diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp index 9f62d631bce88..193675672f6b8 100644 --- a/torch/csrc/jit/runtime/static/impl.cpp +++ b/torch/csrc/jit/runtime/static/impl.cpp @@ -286,7 +286,7 @@ void PrepareGraphForStaticModule( ForceNonEmptyOutputs(*graph); } -std::pair, c10::optional> PrepareForStaticModule( +std::pair, std::optional> PrepareForStaticModule( const torch::jit::Module& m, bool is_frozen, const StaticModuleOptions& opts, @@ -316,7 +316,7 @@ std::pair, c10::optional> PrepareForStaticModule( return std::make_pair(graph, module); } -std::pair, c10::optional> PrepareForStaticModule( +std::pair, std::optional> PrepareForStaticModule( std::shared_ptr graph, const StaticModuleOptions& opts, std::vector sample_inputs) { @@ -544,7 +544,7 @@ StaticModule::StaticModule( opts) {} StaticModule::StaticModule( - std::pair, c10::optional> + std::pair, std::optional> graph_and_module, const StaticModuleOptions& opts) : opts_(opts), diff --git a/torch/csrc/jit/runtime/static/impl.h b/torch/csrc/jit/runtime/static/impl.h index 48af8ef02afbf..2e840e582a0a1 100644 --- a/torch/csrc/jit/runtime/static/impl.h +++ b/torch/csrc/jit/runtime/static/impl.h @@ -417,7 +417,7 @@ class TORCH_API StaticModule { private: explicit StaticModule( - std::pair, c10::optional> + std::pair, std::optional> graph_and_module, const StaticModuleOptions& opts); @@ -490,7 +490,7 @@ class TORCH_API StaticModule { C10_NODISCARD Node* findNodeWithKindForTesting(const std::string& kind) const; - const c10::optional& schema() const { + const std::optional& schema() const { return schema_; } @@ -539,8 +539,8 @@ class TORCH_API StaticModule { // metadata that is stored in IR nodes as attribute at::intrusive_ptr sr_metadata_; std::shared_ptr graph_; - c10::optional module_; - c10::optional schema_; + std::optional module_; + std::optional schema_; std::unique_ptr cached_runtime_; // Bookkeeping for creating new StaticRuntime instances diff --git 
a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index b4f4c38c2aaf5..b1b8a081c4ce6 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -209,7 +209,7 @@ at::Tensor& to_copy_out( const Tensor& self, bool non_blocking, bool copy_strides, - c10::optional memory_format) { + std::optional memory_format) { if (copy_strides) { at::native::resize_impl_cpu_( out.unsafeGetTensorImpl(), self.sizes(), self.strides()); @@ -259,7 +259,7 @@ static Tensor& linear_out( Tensor& output, const Tensor& input, const Tensor& weight, - const c10::optional& bias_opt) { + const std::optional& bias_opt) { TORCH_CHECK(!input.is_mkldnn()); auto bias = bias_opt.has_value() @@ -1048,7 +1048,7 @@ REGISTER_OPERATOR_FUNCTOR(aten::logit, aten_logit, [](Node* n) -> SROperator { LogAndDumpSchema(n); return nullptr; } - c10::optional clamp = c10::nullopt; + std::optional clamp = c10::nullopt; if (n->inputs()[1]->node()->kind() == prim::Constant) { auto clamp_d = toIValue(n->inputs()[1])->toOptional(); clamp = clamp_d @@ -1353,10 +1353,10 @@ namespace { // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) struct ToArgs { - c10::optional dtype; + std::optional dtype; c10::Layout layout; bool know_to_will_alias = false; - c10::optional memory_format; + std::optional memory_format; }; template @@ -1440,8 +1440,8 @@ C10_ALWAYS_INLINE void to_copy_functor_impl( // handle memory format bool copy_strides = false; - c10::optional memory_format = c10::MemoryFormat::Preserve; - c10::optional my_args; + std::optional memory_format = c10::MemoryFormat::Preserve; + std::optional my_args; if (!args) { my_args = extract_to_args< has_constant_non_tensor_dtype_and_flags, @@ -1905,7 +1905,7 @@ REGISTER_OPERATOR_FUNCTOR(aten::div, aten_div, [](Node* n) -> SROperator { return [te = createDiv()](ProcessedNode* p_node) { const auto& in0_t = p_node->Input(0).toTensor(); - c10::optional rounding_mode = c10::nullopt; + std::optional rounding_mode = c10::nullopt; if (p_node->num_inputs() > 2) { rounding_mode = p_node->Input(2).toOptional(); } @@ -2396,8 +2396,8 @@ REGISTER_OPERATOR_FUNCTOR( // device & pin_memory matter only when CUDA is enabled. 
static bool hasTensorWithOptions( const IValue& ivalue, - c10::optional dtype, - c10::optional layout) { + std::optional dtype, + std::optional layout) { if (!ivalue.isTensor()) { return false; } @@ -2412,9 +2412,9 @@ static bool hasTensorWithOptions( static bool hasTensorWithOptions( const IValue& ivalue, - c10::optional dtype, - c10::optional layout, - c10::optional memory_format) { + std::optional dtype, + std::optional layout, + std::optional memory_format) { return hasTensorWithOptions(ivalue, dtype, layout) && (memory_format == ivalue.toTensor().options().memory_format_opt()); } diff --git a/torch/csrc/jit/runtime/static/ops.h b/torch/csrc/jit/runtime/static/ops.h index 53aa0dc787d1b..362837e7ce78f 100644 --- a/torch/csrc/jit/runtime/static/ops.h +++ b/torch/csrc/jit/runtime/static/ops.h @@ -15,7 +15,7 @@ at::Tensor& to_copy_out( const Tensor& self, bool non_blocking, bool copy_strides, - c10::optional memory_format); + std::optional memory_format); } // namespace at::native namespace torch::jit { diff --git a/torch/csrc/jit/runtime/symbolic_script.cpp b/torch/csrc/jit/runtime/symbolic_script.cpp index ff8513f016daf..6aa65c528a42b 100644 --- a/torch/csrc/jit/runtime/symbolic_script.cpp +++ b/torch/csrc/jit/runtime/symbolic_script.cpp @@ -1614,7 +1614,7 @@ static void loadFunctions() { loadModule(compilation_unit); } -c10::optional gradientInfoForSchema( +std::optional gradientInfoForSchema( const FunctionSchema& schema) { std::lock_guard guard(lock); if (schema_to_graphs.empty()) { diff --git a/torch/csrc/jit/runtime/symbolic_script.h b/torch/csrc/jit/runtime/symbolic_script.h index 64e0d6661baeb..271bf66916f3d 100644 --- a/torch/csrc/jit/runtime/symbolic_script.h +++ b/torch/csrc/jit/runtime/symbolic_script.h @@ -12,7 +12,7 @@ struct GradientPair { std::shared_ptr backward; }; -TORCH_API c10::optional gradientInfoForSchema( +TORCH_API std::optional gradientInfoForSchema( const FunctionSchema& schema); TORCH_API bool hasGradientInfoForSchema(const FunctionSchema& schema); } // namespace torch::jit diff --git a/torch/csrc/jit/runtime/symbolic_shape_registry.cpp b/torch/csrc/jit/runtime/symbolic_shape_registry.cpp index 5e380c1f437a7..ddea031aba73c 100644 --- a/torch/csrc/jit/runtime/symbolic_shape_registry.cpp +++ b/torch/csrc/jit/runtime/symbolic_shape_registry.cpp @@ -377,7 +377,7 @@ void loadFunctions() { } } // anonymous namespace -c10::optional> shapeComputeGraphForSchema( +std::optional> shapeComputeGraphForSchema( const FunctionSchema& schema) { std::lock_guard guard(lock); if (cached_schema_to_graph.empty()) { @@ -394,7 +394,7 @@ c10::optional> shapeComputeGraphForSchema( return c10::nullopt; } -TORCH_API c10::optional boundedGraphsForSchema( +TORCH_API std::optional boundedGraphsForSchema( const FunctionSchema& schema) { std::lock_guard guard(lock); if (cached_bounded_schema_to_graph.empty()) { diff --git a/torch/csrc/jit/runtime/symbolic_shape_registry.h b/torch/csrc/jit/runtime/symbolic_shape_registry.h index 2d09eb27876b7..a14d327aab429 100644 --- a/torch/csrc/jit/runtime/symbolic_shape_registry.h +++ b/torch/csrc/jit/runtime/symbolic_shape_registry.h @@ -54,10 +54,10 @@ TORCH_API void RegisterShapeComputeGraphForSchema( const FunctionSchema& schema, std::shared_ptr g); -TORCH_API c10::optional> shapeComputeGraphForSchema( +TORCH_API std::optional> shapeComputeGraphForSchema( const FunctionSchema& schema); -TORCH_API c10::optional boundedGraphsForSchema( +TORCH_API std::optional boundedGraphsForSchema( const FunctionSchema& schema); TORCH_API std::vector 
RegisteredShapeComputeSchemas(); diff --git a/torch/csrc/jit/serialization/callstack_debug_info_serialization.cpp b/torch/csrc/jit/serialization/callstack_debug_info_serialization.cpp index 7674c5324ce9f..4a326285b2974 100644 --- a/torch/csrc/jit/serialization/callstack_debug_info_serialization.cpp +++ b/torch/csrc/jit/serialization/callstack_debug_info_serialization.cpp @@ -59,7 +59,7 @@ c10::IValue InlinedCallStackSerializer::serialize( } c10::IValue InlinedCallStackSerializer::serialize_module_instance_info( - const c10::optional& m) { + const std::optional& m) { if (!m) { return c10::IValue(); } @@ -168,7 +168,7 @@ InlinedCallStackPtr InlinedCallStackDeserializer::deserialize( return cs_ptr; } -c10::optional InlinedCallStackDeserializer:: +std::optional InlinedCallStackDeserializer:: deserialize_module_instance_info( const c10::IValue& iv, const std::shared_ptr& cu) { diff --git a/torch/csrc/jit/serialization/callstack_debug_info_serialization.h b/torch/csrc/jit/serialization/callstack_debug_info_serialization.h index ac1bdf8d3b1d8..46fd2850d20bd 100644 --- a/torch/csrc/jit/serialization/callstack_debug_info_serialization.h +++ b/torch/csrc/jit/serialization/callstack_debug_info_serialization.h @@ -32,7 +32,7 @@ class InlinedCallStackSerializer { private: // module_info = [ClassType.qualifiedName, instance_name] c10::IValue serialize_module_instance_info( - const c10::optional& m); + const std::optional& m); // This caches serialized inlined callstack ptr, since many // InlinedCallStackPtr can refer to the same one. @@ -64,7 +64,7 @@ class InlinedCallStackDeserializer { const std::shared_ptr& cu); private: - c10::optional deserialize_module_instance_info( + std::optional deserialize_module_instance_info( const c10::IValue& iv, const std::shared_ptr& cu); diff --git a/torch/csrc/jit/serialization/export.cpp b/torch/csrc/jit/serialization/export.cpp index c23e3b52bfb1b..6ef9bdbf4abfa 100644 --- a/torch/csrc/jit/serialization/export.cpp +++ b/torch/csrc/jit/serialization/export.cpp @@ -145,26 +145,6 @@ void validateBlock( "\n\nDefined at:\n" + getNodeStackTraceString(node)) } } else { -#ifdef BUILD_CAFFE2 - // Assuming this is a Caffe2 change as it only modifies an aten op - // for operator_export_type == ONNX_ATEN_FALLBACK, which is a common - // pattern for Caffe2-specific scenarios. 
- if (node->kind() == aten::expand) { - if (operator_export_type == - onnx_torch::OperatorExportTypes::ONNX_ATEN_FALLBACK) { - WithInsertPoint guard(node); - auto* new_node = - b->owningGraph()->insertNode(b->owningGraph()->create( - Symbol(::c10::aten::ATen), - node->inputs(), - node->outputs().size())); - for (size_t i = 0; i < node->outputs().size(); ++i) { - node->output(i)->replaceAllUsesWith(new_node->output(i)); - } - new_node->s_(Symbol::fromQualString("attr::operator"), "expand"); - } - } -#endif if (node->kind() == prim::PackPadded || node->kind() == prim::PadPacked) { if (operator_export_type != onnx_torch::OperatorExportTypes::ONNX_FALLTHROUGH) { @@ -209,7 +189,7 @@ std::string GetFileRootPath(const std::string& rootPath) { } std::string GetExternalFileName( - const c10::optional& external_ref) { + const std::optional& external_ref) { auto tensorName = external_ref.value(); const std::string illegalChars = "\\/:?\"<>|"; for (char& i : tensorName) { @@ -363,7 +343,7 @@ class GraphEncoder { void EncodeTensor( onnx::TensorProto* tensor_proto, const at::Tensor& tensor, - const c10::optional external_ref = {}, + const std::optional external_ref = {}, const bool use_external_data_format = false, const std::string& onnx_file_path = std::string()); @@ -1300,7 +1280,7 @@ void GraphEncoder::EncodeTypeProto( void GraphEncoder::EncodeTensor( onnx::TensorProto* tensor_proto, const at::Tensor& tensor, - const c10::optional external_ref, + const std::optional external_ref, const bool use_external_data_format, const std::string& onnx_file_path) { for (auto d : tensor.sizes()) { diff --git a/torch/csrc/jit/serialization/export.h b/torch/csrc/jit/serialization/export.h index 3a56cfc7788fb..9a7ab2c4fcc87 100644 --- a/torch/csrc/jit/serialization/export.h +++ b/torch/csrc/jit/serialization/export.h @@ -30,6 +30,7 @@ namespace jit { using RawDataExportMap = std::unordered_map; using SymbolDimMap = std::map; +using DimSymbolMap = std::map; using NodeNameMap = std::unordered_map; diff --git a/torch/csrc/jit/serialization/export_bytecode.cpp b/torch/csrc/jit/serialization/export_bytecode.cpp index 9ec2dbcaa2da3..9f194cd0ad31b 100644 --- a/torch/csrc/jit/serialization/export_bytecode.cpp +++ b/torch/csrc/jit/serialization/export_bytecode.cpp @@ -166,7 +166,7 @@ mobile::Code compileGraphToMobileCode( // and is not allowed. For an operator with num_args = -1, it means the // number of arguments is not available for this operator, we don't do any // backward compatibility adaptation at runtime. 
- c10::optional num_args = c10::nullopt; + std::optional num_args = c10::nullopt; auto it = op_to_specified_args.find(unique_name); if (it != op_to_specified_args.end()) { num_args = it->second; diff --git a/torch/csrc/jit/serialization/export_module.cpp b/torch/csrc/jit/serialization/export_module.cpp index cdb878d4062c8..5bd7714c4e8d2 100644 --- a/torch/csrc/jit/serialization/export_module.cpp +++ b/torch/csrc/jit/serialization/export_module.cpp @@ -254,7 +254,7 @@ std::pair getFunctionTuple( // schema const auto& schema = func.getSchema(); - auto type_printer = [&](const c10::Type& t) -> c10::optional { + auto type_printer = [&](const c10::Type& t) -> std::optional { auto namedType = t.cast(); if (namedType && namedType->name()) { return type_name_uniquer_.getUniqueName(namedType).qualifiedName(); @@ -313,7 +313,7 @@ std::pair getFunctionTuple( } auto bytecode_vals = to_tuple({qn, codeTable, schemaTable}); - c10::optional debug_info_vals; + std::optional debug_info_vals; // module debug info // This is just a set of debug handles. // We always save debug handles. @@ -754,7 +754,7 @@ void ScriptModuleSerializer::writeByteCode( namespace { -c10::optional type_printer( +std::optional type_printer( const c10::Type& type, torch::jit::TypeNameUniquer& type_name_uniquer) { if (auto dyn = type.castRaw()) { diff --git a/torch/csrc/jit/serialization/flatbuffer_serializer.cpp b/torch/csrc/jit/serialization/flatbuffer_serializer.cpp index a3dada3c715f0..5a47fe900f3fd 100644 --- a/torch/csrc/jit/serialization/flatbuffer_serializer.cpp +++ b/torch/csrc/jit/serialization/flatbuffer_serializer.cpp @@ -61,7 +61,7 @@ static TypePtr realType(TypePtr type) { } } -auto print_type(const c10::Type& t) -> c10::optional { +auto print_type(const c10::Type& t) -> std::optional { auto namedType = t.cast(); if (namedType && namedType->name()) { return namedType->name().value().qualifiedName(); @@ -298,7 +298,7 @@ flatbuffers::Offset FlatbufferSerializer:: auto register_size = static_cast(code.register_size_); // schema - auto type_printer = [&](const c10::Type& t) -> c10::optional { + auto type_printer = [&](const c10::Type& t) -> std::optional { auto namedType = t.cast(); if (namedType && namedType->name()) { return namedType->name().value().qualifiedName(); diff --git a/torch/csrc/jit/serialization/import.cpp b/torch/csrc/jit/serialization/import.cpp index e724853e70c1c..40d155e61c758 100644 --- a/torch/csrc/jit/serialization/import.cpp +++ b/torch/csrc/jit/serialization/import.cpp @@ -152,7 +152,7 @@ class ScriptModuleDeserializer final { reader_->version()) {} Module deserialize( - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, bool restore_shapes = false); @@ -162,7 +162,7 @@ class ScriptModuleDeserializer final { std::shared_ptr compilation_unit_; std::shared_ptr reader_; std::shared_ptr storage_context_; - c10::optional device_; + std::optional device_; std::vector constants_table_; std::string code_prefix_; std::string pickle_dir_prefix_; @@ -248,7 +248,7 @@ graph(%x, %packed_params, %stride, %padding, %dilation, %groups, %r_scale, %r_ze } Module ScriptModuleDeserializer::deserialize( - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, bool restore_shapes) { // we populate the upgraders map before any load starts @@ -311,7 +311,7 @@ Module ScriptModuleDeserializer::deserialize( Module import_ir_module( std::shared_ptr cu, std::istream& in, - c10::optional device, + std::optional device, bool load_debug_files) { ExtraFilesMap extra_files; return 
import_ir_module( @@ -322,7 +322,7 @@ static Module _load_jit_module_from_bytes( std::shared_ptr data, size_t size, std::shared_ptr cu, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, bool restore_shapes); @@ -330,7 +330,7 @@ Module parse_and_initialize_jit_module( std::shared_ptr data, size_t size, ExtraFilesMap& extra_files, - c10::optional device) { + std::optional device) { populate_upgraders_graph_map(); ExtraFilesMap jit_files; std::vector jit_constants; @@ -349,7 +349,7 @@ Module parse_and_initialize_jit_module( Module load_jit_module_from_file( const std::string& filename, ExtraFilesMap& extra_files, - c10::optional device) { + std::optional device) { auto data = get_file_content(filename.c_str()); return parse_and_initialize_jit_module( std::move(std::get<0>(data)), std::get<1>(data), extra_files, device); @@ -358,7 +358,7 @@ Module load_jit_module_from_file( Module load_jit_module_from_stream( std::istream& in, ExtraFilesMap& extra_files, - c10::optional device) { + std::optional device) { auto data = get_stream_content(in); return parse_and_initialize_jit_module( std::move(std::get<0>(data)), std::get<1>(data), extra_files, device); @@ -367,7 +367,7 @@ Module load_jit_module_from_stream( Module import_ir_module( std::shared_ptr cu, std::istream& in, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, bool load_debug_files, bool restore_shapes) { @@ -390,7 +390,7 @@ Module import_ir_module( std::shared_ptr cu, std::shared_ptr reader, std::shared_ptr storage_context, - c10::optional device, + std::optional device, std::string ts_id) { ScriptModuleDeserializer deserializer( std::move(cu), @@ -405,7 +405,7 @@ Module import_ir_module( Module import_ir_module( std::shared_ptr cu, const std::string& filename, - c10::optional device, + std::optional device, bool load_debug_files) { ExtraFilesMap extra_files; return import_ir_module( @@ -415,7 +415,7 @@ Module import_ir_module( Module import_ir_module( std::shared_ptr cu, const std::string& filename, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, bool load_debug_files, bool restore_shapes) { @@ -435,7 +435,7 @@ Module import_ir_module( Module import_ir_module( std::shared_ptr cu, std::unique_ptr rai, - c10::optional device, + std::optional device, bool load_debug_files) { ExtraFilesMap extra_files; return import_ir_module( @@ -445,7 +445,7 @@ Module import_ir_module( Module import_ir_module( std::shared_ptr cu, std::unique_ptr rai, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, bool load_debug_files) { std::shared_ptr rai_shared = std::move(rai); @@ -456,7 +456,7 @@ Module import_ir_module( Module import_ir_module( std::shared_ptr cu, std::shared_ptr rai, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, bool load_debug_files) { auto reader = std::make_shared(std::move(rai)); @@ -467,7 +467,7 @@ Module import_ir_module( Module load( std::istream& in, - c10::optional device, + std::optional device, bool load_debug_files) { auto cu = std::make_shared(); return import_ir_module(std::move(cu), in, device, load_debug_files); @@ -475,7 +475,7 @@ Module load( Module load( std::istream& in, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, bool load_debug_files) { auto cu = std::make_shared(); @@ -485,7 +485,7 @@ Module load( Module load( const std::string& filename, - c10::optional device, + std::optional device, bool load_debug_files) { auto cu = std::make_shared(); return 
import_ir_module(std::move(cu), filename, device, load_debug_files); @@ -493,7 +493,7 @@ Module load( Module load( const std::string& filename, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, bool load_debug_files) { auto cu = std::make_shared(); @@ -503,7 +503,7 @@ Module load( Module load( std::shared_ptr rai, - c10::optional device, + std::optional device, bool load_debug_files) { auto cu = std::make_shared(); ExtraFilesMap extra_files; @@ -513,7 +513,7 @@ Module load( Module load( std::shared_ptr rai, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, bool load_debug_files) { auto cu = std::make_shared(); @@ -525,7 +525,7 @@ Module _load_jit_module_from_bytes( std::shared_ptr data, size_t size, std::shared_ptr cu, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, bool restore_shapes) { TORCH_CHECK(size >= kFileFormatHeaderSize, "Unrecognized data format"); diff --git a/torch/csrc/jit/serialization/import.h b/torch/csrc/jit/serialization/import.h index c8379f38810f7..b090a1c80a3cd 100644 --- a/torch/csrc/jit/serialization/import.h +++ b/torch/csrc/jit/serialization/import.h @@ -21,25 +21,25 @@ class DeserializationStorageContext; TORCH_API Module import_ir_module( std::shared_ptr cu, const std::string& filename, - c10::optional device = c10::nullopt, + std::optional device = c10::nullopt, bool load_debug_files = true); TORCH_API Module import_ir_module( std::shared_ptr cu, std::istream& in, - c10::optional device = c10::nullopt, + std::optional device = c10::nullopt, bool load_debug_files = true); TORCH_API Module import_ir_module( std::shared_ptr cu, std::unique_ptr rai, - c10::optional device = c10::nullopt, + std::optional device = c10::nullopt, bool load_debug_files = true); TORCH_API Module import_ir_module( std::shared_ptr cu, const std::string& filename, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, bool load_debug_files = true, bool restore_shapes = false); @@ -49,13 +49,13 @@ TORCH_API Module import_ir_module( std::shared_ptr cu, std::shared_ptr reader, std::shared_ptr storage_context, - c10::optional device, + std::optional device, std::string ts_id /* torchscript identifier inside package */); TORCH_API Module import_ir_module( std::shared_ptr cu, std::istream& in, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, bool load_debug_files = true, bool restore_shapes = false); @@ -63,14 +63,14 @@ TORCH_API Module import_ir_module( TORCH_API Module import_ir_module( std::shared_ptr cu, std::unique_ptr rai, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, bool load_debug_files = true); TORCH_API Module import_ir_module( std::shared_ptr cu, std::shared_ptr rai, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, bool load_debug_files = true); @@ -80,12 +80,12 @@ TORCH_API Module import_ir_module( /// `torch::jit::ExportModule` in C++. TORCH_API Module load( std::istream& in, - c10::optional device = c10::nullopt, + std::optional device = c10::nullopt, bool load_debug_files = true); TORCH_API Module load( std::istream& in, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, bool load_debug_files = true); @@ -96,12 +96,12 @@ TORCH_API Module load( /// Python or `torch::jit::ExportModule` in C++. 
TORCH_API Module load( const std::string& filename, - c10::optional device = c10::nullopt, + std::optional device = c10::nullopt, bool load_debug_files = true); TORCH_API Module load( const std::string& filename, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, bool load_debug_files = true); @@ -112,12 +112,12 @@ TORCH_API Module load( /// Python or `torch::jit::ExportModule` in C++. TORCH_API Module load( std::shared_ptr rai, - c10::optional device = c10::nullopt, + std::optional device = c10::nullopt, bool load_debug_files = true); TORCH_API Module load( std::shared_ptr rai, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, bool load_debug_files = true); @@ -131,23 +131,23 @@ TORCH_API Module parse_and_initialize_jit_module( std::shared_ptr data, size_t size, ExtraFilesMap& extra_files, - c10::optional device = c10::nullopt); + std::optional device = c10::nullopt); TORCH_API Module load_jit_module_from_file( const std::string& filename, ExtraFilesMap& extra_files, - c10::optional device = c10::nullopt); + std::optional device = c10::nullopt); TORCH_API Module load_jit_module_from_stream( std::istream& in, ExtraFilesMap& extra_files, - c10::optional device = c10::nullopt); + std::optional device = c10::nullopt); TORCH_API Module parse_and_initialize_jit_module( std::shared_ptr data, size_t size, ExtraFilesMap& extra_files, - c10::optional device); + std::optional device); TORCH_API c10::intrusive_ptr ObjLoaderFunc( const at::StrongTypePtr& type, diff --git a/torch/csrc/jit/serialization/import_legacy.cpp b/torch/csrc/jit/serialization/import_legacy.cpp index 85ec2675a9c23..d7c592d18c72f 100644 --- a/torch/csrc/jit/serialization/import_legacy.cpp +++ b/torch/csrc/jit/serialization/import_legacy.cpp @@ -41,7 +41,7 @@ class ScriptModuleDeserializer final { ScriptModuleDeserializer( std::shared_ptr cu, std::shared_ptr reader, - const c10::optional& device) + const std::optional& device) : compilation_unit_(std::move(cu)), reader_(std::move(reader)), device_(device), @@ -77,7 +77,7 @@ class ScriptModuleDeserializer final { std::shared_ptr compilation_unit_; std::shared_ptr reader_; - c10::optional device_; + std::optional device_; // Legacy only tensor can be a constant. 
std::vector constant_table_; std::vector tensor_table_; @@ -377,7 +377,7 @@ Module ScriptModuleDeserializer::LEGACY_convertModule( Module LEGACY_deserialize( std::shared_ptr cu, std::shared_ptr reader, - const c10::optional& device) { + const std::optional& device) { ScriptModuleDeserializer deserializer( std::move(cu), std::move(reader), device); return deserializer.LEGACY_deserialize(); diff --git a/torch/csrc/jit/serialization/import_legacy.h b/torch/csrc/jit/serialization/import_legacy.h index a261828109596..2e206eae09bcf 100644 --- a/torch/csrc/jit/serialization/import_legacy.h +++ b/torch/csrc/jit/serialization/import_legacy.h @@ -17,7 +17,7 @@ struct CompilationUnit; Module LEGACY_deserialize( std::shared_ptr cu, std::shared_ptr reader, - const c10::optional& device); + const std::optional& device); } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/serialization/import_read.cpp b/torch/csrc/jit/serialization/import_read.cpp index 533fed491773f..eeaa79c856627 100644 --- a/torch/csrc/jit/serialization/import_read.cpp +++ b/torch/csrc/jit/serialization/import_read.cpp @@ -7,9 +7,9 @@ IValue readArchiveAndTensors( const std::string& archive_name, const std::string& pickle_prefix, const std::string& tensor_prefix, - c10::optional type_resolver, - c10::optional obj_loader, - c10::optional device, + std::optional type_resolver, + std::optional obj_loader, + std::optional device, caffe2::serialize::PyTorchStreamReader& stream_reader, c10::TypePtr (*type_parser)(const std::string&), std::shared_ptr storage_context) { diff --git a/torch/csrc/jit/serialization/import_read.h b/torch/csrc/jit/serialization/import_read.h index ab89f93880c34..ae78f1979f10a 100644 --- a/torch/csrc/jit/serialization/import_read.h +++ b/torch/csrc/jit/serialization/import_read.h @@ -16,9 +16,9 @@ TORCH_API IValue readArchiveAndTensors( const std::string& archive_name, const std::string& pickle_prefix, const std::string& tensor_prefix, - c10::optional type_resolver, - c10::optional obj_loader, - c10::optional device, + std::optional type_resolver, + std::optional obj_loader, + std::optional device, caffe2::serialize::PyTorchStreamReader& stream_reader, c10::TypePtr (*type_parser)(const std::string&) = Unpickler::defaultTypeParser, diff --git a/torch/csrc/jit/serialization/import_source.cpp b/torch/csrc/jit/serialization/import_source.cpp index 53d0d9fd47359..f67c2a22e9eb1 100644 --- a/torch/csrc/jit/serialization/import_source.cpp +++ b/torch/csrc/jit/serialization/import_source.cpp @@ -304,7 +304,7 @@ void SourceImporterImpl::importNamedType( } } -c10::optional SourceImporterImpl:: +std::optional SourceImporterImpl:: attributeAssignmentSpecialHandlingHack( const QualifiedName& qualified_classname, const Assign& assign) { @@ -703,7 +703,7 @@ void SourceImporterImpl::importNamedTuple( const auto assign = Assign(statement); auto name = Var(Assign(statement).lhs()).name().name(); - c10::optional default_val; + std::optional default_val; if (assign.rhs().present()) { std::vector parsed = type_parser.evaluateDefaults( assign.rhs().range(), {assign.rhs().get()}, {assign.type().get()}); diff --git a/torch/csrc/jit/serialization/import_source.h b/torch/csrc/jit/serialization/import_source.h index 9a720a81bcbb2..9b364f379b409 100644 --- a/torch/csrc/jit/serialization/import_source.h +++ b/torch/csrc/jit/serialization/import_source.h @@ -45,7 +45,7 @@ struct SourceImporterImpl : public Resolver, private: void importFunction(const std::string& qualifier, const Def& def); void importNamedType(const std::string& 
qualifier, const ClassDef& class_def); - c10::optional attributeAssignmentSpecialHandlingHack( + std::optional attributeAssignmentSpecialHandlingHack( const QualifiedName& qualified_classname, const Assign& assign); void importClass( @@ -66,7 +66,7 @@ struct SourceImporterImpl : public Resolver, std::shared_ptr cu_; std::unordered_map> env_; SourceLoader source_loader_; - c10::optional version_ = c10::nullopt; + std::optional version_ = c10::nullopt; std::unordered_set loaded_sources_; // named types and functions loaded from a file but not yet defined because // their type has not been requested yet. diff --git a/torch/csrc/jit/serialization/pickler.cpp b/torch/csrc/jit/serialization/pickler.cpp index 6e1b399e40fd4..173ab5c13e5da 100644 --- a/torch/csrc/jit/serialization/pickler.cpp +++ b/torch/csrc/jit/serialization/pickler.cpp @@ -601,7 +601,7 @@ void Pickler::startTypeTag() { } } namespace { -c10::optional type_printer(const c10::Type& type) { +std::optional type_printer(const c10::Type& type) { if (auto dyn = type.castRaw()) { return dyn->fallback()->annotation_str(type_printer); } diff --git a/torch/csrc/jit/serialization/pickler.h b/torch/csrc/jit/serialization/pickler.h index 4f553b6f7ca8a..39726d00b0998 100644 --- a/torch/csrc/jit/serialization/pickler.h +++ b/torch/csrc/jit/serialization/pickler.h @@ -311,14 +311,14 @@ inline std::unordered_set& GetBackendMetaAllowlist() { // Dynamically obtain serialization function pairs // that require the corresponding backend. inline std::array< - c10::optional>, + std::optional>, at::COMPILE_TIME_MAX_DEVICE_TYPES>& GetBackendMetaSerialization() { // The array to save function pointer for BackendMeta serialization. // key is the DeviceType, value is std::pair obj. // value.first represent get function and value.seconde represent set function static std::array< - c10::optional>, + std::optional>, at::COMPILE_TIME_MAX_DEVICE_TYPES> BackendMetaSerialization; return BackendMetaSerialization; @@ -348,7 +348,7 @@ TORCH_API inline void TensorBackendMetaRegistry( t, " has been registered."); BackendMetaSerialization[device_type] = - c10::optional>( + std::optional>( std::make_pair(get_fptr, set_fptr)); } diff --git a/torch/csrc/jit/serialization/python_print.cpp b/torch/csrc/jit/serialization/python_print.cpp index cac31c6ce5868..f1b0865032c39 100644 --- a/torch/csrc/jit/serialization/python_print.cpp +++ b/torch/csrc/jit/serialization/python_print.cpp @@ -1714,7 +1714,7 @@ static std::vector traverseIValueAndGetObjects(IValue ivalue) { return result; } -static c10::optional printType( +static std::optional printType( const c10::Type& type, torch::jit::TypeNameUniquer& type_name_uniquer) { if (auto dyn = type.castRaw()) { diff --git a/torch/csrc/jit/serialization/source_range_serialization.cpp b/torch/csrc/jit/serialization/source_range_serialization.cpp index d3c4eaf7bf491..118becd20dc7c 100644 --- a/torch/csrc/jit/serialization/source_range_serialization.cpp +++ b/torch/csrc/jit/serialization/source_range_serialization.cpp @@ -68,7 +68,7 @@ std::shared_ptr SourceRangeDeserializer::deserialize_source( const auto& textIndex = tup_elems[0].toIntList(); int64_t fnameIndex = tup_elems[1].toInt(); int64_t starting_line_no_ = tup_elems[2].toInt(); - c10::optional filename = c10::nullopt; + std::optional filename = c10::nullopt; TORCH_CHECK( (uint64_t)fnameIndex < text_table_.size(), @@ -88,7 +88,7 @@ std::shared_ptr SourceRangeDeserializer::deserialize_source( source = std::make_shared(str_cord, filename, starting_line_no_); } else { std::string text_ = 
tup_elems[0].toStringRef(); - c10::optional filename_ = + std::optional filename_ = tup_elems[1].toOptional(); int64_t starting_line_no_ = tup_elems[2].toInt(); source = std::make_shared( @@ -229,7 +229,7 @@ void ConcreteSourceRangeUnpickler::unpickle() { } } -c10::optional ConcreteSourceRangeUnpickler:: +std::optional ConcreteSourceRangeUnpickler:: findSourceRangeThatGenerated(const SourceRange& range) { unpickle(); diff --git a/torch/csrc/jit/serialization/source_range_serialization.h b/torch/csrc/jit/serialization/source_range_serialization.h index bbfd533cd1789..044e9655a9ea1 100644 --- a/torch/csrc/jit/serialization/source_range_serialization.h +++ b/torch/csrc/jit/serialization/source_range_serialization.h @@ -55,7 +55,7 @@ class SourceRangeDeserializer { class SourceRangeUnpickler { public: - virtual c10::optional findSourceRangeThatGenerated( + virtual std::optional findSourceRangeThatGenerated( const SourceRange& range) = 0; virtual ~SourceRangeUnpickler() = default; diff --git a/torch/csrc/jit/serialization/source_range_serialization_impl.h b/torch/csrc/jit/serialization/source_range_serialization_impl.h index 2b7cd5a14ba92..9b00956ccd048 100644 --- a/torch/csrc/jit/serialization/source_range_serialization_impl.h +++ b/torch/csrc/jit/serialization/source_range_serialization_impl.h @@ -12,7 +12,7 @@ class ConcreteSourceRangeUnpickler : public SourceRangeUnpickler { public: ConcreteSourceRangeUnpickler(at::DataPtr&& data, size_t size); - c10::optional findSourceRangeThatGenerated( + std::optional findSourceRangeThatGenerated( const SourceRange& range) override; private: diff --git a/torch/csrc/jit/serialization/unpickler.cpp b/torch/csrc/jit/serialization/unpickler.cpp index 26fa21575368d..ee5793b14856a 100644 --- a/torch/csrc/jit/serialization/unpickler.cpp +++ b/torch/csrc/jit/serialization/unpickler.cpp @@ -822,7 +822,7 @@ void Unpickler::readGlobal( // like the other branches here because no REDUCE or BUILD will // be called on this value. Instead, we just put it on the stack // and return early - c10::optional scalar_type; + std::optional scalar_type; #define CHECK_SCALAR(_, name) \ if (class_name == #name "Storage") { \ scalar_type = c10::k##name; \ @@ -834,7 +834,7 @@ void Unpickler::readGlobal( return; } - c10::optional qscheme; + std::optional qscheme; for (int i = 0; i < at::COMPILE_TIME_NUM_QSCHEMES; ++i) { if (class_name == toString(static_cast(i))) { qscheme = static_cast(i); diff --git a/torch/csrc/jit/serialization/unpickler.h b/torch/csrc/jit/serialization/unpickler.h index bc980bf90522b..eed216455f3e2 100644 --- a/torch/csrc/jit/serialization/unpickler.h +++ b/torch/csrc/jit/serialization/unpickler.h @@ -68,7 +68,7 @@ class TORCH_API Unpickler { TypeResolver type_resolver, ObjLoader obj_loader, std::function read_record, - c10::optional device, + std::optional device, bool use_storage_device = false, TypeParserT type_parser = defaultTypeParser, std::shared_ptr storage_context = nullptr) @@ -178,7 +178,7 @@ class TORCH_API Unpickler { IValue empty_tuple_; std::function read_record_; - c10::optional device_; + std::optional device_; // When set to true, Unpickler will ignore the pickled device and use the // device of the DataPtr returned by the read_record_ function. The default // value of this flag is false. 
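The import/load entry points changed above keep their existing call patterns after the rename; only the spelling of the optional device argument moves from c10::optional<c10::Device> to std::optional<c10::Device>, as the import.h declarations earlier in this patch show. A minimal usage sketch, assuming libtorch's torch/script.h is available and using a hypothetical archive path "model.pt":

#include <optional>
#include <torch/script.h>  // assumption: provides torch::jit::load, torch::jit::Module, c10::Device

int main() {
  // No device override: tensors keep the devices recorded in the archive.
  torch::jit::Module m = torch::jit::load("model.pt");

  // Explicit override: an engaged std::optional<c10::Device> (previously spelled
  // c10::optional<c10::Device>) asks the deserializer to place tensors on that device.
  std::optional<c10::Device> device = c10::Device(c10::kCPU);
  torch::jit::Module m_cpu = torch::jit::load("model.pt", device);
  return 0;
}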
diff --git a/torch/csrc/jit/tensorexpr/codegen.cpp b/torch/csrc/jit/tensorexpr/codegen.cpp index 53754aab7c0d6..e1464d0efc3ec 100644 --- a/torch/csrc/jit/tensorexpr/codegen.cpp +++ b/torch/csrc/jit/tensorexpr/codegen.cpp @@ -95,7 +95,7 @@ void CodeGen::call_with_numel(void** args, int64_t numel) { false, "This codegen backend does not implement call_with_numel"); } -static c10::optional bufSize(BufPtr buf) { +static std::optional bufSize(BufPtr buf) { size_t size = elementSize(buf->dtype().scalar_type()) * buf->dtype().lanes(); for (auto& d : buf->dims()) { if (!d->isConstant()) { diff --git a/torch/csrc/jit/tensorexpr/codegen.h b/torch/csrc/jit/tensorexpr/codegen.h index fdcf3425e3abc..42db25c26ea49 100644 --- a/torch/csrc/jit/tensorexpr/codegen.h +++ b/torch/csrc/jit/tensorexpr/codegen.h @@ -85,10 +85,10 @@ class TORCH_API CodeGen { virtual at::Tensor empty_strided( c10::IntArrayRef size, c10::IntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt) { + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt) { return at::empty_strided( size, stride, dtype_opt, layout_opt, device_opt, pin_memory_opt); } diff --git a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp index 07626232399e4..602bc49302c53 100644 --- a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp @@ -1275,10 +1275,10 @@ void CudaCodeGen::call(const std::vector& args) { at::Tensor CudaCodeGen::empty_strided( c10::IntArrayRef size, c10::IntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt) { + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt) { c10::DeviceGuard device_guard(device_opt.value()); return at::native::empty_strided_cuda( size, stride, dtype_opt, layout_opt, device_opt, pin_memory_opt); diff --git a/torch/csrc/jit/tensorexpr/cuda_codegen.h b/torch/csrc/jit/tensorexpr/cuda_codegen.h index 22de1ce32d00f..74f3d4ec7835b 100644 --- a/torch/csrc/jit/tensorexpr/cuda_codegen.h +++ b/torch/csrc/jit/tensorexpr/cuda_codegen.h @@ -235,10 +235,10 @@ class TORCH_CUDA_CU_API CudaCodeGen : public CodeGen { at::Tensor empty_strided( c10::IntArrayRef size, c10::IntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt) override; + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt) override; const std::vector& gpu_block_extents() const { return cuda_analysis_->gpu_block_extents(); diff --git a/torch/csrc/jit/tensorexpr/eval.cpp b/torch/csrc/jit/tensorexpr/eval.cpp index e5a59ae33ef26..be1057e21c3c7 100644 --- a/torch/csrc/jit/tensorexpr/eval.cpp +++ b/torch/csrc/jit/tensorexpr/eval.cpp @@ -1300,7 +1300,7 @@ InterpValue SimpleIREvaluator::value() const { return impl_->value(); } -c10::optional evalInt(ExprPtr e) { +std::optional evalInt(ExprPtr e) { try { return ExprEval(cast(ExprHandle(e))) .value(); diff --git a/torch/csrc/jit/tensorexpr/eval.h b/torch/csrc/jit/tensorexpr/eval.h index 64ac1edf8f188..9bbea1bd28a43 100644 --- a/torch/csrc/jit/tensorexpr/eval.h +++ b/torch/csrc/jit/tensorexpr/eval.h @@ -307,7 +307,7 @@ class ExprEval { // Evaluates the given expression and returns an int64_t value if the result of // the given expression is 
int64_t. -c10::optional evalInt(ExprPtr e); +std::optional evalInt(ExprPtr e); // Substitutes the given vars with their corresponding expressions in the input // expression. diff --git a/torch/csrc/jit/tensorexpr/expr.cpp b/torch/csrc/jit/tensorexpr/expr.cpp index cffc5e45dbf46..bf3cc13ccb39f 100644 --- a/torch/csrc/jit/tensorexpr/expr.cpp +++ b/torch/csrc/jit/tensorexpr/expr.cpp @@ -415,7 +415,7 @@ Buf::Buf( std::vector dims, Dtype dtype, ExprPtr initializer, - c10::optional> strides, + std::optional> strides, ExprPtr qscale, ExprPtr qzero) : ExprNodeBase(dtype, kPrimitive), @@ -452,11 +452,11 @@ BufHandle Buf::make( const std::string& name_hint, const std::vector& dims, Dtype dtype, - c10::optional initializer, - c10::optional> strides, - c10::optional qscale, - c10::optional qzero) { - c10::optional> opt_strides; + std::optional initializer, + std::optional> strides, + std::optional qscale, + std::optional qzero) { + std::optional> opt_strides; if (strides) { opt_strides = ExprHandleVectorToExprVector(*strides); } diff --git a/torch/csrc/jit/tensorexpr/expr.h b/torch/csrc/jit/tensorexpr/expr.h index 1a0cc57875d19..8c8de89975750 100644 --- a/torch/csrc/jit/tensorexpr/expr.h +++ b/torch/csrc/jit/tensorexpr/expr.h @@ -207,10 +207,10 @@ class TORCH_API Buf : public ExprNode { const std::string& name_hint, const std::vector& dims, Dtype dtype, - c10::optional initializer = c10::nullopt, - c10::optional> strides = c10::nullopt, - c10::optional qscale = c10::nullopt, - c10::optional qzero = c10::nullopt); + std::optional initializer = c10::nullopt, + std::optional> strides = c10::nullopt, + std::optional qscale = c10::nullopt, + std::optional qzero = c10::nullopt); // TODO: unique_name VarPtr base_handle() const { @@ -232,7 +232,7 @@ class TORCH_API Buf : public ExprNode { const std::vector& dims, Dtype dtype, ExprPtr initializer = nullptr, - c10::optional> strides = c10::nullopt, + std::optional> strides = c10::nullopt, ExprPtr qscale = nullptr, ExprPtr qzero = nullptr) : Buf(alloc(name_hint, kHandle), @@ -248,7 +248,7 @@ class TORCH_API Buf : public ExprNode { std::vector dims, Dtype dtype, ExprPtr initializer = nullptr, - c10::optional> strides = c10::nullopt, + std::optional> strides = c10::nullopt, ExprPtr qscale = nullptr, ExprPtr qzero = nullptr); diff --git a/torch/csrc/jit/tensorexpr/external_functions.cpp b/torch/csrc/jit/tensorexpr/external_functions.cpp index c593ab80e811c..a3146ccfaff55 100644 --- a/torch/csrc/jit/tensorexpr/external_functions.cpp +++ b/torch/csrc/jit/tensorexpr/external_functions.cpp @@ -80,7 +80,7 @@ std::vector constructTensors( int64_t* buf_dims, int64_t* buf_strides, int8_t* buf_dtypes, - c10::optional>> qdataArg) { + std::optional>> qdataArg) { std::vector buf_data_vec; std::vector> buf_dims_vec; std::vector> buf_strides_vec; @@ -123,7 +123,7 @@ std::vector constructTensors( } } else { // handle quantized - std::vector> qdata(bufs_num, c10::nullopt); + std::vector> qdata(bufs_num, c10::nullopt); for (const auto& qd : *qdataArg) { qdata[qd.first] = qd.second; } @@ -172,7 +172,7 @@ static std::vector constructTensors( int64_t* buf_strides, int8_t* buf_dtypes, std::vector> qdata) { - c10::optional>> opt = std::move(qdata); + std::optional>> opt = std::move(qdata); return constructTensors( bufs_num, buf_data, buf_ranks, buf_dims, buf_strides, buf_dtypes, opt); } @@ -184,7 +184,7 @@ std::vector constructTensors2( int64_t* buf_dims, int64_t* buf_strides, int8_t* buf_dtypes, - c10::optional>> qdataArg, + std::optional>> qdataArg, size_t bufs_out_num) { std::vector 
buf_data_vec; std::vector> buf_dims_vec; @@ -233,7 +233,7 @@ std::vector constructTensors2( } } else { // handle quantized - std::vector> qdata(bufs_in_num, c10::nullopt); + std::vector> qdata(bufs_in_num, c10::nullopt); for (const auto& qd : *qdataArg) { qdata[qd.first - bufs_out_num] = qd.second; } @@ -283,7 +283,7 @@ static std::vector constructTensors2( int8_t* buf_dtypes, std::vector> qdata, size_t bufs_out_num = 0u) { - c10::optional>> opt = std::move(qdata); + std::optional>> opt = std::move(qdata); return constructTensors2( bufs_in_num, buf_data, @@ -331,15 +331,15 @@ static at::Tensor quantized_mul_scalar(const at::Tensor& x, double scalar) { static at::Tensor quantized_cat( const c10::List& qxs, int64_t dim, - c10::optional scale, - c10::optional zero) { + std::optional scale, + std::optional zero) { const auto op = c10::Dispatcher::singleton() .findSchemaOrThrow("quantized::cat", "") .typed const&, int64_t, - c10::optional, - c10::optional)>(); + std::optional, + std::optional)>(); return op.redispatch( c10::DispatchKeySet({c10::DispatchKey::QuantizedCPU}), qxs, @@ -972,7 +972,7 @@ void nnc_aten_upsample_nearest2d( const int64_t x_qzero = extra_args[1]; const int64_t x_qdtype = extra_args[2]; const auto is_quantized = x_qdtype != -1; - c10::optional>> qdata; + std::optional>> qdata; if (is_quantized) { qdata = { {1u, @@ -992,9 +992,9 @@ void nnc_aten_upsample_nearest2d( auto r = at::upsample_nearest2d( x, (output_size_h != -1) - ? c10::optional({output_size_h, output_size_w}) + ? std::optional({output_size_h, output_size_w}) : c10::nullopt, - (scale_factor_h != -1.f) ? c10::optional>( + (scale_factor_h != -1.f) ? std::optional>( {scale_factor_h, scale_factor_w}) : c10::nullopt); memcpy(buf_data[0], r.const_data_ptr(), r.element_size() * r.numel()); @@ -1015,7 +1015,7 @@ void nnc_aten_upsample_nearest2d_out( const int64_t x_qzero = extra_args[1]; const int64_t x_qdtype = extra_args[2]; const auto is_quantized = x_qdtype != -1; - c10::optional>> qdata; + std::optional>> qdata; if (is_quantized) { qdata = { {1u, @@ -1042,9 +1042,9 @@ void nnc_aten_upsample_nearest2d_out( auto r = at::upsample_nearest2d( x, (output_size_h != -1) - ? c10::optional({output_size_h, output_size_w}) + ? std::optional({output_size_h, output_size_w}) : c10::nullopt, - (scale_factor_h != -1.f) ? c10::optional>( + (scale_factor_h != -1.f) ? 
std::optional>( {scale_factor_h, scale_factor_w}) : c10::nullopt); buf_data[0] = r.data_ptr(); diff --git a/torch/csrc/jit/tensorexpr/external_functions.h b/torch/csrc/jit/tensorexpr/external_functions.h index 627d67c934d59..1fd90a3f056b8 100644 --- a/torch/csrc/jit/tensorexpr/external_functions.h +++ b/torch/csrc/jit/tensorexpr/external_functions.h @@ -74,7 +74,7 @@ std::vector constructTensors( int64_t* buf_dims, int64_t* buf_strides, int8_t* buf_dtypes, - c10::optional>> qdataArg = + std::optional>> qdataArg = c10::nullopt); std::vector constructTensors2( @@ -84,7 +84,7 @@ std::vector constructTensors2( int64_t* buf_dims, int64_t* buf_strides, int8_t* buf_dtypes, - c10::optional>> qdataArg = + std::optional>> qdataArg = c10::nullopt, size_t bufs_out_num = 0); diff --git a/torch/csrc/jit/tensorexpr/graph_opt.cpp b/torch/csrc/jit/tensorexpr/graph_opt.cpp index c8f06fea063fd..01511b2b4d8c5 100644 --- a/torch/csrc/jit/tensorexpr/graph_opt.cpp +++ b/torch/csrc/jit/tensorexpr/graph_opt.cpp @@ -184,7 +184,7 @@ bool OptimizeCat(const std::shared_ptr& graph) { void annotateInputShapes( const std::shared_ptr& graph, - const std::vector>& example_inputs) { + const std::vector>& example_inputs) { TORCH_INTERNAL_ASSERT( graph->inputs().size() == example_inputs.size(), buildErrorMessage("Given inputs do not match the fuser graph inputs.")); @@ -304,8 +304,8 @@ bool isGraphCompilable(const std::shared_ptr& graph) { static void fixupTypeInfoForValue( Value* v, - c10::optional scalar_type, - c10::optional device) { + std::optional scalar_type, + std::optional device) { Node* n = v->node(); auto const& t = v->type(); if (t->kind() != TypeKind::TensorType) { @@ -339,8 +339,8 @@ static void fixupTypeInfoForValue( v->setType(new_tt); } -static c10::optional inferScalarType(Node* n) { - c10::optional scalar_type; +static std::optional inferScalarType(Node* n) { + std::optional scalar_type; for (auto v : n->inputs()) { auto const& t = v->type(); if (t->kind() == TypeKind::TensorType) { @@ -358,8 +358,8 @@ static c10::optional inferScalarType(Node* n) { return scalar_type; } -static c10::optional inferDevice(Node* n) { - c10::optional device; +static std::optional inferDevice(Node* n) { + std::optional device; for (auto v : n->inputs()) { auto const& t = v->type(); if (t->kind() == TypeKind::TensorType) { @@ -394,8 +394,8 @@ void fixupMissingShapeInfo(const std::shared_ptr& graph) { } for (auto n : graph->nodes()) { - c10::optional scalar_type = inferScalarType(n); - c10::optional device = inferDevice(n); + std::optional scalar_type = inferScalarType(n); + std::optional device = inferDevice(n); for (auto v : n->outputs()) { fixupTypeInfoForValue(v, scalar_type, device); diff --git a/torch/csrc/jit/tensorexpr/graph_opt.h b/torch/csrc/jit/tensorexpr/graph_opt.h index 1180d0ac438b9..5bd2ec8600931 100644 --- a/torch/csrc/jit/tensorexpr/graph_opt.h +++ b/torch/csrc/jit/tensorexpr/graph_opt.h @@ -60,7 +60,7 @@ bool OptimizeCat(const std::shared_ptr& graph); TORCH_API void annotateInputShapes( const std::shared_ptr& graph, - const std::vector>& example_inputs); + const std::vector>& example_inputs); TORCH_API std::shared_ptr removeUnusedSelfArgument( const std::shared_ptr& graph); TORCH_API std::shared_ptr removeGraphOutput( diff --git a/torch/csrc/jit/tensorexpr/ir.h b/torch/csrc/jit/tensorexpr/ir.h index 1ab21c83ef183..f35bafb332eaf 100644 --- a/torch/csrc/jit/tensorexpr/ir.h +++ b/torch/csrc/jit/tensorexpr/ir.h @@ -361,7 +361,7 @@ ExprPtr immLike(const ExprHandle& e, T v) { return immLike(e.node(), v); } -inline 
c10::optional intValue(const ExprPtr& e) { +inline std::optional intValue(const ExprPtr& e) { #define TYPE_CASE(Type, Name) \ if (auto v = to(e)) { \ return v->value(); \ @@ -371,7 +371,7 @@ inline c10::optional intValue(const ExprPtr& e) { return c10::nullopt; } -inline c10::optional intValue(const ExprHandle& e) { +inline std::optional intValue(const ExprHandle& e) { return intValue(e.node()); } diff --git a/torch/csrc/jit/tensorexpr/ir_simplifier.cpp b/torch/csrc/jit/tensorexpr/ir_simplifier.cpp index 4ce640bb8a739..afb7aefdda652 100644 --- a/torch/csrc/jit/tensorexpr/ir_simplifier.cpp +++ b/torch/csrc/jit/tensorexpr/ir_simplifier.cpp @@ -1867,7 +1867,7 @@ class ModRound { ExprPtr mod_divisor; }; -static c10::optional isModRound(TermPtr e) { +static std::optional isModRound(TermPtr e) { DivPtr div{nullptr}; ModPtr mod{nullptr}; ExprPtr denom{nullptr}; diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index a360762f5bf9c..50578a0414572 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -128,9 +128,9 @@ bool& getOptConditionals() { return opt_conditionals; } -c10::optional pickDeviceType( +std::optional pickDeviceType( const at::ArrayRef& inputs) { - c10::optional device = c10::nullopt; + std::optional device = c10::nullopt; for (auto const& input : inputs) { auto tt = input->type()->cast(); if (tt && tt->device()) { @@ -143,9 +143,9 @@ c10::optional pickDeviceType( return device; } -static c10::optional pickDeviceType( +static std::optional pickDeviceType( const std::shared_ptr& graph) { - c10::optional device = c10::nullopt; + std::optional device = c10::nullopt; for (auto const& node : graph->nodes()) { for (auto const& input : node->inputs()) { if (auto tt = input->type()->cast()) { @@ -179,7 +179,7 @@ static c10::optional pickDeviceType( // If v is a Tensor with concretely-known sizes and dtype, return them, else // nullopt. -static c10::optional getTensorInfoJit(torch::jit::Value* v) { +static std::optional getTensorInfoJit(torch::jit::Value* v) { auto const& it = v->type()->cast(); c10::ScalarType dtype = c10::ScalarType::Float; @@ -527,7 +527,7 @@ std::vector TensorExprKernel::sizesForValue( throw malformed_input(msg); } -static c10::optional findDtypeForValue(const torch::jit::Value* v) { +static std::optional findDtypeForValue(const torch::jit::Value* v) { if (v->type()->kind() == TypeKind::TensorType) { auto tt = v->type()->cast(); if (tt->scalarType()) { @@ -707,7 +707,7 @@ static void fuseAllLoops(StmtPtr st) { } // Compute the trip count of a loop if it is a constant. 
-static c10::optional tripCount(ForPtr loop) { +static std::optional tripCount(ForPtr loop) { auto tc = IRSimplifier::simplify( cast(ExprHandle(loop->stop()) - ExprHandle(loop->start()))); if (auto val = to(tc.node())) { @@ -958,7 +958,7 @@ std::string TensorExprKernel::getCodeGenName(BackendType backendType) { } template -static bool isValidPrimProperty(const c10::optional& a, T b) { +static bool isValidPrimProperty(const std::optional& a, T b) { return !a.has_value() || *a == b; } diff --git a/torch/csrc/jit/tensorexpr/kernel.h b/torch/csrc/jit/tensorexpr/kernel.h index 45658beb750e9..d7c737d8f8f2c 100644 --- a/torch/csrc/jit/tensorexpr/kernel.h +++ b/torch/csrc/jit/tensorexpr/kernel.h @@ -274,10 +274,10 @@ class TORCH_API TensorExprKernel { const std::vector& interm_bufs); struct UnpackedTensorOptions { - c10::optional dtype; - c10::optional layout; - c10::optional device; - c10::optional pinned_memory; + std::optional dtype; + std::optional layout; + std::optional device; + std::optional pinned_memory; UnpackedTensorOptions(const c10::TensorOptions& opts) : dtype(c10::optTypeMetaToScalarType(opts.dtype_opt())), @@ -370,7 +370,7 @@ TORCH_API bool setFallbackAllowed(bool value); TORCH_API bool& getCatWoConditionals(); TORCH_API bool& getOptConditionals(); -TORCH_API c10::optional pickDeviceType( +TORCH_API std::optional pickDeviceType( const at::ArrayRef& inputs); bool isContiguous( diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp index fd7f0818996c9..dec03637847e2 100644 --- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp @@ -84,16 +84,16 @@ C10_DEFINE_bool( namespace torch::jit::tensorexpr { -c10::optional& LLVMTargetTriple() { - static c10::optional triple = c10::nullopt; +std::optional& LLVMTargetTriple() { + static std::optional triple = c10::nullopt; return triple; } -c10::optional& LLVMTargetCPU() { - static c10::optional cpu = c10::nullopt; +std::optional& LLVMTargetCPU() { + static std::optional cpu = c10::nullopt; return cpu; } -c10::optional& LLVMTargetAttrs() { - static c10::optional attrs = c10::nullopt; +std::optional& LLVMTargetAttrs() { + static std::optional attrs = c10::nullopt; return attrs; } bool& LLVMAOTWorkflow() { @@ -306,9 +306,9 @@ class LLVMCodeGenImpl : public IRVisitor { at::Device device, Dtype dtype, std::string kernel_func_name, - c10::optional triple, - c10::optional cpu, - c10::optional attrs); + std::optional triple, + std::optional cpu, + std::optional attrs); ~LLVMCodeGenImpl() = default; llvm::JITTargetAddress getKernelAddress() const; @@ -397,9 +397,9 @@ LLVMCodeGen::LLVMCodeGen( at::Device device, const std::string& kernel_func_name, Dtype dtype, - c10::optional triple, - c10::optional cpu, - c10::optional attrs) + std::optional triple, + std::optional cpu, + std::optional attrs) : CodeGen(stmt, args, device, kernel_func_name) { impl_ = std::make_unique( this->stmt(), @@ -446,10 +446,10 @@ void LLVMCodeGen::call(const std::vector& args) { at::Tensor LLVMCodeGen::empty_strided( c10::IntArrayRef size, c10::IntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt) { + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt) { return at::native::empty_strided_cpu( size, stride, dtype_opt, layout_opt, device_opt, pin_memory_opt); } @@ -489,9 +489,9 @@ LLVMCodeGenImpl::LLVMCodeGenImpl( at::Device device, Dtype dtype, 
std::string kernel_func_name, - c10::optional triple, - c10::optional cpu, - c10::optional attrs) + std::optional triple, + std::optional cpu, + std::optional attrs) : context_(std::make_unique()), irb_(getContext()), kernel_func_name_(std::move(kernel_func_name)), diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.h b/torch/csrc/jit/tensorexpr/llvm_codegen.h index 7ab506fa8fe1e..74271fa879f3d 100644 --- a/torch/csrc/jit/tensorexpr/llvm_codegen.h +++ b/torch/csrc/jit/tensorexpr/llvm_codegen.h @@ -27,9 +27,9 @@ class TORCH_API LLVMCodeGen : public CodeGen { at::Device device = at::kCPU, const std::string& kernel_func_name = "func", Dtype dtype = kInt, - c10::optional triple = c10::nullopt, - c10::optional cpu = c10::nullopt, - c10::optional attrs = c10::nullopt); + std::optional triple = c10::nullopt, + std::optional cpu = c10::nullopt, + std::optional attrs = c10::nullopt); explicit LLVMCodeGen(StmtPtr stmt); LLVMCodeGen() = delete; @@ -48,10 +48,10 @@ class TORCH_API LLVMCodeGen : public CodeGen { at::Tensor empty_strided( c10::IntArrayRef size, c10::IntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt) override; + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt) override; template T value() { @@ -126,14 +126,14 @@ struct TORCH_API LLVMCodeGenBuilder { at::Device device_ = at::kCPU; std::string kernelFuncName_ = "func"; Dtype dtype_ = kInt; - c10::optional triple_ = c10::nullopt; - c10::optional cpu_ = c10::nullopt; - c10::optional attrs_ = c10::nullopt; + std::optional triple_ = c10::nullopt; + std::optional cpu_ = c10::nullopt; + std::optional attrs_ = c10::nullopt; }; -TORCH_API c10::optional& LLVMTargetTriple(); -TORCH_API c10::optional& LLVMTargetCPU(); -TORCH_API c10::optional& LLVMTargetAttrs(); +TORCH_API std::optional& LLVMTargetTriple(); +TORCH_API std::optional& LLVMTargetCPU(); +TORCH_API std::optional& LLVMTargetAttrs(); TORCH_API bool& LLVMAOTWorkflow(); } // namespace tensorexpr diff --git a/torch/csrc/jit/tensorexpr/llvm_jit.cpp b/torch/csrc/jit/tensorexpr/llvm_jit.cpp index 71f4fed3db3e7..37a4b8db6bb27 100644 --- a/torch/csrc/jit/tensorexpr/llvm_jit.cpp +++ b/torch/csrc/jit/tensorexpr/llvm_jit.cpp @@ -67,8 +67,8 @@ static llvm::SubtargetFeatures getHostSubtargetFeatures() { // Create a JTMB using the host's triple. CPU and attrs default to the host // unless they are supplied. static llvm::orc::JITTargetMachineBuilder makeJTMBFromHost( - c10::optional cpu, - c10::optional attrs) { + std::optional cpu, + std::optional attrs) { llvm::orc::JITTargetMachineBuilder JTMB( (llvm::Triple(llvm::sys::getProcessTriple()))); JTMB.setCPU(cpu.value_or(llvm::sys::getHostCPUName().str())); @@ -85,8 +85,8 @@ static llvm::orc::JITTargetMachineBuilder makeJTMBFromHost( // Create a JTMB using a given triple. Do not set cpu or attrs if not supplied. static llvm::orc::JITTargetMachineBuilder makeJTMBFromTriple( const std::string& triple, - c10::optional cpu, - c10::optional attrs) { + std::optional cpu, + std::optional attrs) { llvm::orc::JITTargetMachineBuilder JTMB((llvm::Triple(triple))); if (cpu) { JTMB.setCPU(*cpu); @@ -100,9 +100,9 @@ static llvm::orc::JITTargetMachineBuilder makeJTMBFromTriple( } static llvm::orc::JITTargetMachineBuilder makeTargetMachineBuilder( - c10::optional triple, - c10::optional cpu, - c10::optional attrs) { + std::optional triple, + std::optional cpu, + std::optional attrs) { auto JTMB = triple ? 
makeJTMBFromTriple(*triple, cpu, attrs) : makeJTMBFromHost(cpu, attrs); #if LLVM_VERSION_MAJOR >= 18 @@ -160,9 +160,9 @@ class TORCH_API PytorchLLVMJITImpl { public: PytorchLLVMJITImpl( - c10::optional triple, - c10::optional cpu, - c10::optional attrs) + std::optional triple, + std::optional cpu, + std::optional attrs) : TM(assertSuccess(makeTargetMachineBuilder(triple, cpu, attrs) .createTargetMachine())), LLJ(assertSuccess( @@ -241,9 +241,9 @@ class TORCH_API PytorchLLVMJITImpl { public: PytorchLLVMJITImpl( - c10::optional triple, - c10::optional cpu, - c10::optional attrs) + std::optional triple, + std::optional cpu, + std::optional attrs) : Resolver(createLegacyLookupResolver( ES, [this](const std::string& Name) -> JITSymbol { @@ -320,9 +320,9 @@ class TORCH_API PytorchLLVMJITImpl { #endif PytorchLLVMJIT::PytorchLLVMJIT( - c10::optional triple, - c10::optional cpu, - c10::optional attrs) + std::optional triple, + std::optional cpu, + std::optional attrs) : impl_(std::make_unique(triple, cpu, attrs)) {} PytorchLLVMJIT::~PytorchLLVMJIT() = default; diff --git a/torch/csrc/jit/tensorexpr/llvm_jit.h b/torch/csrc/jit/tensorexpr/llvm_jit.h index 4aca55a9abf47..98238e0043885 100644 --- a/torch/csrc/jit/tensorexpr/llvm_jit.h +++ b/torch/csrc/jit/tensorexpr/llvm_jit.h @@ -51,9 +51,9 @@ class PytorchLLVMJITImpl; class TORCH_API PytorchLLVMJIT { public: PytorchLLVMJIT( - c10::optional triple, - c10::optional cpu, - c10::optional attrs); + std::optional triple, + std::optional cpu, + std::optional attrs); ~PytorchLLVMJIT(); void addModule(std::unique_ptr M, std::unique_ptr C); diff --git a/torch/csrc/jit/tensorexpr/lowerings.cpp b/torch/csrc/jit/tensorexpr/lowerings.cpp index 79f0c59e59b39..1518e06376c14 100644 --- a/torch/csrc/jit/tensorexpr/lowerings.cpp +++ b/torch/csrc/jit/tensorexpr/lowerings.cpp @@ -55,7 +55,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { auto sub_lambda = [](const ExprHandle& lhs, const ExprHandle& rhs) { // NB: sub isn't supported on boolean, no need to promote to integer. 
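The lowering hunks in this file all make the same mechanical change: only the optional wrapper in the lowering signature switches from c10::optional to std::optional, while existing c10::nullopt initializers and the kFloat fallback for a missing output type are left untouched (presumably because the c10 names alias the std equivalents at this point). Below is a minimal standalone sketch of that calling pattern; describeLowering and its std::string/int placeholder types are hypothetical stand-ins for the real ArgValue/ExprHandle/ScalarType signatures, not code from this patch.

#include <iostream>
#include <optional>
#include <string>
#include <vector>

// Hypothetical stand-in for an NNC-style lowering: the output dtype is an
// optional parameter that falls back to "float" when the caller passes
// nullopt, mirroring the `Dtype dtype = kFloat; if (outputType) ...` pattern
// seen in the lowerings above.
std::string describeLowering(
    const std::vector<int>& outputShape,
    const std::optional<std::string>& outputType) {
  return "rank " + std::to_string(outputShape.size()) + ", dtype " +
      outputType.value_or("float");
}

int main() {
  std::cout << describeLowering({2, 3}, std::nullopt) << "\n"; // falls back to float
  std::cout << describeLowering({2, 3}, "double") << "\n";     // explicit dtype
  return 0;
}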
@@ -86,7 +86,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_mul", @@ -108,7 +108,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, \ const std::vector& outputShape, \ const std::vector& outputStrides, \ - const c10::optional& outputType, \ + const std::optional& outputType, \ at::Device device) { \ return computeScalar( \ "aten_#op_name", \ @@ -131,7 +131,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeScalar( "aten_div", @@ -155,7 +155,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, \ const std::vector& outputShape, \ const std::vector& outputStrides, \ - const c10::optional& outputType, \ + const std::optional& outputType, \ at::Device device) { \ return computeScalar( \ "aten_#op_name", \ @@ -179,7 +179,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, \ const std::vector& outputShape, \ const std::vector& outputStrides, \ - const c10::optional& outputType, \ + const std::optional& outputType, \ at::Device device) { \ return computeScalar( \ "aten_#op_name", \ @@ -204,7 +204,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, \ const std::vector& outputShape, \ const std::vector& outputStrides, \ - const c10::optional& outputType, \ + const std::optional& outputType, \ at::Device device) { \ return computeScalar( \ "aten_#op_name", \ @@ -225,7 +225,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_div", @@ -245,7 +245,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_and", @@ -264,7 +264,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_or", @@ -283,7 +283,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_xor", @@ -302,7 +302,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_lshift", @@ -321,7 +321,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_rshift", @@ -340,7 +340,7 @@ int nnc_lowerings_lazy_registration() { 
[](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_eq", @@ -359,7 +359,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_ne", @@ -378,7 +378,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_ge", @@ -397,7 +397,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_gt", @@ -416,7 +416,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_le", @@ -435,7 +435,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_lt", @@ -453,7 +453,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_min", @@ -471,7 +471,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_max", @@ -490,7 +490,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeThreeOperand( "aten_masked_fill", @@ -513,7 +513,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { bool noMin = false; bool noMax = false; @@ -561,7 +561,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeFourOperand( "aten_addcmul", @@ -580,7 +580,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { // check if the activation is quantized const BufHandle& x = std::get(inputs[0]); @@ -604,7 +604,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const 
std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_silu", @@ -620,7 +620,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_reciprocal", @@ -636,7 +636,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_neg", @@ -652,7 +652,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_isnan", @@ -673,7 +673,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { auto A = std::get(inputs[0]); if (A.node()->qscale()) { @@ -697,7 +697,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_leaky_relu", @@ -719,7 +719,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_relu6", @@ -739,7 +739,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { const auto& kApproximate = std::get(inputs[1]); std::vector operands = {inputs.front()}; @@ -787,7 +787,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_log", @@ -805,7 +805,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_log10", @@ -823,7 +823,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_log1p", @@ -841,7 +841,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_log2", @@ -859,7 +859,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const 
c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_exp", @@ -877,7 +877,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_expm1", @@ -895,7 +895,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_erf", @@ -913,7 +913,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_erfc", @@ -931,7 +931,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_cos", @@ -949,7 +949,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_sin", @@ -967,7 +967,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_tan", @@ -985,7 +985,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { const BufHandle& rhs = std::get(inputs[1]); auto dtype = rhs.dtype(); @@ -1005,7 +1005,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_pow", @@ -1050,7 +1050,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_fmod", @@ -1069,7 +1069,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeThreeOperand( "aten_lerp", @@ -1089,7 +1089,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { auto imodImpl = [](const ExprHandle& lhs, const ExprHandle& rhs) { return Mod::make(lhs, rhs); @@ -1137,7 +1137,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + 
const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_acos", @@ -1155,7 +1155,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_asin", @@ -1173,7 +1173,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_cosh", @@ -1191,7 +1191,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_sinh", @@ -1209,7 +1209,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_atan", @@ -1227,7 +1227,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_atan2", @@ -1247,7 +1247,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_tanh", @@ -1265,7 +1265,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeThreeOperand( "aten_hardtanh", @@ -1286,7 +1286,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeThreeOperand( "aten_softplus", @@ -1314,7 +1314,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_mish", @@ -1333,7 +1333,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeFourOperand( "aten_elu", @@ -1366,7 +1366,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_hardsigmoid", @@ -1387,7 +1387,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return 
computeOneOperand( "aten_hardswish", @@ -1410,7 +1410,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_hardshrink", @@ -1433,7 +1433,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_sqrt", @@ -1451,7 +1451,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_rsqrt", @@ -1469,7 +1469,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_abs", @@ -1488,7 +1488,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeSign(inputs, outputShape); }); RegisterNNCLoweringsFunction aten_ceil( @@ -1496,7 +1496,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_ceil", @@ -1512,7 +1512,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_floor", @@ -1528,7 +1528,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_round", @@ -1544,7 +1544,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_trunc", @@ -1560,7 +1560,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_cast_float", @@ -1582,7 +1582,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { // see handling of aten::to in tensorexpr_fuser.cpp for why we only // need to handle the first input @@ -1604,7 +1604,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& 
outputType, at::Device device) { return computeThreeOperand( "aten_threshold", @@ -1628,7 +1628,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeConditionWithTwoOperand( "aten_where", @@ -1646,7 +1646,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_frac", @@ -1666,7 +1666,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_lgamma", @@ -1684,7 +1684,7 @@ int nnc_lowerings_lazy_registration() { // {"aten::rand_like"}, // [](const std::vector& inputs, // const std::vector& outputShape, - // const c10::optional& outputType, + // const std::optional& outputType, // at::Device device) { // return computeOneOperand( // "aten_rand_like", @@ -1701,7 +1701,7 @@ int nnc_lowerings_lazy_registration() { // {"aten::slice"}, // [](const std::vector& inputs, // const std::vector& outputShape, - // const c10::optional& outputType, + // const std::optional& outputType, // at::Device device) { // return Compute( // "aten_slice", @@ -1723,7 +1723,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return Compute( "aten_unsqueeze", @@ -1757,7 +1757,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTranspose( {inputs[0], (int64_t)1, (int64_t)0}, @@ -1774,7 +1774,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { auto A = std::get(inputs[0]); // Trivial case of 0-dim tensors: just a copy of the input @@ -1848,7 +1848,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeSoftmax(inputs, outputShape, outputStrides, false); }); @@ -1858,7 +1858,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeSoftmax(inputs, outputShape, outputStrides, true); }); @@ -1892,7 +1892,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { auto add_lambda = [](const ExprHandle& lhs, const ExprHandle& rhs) { return boolToInteger(lhs) + boolToInteger(rhs); diff --git a/torch/csrc/jit/tensorexpr/lowerings.h 
b/torch/csrc/jit/tensorexpr/lowerings.h index 6d8b2c433ae37..da22899ba28ce 100644 --- a/torch/csrc/jit/tensorexpr/lowerings.h +++ b/torch/csrc/jit/tensorexpr/lowerings.h @@ -32,7 +32,7 @@ using NNCLoweringFunction = std::function&, const std::vector&, const std::vector&, - const c10::optional&, + const std::optional&, at::Device)>; TORCH_API FunctionSchemaMap& getNNCLoweringRegistry(); diff --git a/torch/csrc/jit/tensorexpr/operators/conv2d.cpp b/torch/csrc/jit/tensorexpr/operators/conv2d.cpp index 3f29dad4c13f3..bdf313f0ad051 100644 --- a/torch/csrc/jit/tensorexpr/operators/conv2d.cpp +++ b/torch/csrc/jit/tensorexpr/operators/conv2d.cpp @@ -353,7 +353,7 @@ Tensor computeConv2d( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { Dtype dtype = kFloat; if (outputType) { @@ -401,7 +401,7 @@ Tensor computeConv1d( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { Dtype dtype = kFloat; if (outputType) { @@ -435,7 +435,7 @@ Tensor computePrepackedConv2dClampRun( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { Dtype dtype = kFloat; if (outputType) { @@ -454,7 +454,7 @@ Tensor computePrepackedLinearClampRun( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { Dtype dtype = kFloat; if (outputType) { @@ -473,7 +473,7 @@ Tensor computeMkldnnPrepackedConvRun( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { Dtype dtype = kFloat; if (outputType) { diff --git a/torch/csrc/jit/tensorexpr/operators/conv2d.h b/torch/csrc/jit/tensorexpr/operators/conv2d.h index 65902960192ab..f842a1350a551 100644 --- a/torch/csrc/jit/tensorexpr/operators/conv2d.h +++ b/torch/csrc/jit/tensorexpr/operators/conv2d.h @@ -74,31 +74,31 @@ Tensor computeConv2d( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); Tensor computeConv1d( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); Tensor computePrepackedConv2dClampRun( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); Tensor computePrepackedLinearClampRun( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); Tensor computeMkldnnPrepackedConvRun( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); } // namespace tensorexpr } // namespace jit diff --git a/torch/csrc/jit/tensorexpr/operators/matmul.cpp b/torch/csrc/jit/tensorexpr/operators/matmul.cpp index 
38b420a7aca1c..92c6c14519325 100644 --- a/torch/csrc/jit/tensorexpr/operators/matmul.cpp +++ b/torch/csrc/jit/tensorexpr/operators/matmul.cpp @@ -9,7 +9,7 @@ Tensor computeMatmul( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { Dtype dtype = kFloat; if (outputType) { @@ -56,7 +56,7 @@ Tensor computeAddMM( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { Dtype dtype = kFloat; if (outputType) { diff --git a/torch/csrc/jit/tensorexpr/operators/matmul.h b/torch/csrc/jit/tensorexpr/operators/matmul.h index 70f3f4bf7bf03..40ef3cfd9b619 100644 --- a/torch/csrc/jit/tensorexpr/operators/matmul.h +++ b/torch/csrc/jit/tensorexpr/operators/matmul.h @@ -10,13 +10,13 @@ Tensor computeMatmul( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); Tensor computeAddMM( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); } // namespace tensorexpr diff --git a/torch/csrc/jit/tensorexpr/operators/misc.cpp b/torch/csrc/jit/tensorexpr/operators/misc.cpp index c282787485ea4..70991f6db1f4c 100644 --- a/torch/csrc/jit/tensorexpr/operators/misc.cpp +++ b/torch/csrc/jit/tensorexpr/operators/misc.cpp @@ -136,7 +136,7 @@ ExprHandle promoteIntegerToDefaultType(const ExprHandle& e) { ExprHandle demoteOutput( const ExprHandle& e, - const c10::optional type) { + const std::optional type) { if (!type.has_value()) { return e; } @@ -160,7 +160,7 @@ ExprHandle demoteOutput( return e; } -c10::optional getTensorInfo(BufHandle b) { +std::optional getTensorInfo(BufHandle b) { std::vector dims; for (auto dim : b.dims()) { auto val = intValue(dim.node()); @@ -321,7 +321,7 @@ Tensor computeChunk( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return Compute( "prim_constantchunk", @@ -355,7 +355,7 @@ Tensor computeTranspose( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { auto A = std::get(inputs[0]); // Trivial case of 0-dim and 1-dim tensors: transpose is just a copy @@ -382,7 +382,7 @@ Tensor computeExpand( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { auto A = std::get(inputs[0]); return Compute( @@ -396,7 +396,7 @@ Tensor computeReshape( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { auto A = std::get(inputs[0]); if (A.ndim() == 0) { @@ -464,7 +464,7 @@ Tensor computeFlatten( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { std::vector outputShapeVec; for (const auto dim : c10::irange(outputShape.size())) { @@ -622,7 +622,7 @@ Tensor 
computeCat( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { if (device == at::kCPU && getCatWoConditionals()) { return computeCatWoConditionals(inputs, outputShape, outputStrides); @@ -685,7 +685,7 @@ Tensor computeEmbedding( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { Dtype dtype = kFloat; if (outputType) { diff --git a/torch/csrc/jit/tensorexpr/operators/misc.h b/torch/csrc/jit/tensorexpr/operators/misc.h index 5650b35147b17..50f53b0b50d07 100644 --- a/torch/csrc/jit/tensorexpr/operators/misc.h +++ b/torch/csrc/jit/tensorexpr/operators/misc.h @@ -12,7 +12,7 @@ struct TensorInfo { std::vector dims; c10::ScalarType dtype; }; -c10::optional getTensorInfo(BufHandle b); +std::optional getTensorInfo(BufHandle b); int64_t normalizeAndCheckIndex(int64_t idx, int64_t list_size); @@ -26,7 +26,7 @@ ExprHandle promoteIntegerToDefaultType(const ExprHandle& e); ExprHandle promoteHalfToFloat(const ExprHandle& e); ExprHandle demoteOutput( const ExprHandle& e, - const c10::optional type); + const std::optional type); std::vector broadcastShapes( std::vector> shapes); @@ -51,31 +51,31 @@ Tensor computeChunk( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); Tensor computeTranspose( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); Tensor computeExpand( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); Tensor computeReshape( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); Tensor computeFlatten( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); Tensor computeCatWoConditionals( const std::vector& inputs, @@ -84,13 +84,13 @@ Tensor computeCat( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); Tensor computeEmbedding( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); } // namespace tensorexpr diff --git a/torch/csrc/jit/tensorexpr/operators/norm.cpp b/torch/csrc/jit/tensorexpr/operators/norm.cpp index 335cfae05f4d4..c87a931d1fc43 100644 --- a/torch/csrc/jit/tensorexpr/operators/norm.cpp +++ b/torch/csrc/jit/tensorexpr/operators/norm.cpp @@ -9,7 +9,7 @@ Tensor computeBatchNorm( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { bool hasWeight = true; bool hasBias = true; diff --git a/torch/csrc/jit/tensorexpr/operators/norm.h b/torch/csrc/jit/tensorexpr/operators/norm.h index 
7c8cc43387b01..dbe6140cca8b4 100644 --- a/torch/csrc/jit/tensorexpr/operators/norm.h +++ b/torch/csrc/jit/tensorexpr/operators/norm.h @@ -10,7 +10,7 @@ Tensor computeBatchNorm( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); } // namespace tensorexpr diff --git a/torch/csrc/jit/tensorexpr/operators/pointwise.cpp b/torch/csrc/jit/tensorexpr/operators/pointwise.cpp index 57c63fcd92391..19aad4d015e27 100644 --- a/torch/csrc/jit/tensorexpr/operators/pointwise.cpp +++ b/torch/csrc/jit/tensorexpr/operators/pointwise.cpp @@ -10,7 +10,7 @@ using namespace torch::jit::tensorexpr; Tensor computeSign( const std::vector& inputValues, const std::vector& outputShape, - c10::optional> outputStrides) { + std::optional> outputStrides) { return Compute( "aten_sign", outputShape, outputStrides, [&](ParameterList& axes) { std::vector indices(axes.begin(), axes.end()); @@ -28,7 +28,7 @@ Tensor computeOneOperand( const std::vector& inputValues, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, const std::function& innerExpr, const int checkParamTypes) { return Compute( @@ -51,7 +51,7 @@ Tensor computeTwoOperand( const std::vector& inputValues, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, const std::function& innerExpr) { return Compute( @@ -76,7 +76,7 @@ Tensor computeTwoOperandWithAlpha( const std::vector& inputValues, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, const std::function& innerExpr) { return Compute( @@ -102,7 +102,7 @@ Tensor computeConditionWithTwoOperand( const std::vector& inputValues, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, const std::function< ExprHandle(const ExprHandle&, const ExprHandle&, const ExprHandle&)>& innerExpr) { @@ -131,7 +131,7 @@ Tensor computeThreeOperand( const std::vector& inputValues, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, const std::function< ExprHandle(const ExprHandle&, const ExprHandle&, const ExprHandle&)>& innerExpr, @@ -161,7 +161,7 @@ Tensor computeFourOperand( const std::vector& inputValues, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, const std::function& inputValues, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "copy", @@ -207,7 +207,7 @@ Tensor computeScalar( const std::vector& inputValues, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, const std::function& innerExpr) { auto dt = Dtype(*outputType); diff --git a/torch/csrc/jit/tensorexpr/operators/pointwise.h b/torch/csrc/jit/tensorexpr/operators/pointwise.h index 8de218dbb0383..0ce10424b3d30 100644 --- a/torch/csrc/jit/tensorexpr/operators/pointwise.h +++ b/torch/csrc/jit/tensorexpr/operators/pointwise.h @@ -9,14 +9,14 @@ namespace tensorexpr { TORCH_API Tensor computeSign( const std::vector& inputs, const 
std::vector& outputShape, - c10::optional> outputStrides = c10::nullopt); + std::optional> outputStrides = c10::nullopt); Tensor computeOneOperand( const std::string& name, const std::vector& inputValues, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, const std::function& innerExpr, const int checkParamTypes = kAllTypes); Tensor computeTwoOperand( @@ -24,7 +24,7 @@ Tensor computeTwoOperand( const std::vector& inputValues, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, const std::function& innerExpr); Tensor computeTwoOperandWithAlpha( @@ -32,7 +32,7 @@ Tensor computeTwoOperandWithAlpha( const std::vector& inputValues, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, const std::function& innerExpr); Tensor computeConditionWithTwoOperand( @@ -40,7 +40,7 @@ Tensor computeConditionWithTwoOperand( const std::vector& inputValues, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, const std::function< ExprHandle(const ExprHandle&, const ExprHandle&, const ExprHandle&)>& innerExpr); @@ -49,7 +49,7 @@ Tensor computeThreeOperand( const std::vector& inputValues, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, const std::function< ExprHandle(const ExprHandle&, const ExprHandle&, const ExprHandle&)>& innerExpr, @@ -59,7 +59,7 @@ Tensor computeFourOperand( const std::vector& inputValues, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, const std::function& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); Tensor computeScalar( @@ -77,7 +77,7 @@ Tensor computeScalar( const std::vector& inputValues, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, const std::function& innerExpr); diff --git a/torch/csrc/jit/tensorexpr/operators/quantization.cpp b/torch/csrc/jit/tensorexpr/operators/quantization.cpp index da6d43cbb7aa9..66c0688538a1d 100644 --- a/torch/csrc/jit/tensorexpr/operators/quantization.cpp +++ b/torch/csrc/jit/tensorexpr/operators/quantization.cpp @@ -141,7 +141,7 @@ Tensor computeQuantizePerTensor( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional&, + const std::optional&, at::Device) { std::vector vars; std::vector indices; @@ -181,7 +181,7 @@ Tensor computeQuantizedAdd( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device) { const BufHandle& QA = std::get(inputs[0]); const BufHandle& QB = std::get(inputs[1]); @@ -225,7 +225,7 @@ Tensor computeQuantizePerTensorExternalCall( const std::vector& outputShape, const std::vector& outputStrides, // NOLINTNEXTLINE - const c10::optional& outputType, + const std::optional& outputType, at::Device) { const BufHandle& x = std::get(inputs[0]); const auto qscale = std::get(inputs[1]); @@ -257,7 +257,7 @@ Tensor computeDequantizeExternalCall( const std::vector& 
inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device) { Dtype dtype = kFloat; if (outputType) { @@ -282,7 +282,7 @@ Tensor computeQuantizedConv2dPrepack( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device) { Dtype dtype = kFloat; if (outputType) { @@ -332,7 +332,7 @@ Tensor computeQuantizedConv1d( const std::vector& outputShape, const std::vector& outputStrides, // NOLINTNEXTLINE - const c10::optional& outputType, + const std::optional& outputType, // NOLINTNEXTLINE at::Device device) { const BufHandle& qx = std::get(inputs[0]); @@ -364,7 +364,7 @@ Tensor computeQuantizedConv2d( const std::vector& outputShape, const std::vector& outputStrides, // NOLINTNEXTLINE - const c10::optional& outputType, + const std::optional& outputType, // NOLINTNEXTLINE at::Device device) { const BufHandle& qx = std::get(inputs[0]); @@ -396,7 +396,7 @@ Tensor computeQuantizedConv2dRelu( const std::vector& outputShape, const std::vector& outputStrides, // NOLINTNEXTLINE - const c10::optional& outputType, + const std::optional& outputType, // NOLINTNEXTLINE at::Device device) { const BufHandle& qx = std::get(inputs[0]); @@ -428,7 +428,7 @@ Tensor computeQuantizedLinear( const std::vector& outputShape, const std::vector& outputStrides, // NOLINTNEXTLINE - const c10::optional& outputType, + const std::optional& outputType, // NOLINTNEXTLINE at::Device device) { const BufHandle& qx = std::get(inputs[0]); @@ -460,7 +460,7 @@ Tensor computeQuantizedLinearRelu( const std::vector& outputShape, const std::vector& outputStrides, // NOLINTNEXTLINE - const c10::optional& outputType, + const std::optional& outputType, // NOLINTNEXTLINE at::Device device) { const BufHandle& qx = std::get(inputs[0]); @@ -492,7 +492,7 @@ Tensor computeQuantizedAddExternalCall( const std::vector& outputShape, const std::vector& outputStrides, // NOLINTNEXTLINE - const c10::optional& outputType, + const std::optional& outputType, // NOLINTNEXTLINE at::Device device) { const BufHandle& qa = std::get(inputs[0]); @@ -536,7 +536,7 @@ Tensor computeQuantizedMul( const std::vector& outputShape, const std::vector& outputStrides, // NOLINTNEXTLINE - const c10::optional& outputType, + const std::optional& outputType, // NOLINTNEXTLINE at::Device device) { const BufHandle& qa = std::get(inputs[0]); @@ -567,7 +567,7 @@ Tensor computeQuantizedMulScalar( const std::vector& outputShape, const std::vector& outputStrides, // NOLINTNEXTLINE - const c10::optional& outputType, + const std::optional& outputType, // NOLINTNEXTLINE at::Device device) { const BufHandle& qa = std::get(inputs[0]); @@ -594,7 +594,7 @@ Tensor computeQuantizedRelu( const std::vector& outputShape, const std::vector& outputStrides, // NOLINTNEXTLINE - const c10::optional& outputType, + const std::optional& outputType, // NOLINTNEXTLINE at::Device device) { const BufHandle& qa = std::get(inputs[0]); @@ -625,7 +625,7 @@ Tensor computeQuantizedCat( const std::vector& outputShape, const std::vector& outputStrides, // NOLINTNEXTLINE - const c10::optional& outputType, + const std::optional& outputType, // NOLINTNEXTLINE at::Device device) { // NOLINTNEXTLINE(performance-unnecessary-copy-initialization) @@ -663,7 +663,7 @@ Tensor computeDequantize( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + 
const std::optional& outputType, at::Device) { Dtype dtype = kFloat; if (outputType) { @@ -695,7 +695,7 @@ Tensor computeUpsampleNearest2d( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device) { auto A = std::get(inputs[0]); const auto& output_height = outputShape[2]; @@ -742,7 +742,7 @@ Tensor computeUpsampleNearest2dExternalCall( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device) { Dtype dtype = kFloat; if (outputType) { @@ -802,7 +802,7 @@ Tensor computeQuantizedSigmoidExternalCall( const std::vector& outputShape, const std::vector& outputStrides, // NOLINTNEXTLINE - const c10::optional& outputType, + const std::optional& outputType, at::Device) { const BufHandle& qx = std::get(inputs[0]); diff --git a/torch/csrc/jit/tensorexpr/operators/quantization.h b/torch/csrc/jit/tensorexpr/operators/quantization.h index 019b2349b1840..d48c9e3273ba0 100644 --- a/torch/csrc/jit/tensorexpr/operators/quantization.h +++ b/torch/csrc/jit/tensorexpr/operators/quantization.h @@ -20,140 +20,140 @@ TORCH_API Tensor computeQuantizePerTensor( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeQuantizePerTensorExternalCall( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeQuantizedConv1d( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeQuantizedConv2dPrepack( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeQuantizedConv1d( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeQuantizedConv2d( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeQuantizedConv2dRelu( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeQuantizedLinear( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeQuantizedLinearRelu( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeQuantizedAdd( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); Tensor computeQuantizedAddExternalCall( 
const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeQuantizedMul( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeQuantizedMulScalar( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeQuantizedCat( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeQuantizedRelu( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeDequantize( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeDequantizeExternalCall( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeUpsampleNearest2d( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeUpsampleNearest2dExternalCall( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeQuantizedSigmoidExternalCall( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device); } // namespace tensorexpr } // namespace jit diff --git a/torch/csrc/jit/tensorexpr/operators/reduction.cpp b/torch/csrc/jit/tensorexpr/operators/reduction.cpp index dfd6e2d01adf5..b5f53560c9be3 100644 --- a/torch/csrc/jit/tensorexpr/operators/reduction.cpp +++ b/torch/csrc/jit/tensorexpr/operators/reduction.cpp @@ -23,7 +23,7 @@ Tensor computeSum( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { std::vector axes; bool keepdim = false; @@ -108,7 +108,7 @@ Tensor computeMean( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { Dtype dtype = kFloat; if (outputType) { @@ -140,7 +140,7 @@ Tensor computeMax( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { Dtype dtype = kFloat; if (outputType) { @@ -164,7 +164,7 @@ Tensor computeAdaptiveAvgPool2d( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) 
{ Dtype dtype = kFloat; if (outputType) { diff --git a/torch/csrc/jit/tensorexpr/operators/reduction.h b/torch/csrc/jit/tensorexpr/operators/reduction.h index 6265c4d265858..7d25e14a171ce 100644 --- a/torch/csrc/jit/tensorexpr/operators/reduction.h +++ b/torch/csrc/jit/tensorexpr/operators/reduction.h @@ -10,25 +10,25 @@ TORCH_API Tensor computeSum( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeMean( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeAdaptiveAvgPool2d( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); Tensor computeMax( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); } // namespace tensorexpr diff --git a/torch/csrc/jit/tensorexpr/tensor.cpp b/torch/csrc/jit/tensorexpr/tensor.cpp index 746a9a8cd1f0b..5bc734bb80b83 100644 --- a/torch/csrc/jit/tensorexpr/tensor.cpp +++ b/torch/csrc/jit/tensorexpr/tensor.cpp @@ -99,7 +99,7 @@ StmtPtr Tensor::constructStmt( Tensor Compute( const std::string& name, const std::vector& dims, - c10::optional> strides, + std::optional> strides, const std::function&)>& body_func) { std::vector args = create_index_vars(dims); ExprHandle body = body_func(args); @@ -116,7 +116,7 @@ Tensor Compute( Tensor Compute( const std::string& name, const std::vector& dims, - c10::optional> strides, + std::optional> strides, const std::function& body_func) { if (dims.size() != 1) { throw malformed_input("mismatch between body and arg size (1)"); @@ -137,7 +137,7 @@ Tensor Compute( Tensor Compute( const std::string& name, const std::vector& dims, - c10::optional> strides, + std::optional> strides, const std::function& body_func) { if (dims.size() != 2) { @@ -159,7 +159,7 @@ Tensor Compute( Tensor Compute( const std::string& name, const std::vector& dims, - c10::optional> strides, + std::optional> strides, const std::function< ExprHandle(const VarHandle&, const VarHandle&, const VarHandle&)>& body_func) { @@ -183,7 +183,7 @@ Tensor Compute( Tensor Compute( const std::string& name, const std::vector& dims, - c10::optional> strides, + std::optional> strides, const std::function& dims, - c10::optional> strides, + std::optional> strides, const Reducer& reducer, const BufHandle& buffer, const std::vector& reduce_dims) { @@ -235,7 +235,7 @@ Tensor Reduce( Tensor Reduce( const std::string& name, const std::vector& dims, - c10::optional> strides, + std::optional> strides, const Reducer& reducer, Tensor tensor, const std::vector& reduce_dims) { diff --git a/torch/csrc/jit/tensorexpr/tensor.h b/torch/csrc/jit/tensorexpr/tensor.h index 698de07f2be54..7b589d0974b37 100644 --- a/torch/csrc/jit/tensorexpr/tensor.h +++ b/torch/csrc/jit/tensorexpr/tensor.h @@ -75,7 +75,7 @@ class TORCH_API Tensor { TORCH_API Tensor Compute( const std::string& func_name, const std::vector& dims, - c10::optional> strides, + std::optional> strides, const std::function& body_func); TORCH_API Tensor Compute( const std::string& func_name, @@ -84,7 +84,7 @@ TORCH_API Tensor Compute( TORCH_API Tensor Compute( const std::string& 
func_name, const std::vector& dims, - c10::optional> strides, + std::optional> strides, const std::function& body_func); TORCH_API Tensor Compute( @@ -95,7 +95,7 @@ TORCH_API Tensor Compute( TORCH_API Tensor Compute( const std::string& func_name, const std::vector& dims, - c10::optional> strides, + std::optional> strides, const std::function< ExprHandle(const VarHandle&, const VarHandle&, const VarHandle&)>& body_func); @@ -108,7 +108,7 @@ TORCH_API Tensor Compute( TORCH_API Tensor Compute( const std::string& func_name, const std::vector& dims, - c10::optional> strides, + std::optional> strides, const std::function& dims, - c10::optional> strides, + std::optional> strides, const std::function&)>& body_func); TORCH_API Tensor Compute( const std::string& func_name, @@ -148,7 +148,7 @@ template Tensor Reduce( const std::string& func_name, const std::vector& dims, - c10::optional> strides, + std::optional> strides, const Reducer& reducer, const InitFunc& init_func, const BodyFunc& body_func, @@ -217,7 +217,7 @@ template Tensor Reduce( const std::string& func_name, const std::vector& dims, - c10::optional> strides, + std::optional> strides, const Reducer& reducer, const BodyFunc& body_func, const std::vector& reduce_dims) { @@ -246,7 +246,7 @@ template Tensor Reduce( const std::string& func_name, const std::vector& dims, - c10::optional> strides, + std::optional> strides, const Reducer& reducer, const BodyFunc&& body_func, const std::vector& reduce_dims) { @@ -265,7 +265,7 @@ Tensor Reduce( TORCH_API Tensor Reduce( const std::string& name, const std::vector& dims, - c10::optional> strides, + std::optional> strides, const Reducer& reducer, const BufHandle& buffer, const std::vector& reduce_dims); @@ -281,7 +281,7 @@ TORCH_API Tensor Reduce( TORCH_API Tensor Reduce( const std::string& func_name, const std::vector& dims, - c10::optional> strides, + std::optional> strides, const Reducer& reducer, Tensor tensor, const std::vector& reduce_dims); diff --git a/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp b/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp index f6e0b270c92ca..204326dc03e21 100644 --- a/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp +++ b/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp @@ -936,13 +936,13 @@ void initTensorExprBindings(PyObject* module) { &tensorexpr::replaceListOutputWithTuple); te.def("trim_graph", &tensorexpr::trimGraph); #ifdef TORCH_ENABLE_LLVM - te.def("set_llvm_target_triple", [](const c10::optional& val) { + te.def("set_llvm_target_triple", [](const std::optional& val) { tensorexpr::LLVMTargetTriple() = val; }); - te.def("set_llvm_target_cpu", [](const c10::optional& val) { + te.def("set_llvm_target_cpu", [](const std::optional& val) { tensorexpr::LLVMTargetCPU() = val; }); - te.def("set_llvm_target_attrs", [](const c10::optional& val) { + te.def("set_llvm_target_attrs", [](const std::optional& val) { tensorexpr::LLVMTargetAttrs() = val; }); te.def("set_llvm_aot_workflow", [](bool val) { diff --git a/torch/csrc/jit/testing/file_check.cpp b/torch/csrc/jit/testing/file_check.cpp index e1f87fccf7266..ec0011f40d775 100644 --- a/torch/csrc/jit/testing/file_check.cpp +++ b/torch/csrc/jit/testing/file_check.cpp @@ -43,17 +43,17 @@ struct Check { Check( CheckType type, std::string str, - c10::optional count = c10::nullopt) + std::optional count = c10::nullopt) : type_(type), count_(count), search_str_(std::move(str)) {} Check( CheckType type, c10::string_view str, - c10::optional count = c10::nullopt) + std::optional count = c10::nullopt) : Check(type, std::string(str.begin(), 
str.end()), count) {} CheckType type_; - c10::optional count_; + std::optional count_; const std::string search_str_; friend std::ostream& operator<<(std::ostream& out, const Check& c); @@ -234,7 +234,7 @@ struct FileCheckImpl { TORCH_API void addCheck( CheckType type, const std::string& s, - c10::optional count = c10::nullopt) { + std::optional count = c10::nullopt) { addCheck(Check(type, s, count)); } @@ -264,7 +264,7 @@ struct FileCheckImpl { } size_t end_check_string = suffix_pos + check_suffix.size(); CheckType type = check_pair.first; - c10::optional count = c10::nullopt; + std::optional count = c10::nullopt; auto end_line = source->text_str().find("\n", end_check_string); bool exactly = false; if (type == CHECK_COUNT) { diff --git a/torch/csrc/lazy/backend/backend_device.cpp b/torch/csrc/lazy/backend/backend_device.cpp index eaf3d6b28c07c..6d146ca0881ce 100644 --- a/torch/csrc/lazy/backend/backend_device.cpp +++ b/torch/csrc/lazy/backend/backend_device.cpp @@ -54,7 +54,7 @@ c10::Device backendDeviceToAtenDevice(const BackendDevice& device) { return c10::Device(at::kLazy, device.ordinal()); } -c10::optional GetBackendDevice(at::ITensorListRef tensors) { +std::optional GetBackendDevice(at::ITensorListRef tensors) { for (auto& tensor : tensors) { if (auto lt = TryGetLtcTensor(tensor)) { return lt->GetDevice(); @@ -63,26 +63,26 @@ c10::optional GetBackendDevice(at::ITensorListRef tensors) { return c10::nullopt; } -c10::optional GetBackendDevice(at::TensorList tensors) { +std::optional GetBackendDevice(at::TensorList tensors) { return GetBackendDevice(at::ITensorListRef(tensors)); } -c10::optional GetBackendDevice(const at::Tensor& tensor) { +std::optional GetBackendDevice(const at::Tensor& tensor) { if (auto lt = TryGetLtcTensor(tensor)) { return lt->GetDevice(); } return c10::nullopt; } -c10::optional GetBackendDevice( - const c10::optional& device) { +std::optional GetBackendDevice( + const std::optional& device) { if (device) { return c10::make_optional(atenDeviceToBackendDevice(*device)); } return c10::nullopt; } -c10::optional GetBackendDevice() { +std::optional GetBackendDevice() { return c10::nullopt; } diff --git a/torch/csrc/lazy/backend/backend_device.h b/torch/csrc/lazy/backend/backend_device.h index 4c239d1e4b71c..e80c800a2ecea 100644 --- a/torch/csrc/lazy/backend/backend_device.h +++ b/torch/csrc/lazy/backend/backend_device.h @@ -73,20 +73,20 @@ TORCH_API c10::Device backendDeviceToAtenDevice(const BackendDevice& device); // Tries to extract the backend device out of the lazy tensor. Returns nullopt // if the input is not a lazy tensor. -TORCH_API c10::optional GetBackendDevice( +TORCH_API std::optional GetBackendDevice( const at::ITensorListRef tensors); -TORCH_API c10::optional GetBackendDevice( +TORCH_API std::optional GetBackendDevice( const at::TensorList tensors); -TORCH_API c10::optional GetBackendDevice( +TORCH_API std::optional GetBackendDevice( const at::Tensor& tensor); -TORCH_API c10::optional GetBackendDevice( - const c10::optional& device); +TORCH_API std::optional GetBackendDevice( + const std::optional& device); // For variadic template. -TORCH_API c10::optional GetBackendDevice(); +TORCH_API std::optional GetBackendDevice(); template -c10::optional GetBackendDevice( +std::optional GetBackendDevice( const T& tensor, const Args&... 
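In the FileCheck changes above, Check::count_ becomes a std::optional that defaults to nullopt, distinguishing a plain CHECK (any single match) from CHECK-COUNT-n. A small standalone sketch of that convention; the helper below is illustrative, not the torch::jit::testing API.

    // Illustrative only: nullopt means "no count constraint".
    #include <iostream>
    #include <optional>
    #include <string>

    std::string describe(const std::string& pattern, std::optional<size_t> count) {
      if (!count) {
        return "CHECK: " + pattern;  // plain check
      }
      return "CHECK-COUNT-" + std::to_string(*count) + ": " + pattern;
    }

    int main() {
      std::cout << describe("foo", std::nullopt) << '\n';  // CHECK: foo
      std::cout << describe("bar", 3) << '\n';             // CHECK-COUNT-3: bar
    }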
forward_tensors) { auto optional_device = GetBackendDevice(tensor); diff --git a/torch/csrc/lazy/backend/backend_interface.h b/torch/csrc/lazy/backend/backend_interface.h index f94d3b602e52c..366311921c394 100644 --- a/torch/csrc/lazy/backend/backend_interface.h +++ b/torch/csrc/lazy/backend/backend_interface.h @@ -63,7 +63,7 @@ class TORCH_API BackendImplInterface { virtual at::Tensor MakeTensorFromComputationData( const BackendDataPtr data, - c10::optional logical_scalar_type) const = 0; + std::optional logical_scalar_type) const = 0; /** * Lowering, Compilation, Execution diff --git a/torch/csrc/lazy/core/hash.h b/torch/csrc/lazy/core/hash.h index bb6a779555f22..19f57546c9a43 100644 --- a/torch/csrc/lazy/core/hash.h +++ b/torch/csrc/lazy/core/hash.h @@ -135,6 +135,12 @@ static inline hash_t TensorHash(const at::Tensor& tensor) { return DataHash(ctensor.const_data_ptr>(), size); case at::ScalarType::ComplexDouble: return DataHash(ctensor.const_data_ptr>(), size); + case at::ScalarType::UInt16: + return DataHash(ctensor.const_data_ptr(), size); + case at::ScalarType::UInt32: + return DataHash(ctensor.const_data_ptr(), size); + case at::ScalarType::UInt64: + return DataHash(ctensor.const_data_ptr(), size); default: TORCH_INTERNAL_ASSERT( false, "Unsupported scalar type:", ctensor.scalar_type()); @@ -163,11 +169,11 @@ static inline hash_t Hash(const at::Generator& value) { // repeatedly hash a constant at runtime. static const int64_t kNullOpt = 0x8655d738f3678dda; -// Hashing for c10::optional types contributes to hash +// Hashing for std::optional types contributes to hash // for optionals with null value, important to distinguish // between and cases template -hash_t Hash(const c10::optional& value) { +hash_t Hash(const std::optional& value) { if (value.has_value()) { return Hash(value.value()); } else { @@ -187,7 +193,7 @@ hash_t Hash(const std::vector& values) { // Need a special case for optional? template -hash_t Hash(const c10::optional>& value) { +hash_t Hash(const std::optional>& value) { if (value.has_value()) { return ContainerHash(value.value()); } else { diff --git a/torch/csrc/lazy/core/ir_builder.h b/torch/csrc/lazy/core/ir_builder.h index 3b58d00aace6c..981e166777294 100644 --- a/torch/csrc/lazy/core/ir_builder.h +++ b/torch/csrc/lazy/core/ir_builder.h @@ -61,7 +61,7 @@ struct IrBuilder { virtual NodePtr MakeCast( const Value& input0, const at::ScalarType& dtype, - const c10::optional& stype = c10::nullopt) const = 0; + const std::optional& stype = c10::nullopt) const = 0; virtual NodePtr MakeTensorList(const OpList& inputs) const = 0; virtual NodePtr MakeGeneric( const OpKind& op, @@ -96,7 +96,7 @@ static inline NodePtr MakeExpand( static inline NodePtr MakeCast( const Value& input0, const at::ScalarType& dtype, - const c10::optional& stype = c10::nullopt) { + const std::optional& stype = c10::nullopt) { return getIrBuilder()->MakeCast(input0, dtype, stype); } static inline NodePtr MakeTensorList(const OpList& inputs) { diff --git a/torch/csrc/lazy/core/ir_dump_util.cpp b/torch/csrc/lazy/core/ir_dump_util.cpp index 19cb2ae7b1624..a4fb11761a67c 100644 --- a/torch/csrc/lazy/core/ir_dump_util.cpp +++ b/torch/csrc/lazy/core/ir_dump_util.cpp @@ -28,7 +28,7 @@ std::string::size_type SkipTagSeparator( return node_string.compare(pos, 2, ", ") == 0 ? 
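The Hash overload for std::optional in torch/csrc/lazy/core/hash.h above salts empty optionals with the fixed constant kNullOpt, so hashing an absent value never collides with hashing a present one (e.g. (nullopt, 1) vs. (1)). A standalone sketch of that idea, with std::hash standing in for the lazy hash_t machinery:

    // Sketch, not torch::lazy::Hash: empty optionals contribute a sentinel.
    #include <cstdint>
    #include <functional>
    #include <iostream>
    #include <optional>

    constexpr uint64_t kNullOpt = 0x8655d738f3678dda;  // same sentinel as the diff

    template <typename T>
    uint64_t HashOpt(const std::optional<T>& value) {
      return value.has_value() ? uint64_t(std::hash<T>{}(*value)) : kNullOpt;
    }

    int main() {
      std::cout << HashOpt<int>(std::nullopt) << '\n';
      std::cout << HashOpt<int>(42) << '\n';
    }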
pos + 2 : pos; } -c10::optional ParseAttrTag( +std::optional ParseAttrTag( const std::string& node_string, std::string::size_type pos) { // @lint-ignore-every CLANGTIDY facebook-hte-StdRegexIsAwful @@ -97,7 +97,7 @@ std::unordered_map GetRootsIds( return roots_ids; } -c10::optional GetRootNodeId( +std::optional GetRootNodeId( const Node* node, const std::unordered_map& roots_ids) { auto it = roots_ids.find(node); diff --git a/torch/csrc/lazy/core/lazy_graph_executor.cpp b/torch/csrc/lazy/core/lazy_graph_executor.cpp index afeac5e75e6c3..a2b67c958313a 100644 --- a/torch/csrc/lazy/core/lazy_graph_executor.cpp +++ b/torch/csrc/lazy/core/lazy_graph_executor.cpp @@ -610,7 +610,7 @@ LazyGraphExecutor::SyncTensorCollection LazyGraphExecutor::CollectSyncTensors( } else if (config.force_ltc_data) { // The tensor only has at::Tensor data. We need to queue it for a // device upload. - c10::optional tensor_data = tensors[i]->CurrentTensorData(); + std::optional tensor_data = tensors[i]->CurrentTensorData(); TORCH_CHECK(tensor_data); at_tensors.push_back(*tensor_data); devices.push_back(tensors[i]->GetDevice()); @@ -996,7 +996,7 @@ std::vector LazyGraphExecutor::FetchTensors( ++literals_index; ++sync_index; } else { - c10::optional tensor_data = + std::optional tensor_data = (*tensors)[i]->CurrentTensorData(); if (tensor_data) { results.push_back(*tensor_data); diff --git a/torch/csrc/lazy/core/shape.cpp b/torch/csrc/lazy/core/shape.cpp index 200dd8fac7895..939e2745ed393 100644 --- a/torch/csrc/lazy/core/shape.cpp +++ b/torch/csrc/lazy/core/shape.cpp @@ -13,7 +13,7 @@ namespace lazy { Shape::Shape( at::ScalarType scalar_type, c10::ArrayRef sizes, - c10::optional> is_symbolic) + std::optional> is_symbolic) : scalar_type_(scalar_type), sizes_(sizes.begin(), sizes.end()), is_symbolic_(std::move(is_symbolic)) {} @@ -49,7 +49,7 @@ hash_t Shape::hash(bool bakeInSizes) const { } Shape Shape::with_symbolic_dims( - c10::optional> symbolic_dims) const { + std::optional> symbolic_dims) const { Shape copy = *this; copy.is_symbolic_ = symbolic_dims; return copy; @@ -75,7 +75,7 @@ static c10::SymbolicShape get_symbolic_shape(at::Tensor& tensor) { TORCH_INTERNAL_ASSERT( sizes.size() == is_symbolic->size(), "Dims of two values are not consistent"); - std::vector> symbolic_dims; + std::vector> symbolic_dims; for (size_t i = 0; i < sizes.size(); i++) { if (is_symbolic->at(i)) { symbolic_dims.emplace_back(c10::nullopt); diff --git a/torch/csrc/lazy/core/shape.h b/torch/csrc/lazy/core/shape.h index 1c6b4d5bb3d81..63566619fd149 100644 --- a/torch/csrc/lazy/core/shape.h +++ b/torch/csrc/lazy/core/shape.h @@ -19,7 +19,7 @@ class TORCH_API Shape { Shape( at::ScalarType scalar_type, c10::ArrayRef sizes, - c10::optional> is_symbolic = c10::nullopt); + std::optional> is_symbolic = c10::nullopt); std::string to_string() const; @@ -43,13 +43,13 @@ class TORCH_API Shape { sizes_.at(dim) = size; } - const c10::optional>& is_symbolic() const { + const std::optional>& is_symbolic() const { return is_symbolic_; } // Makes a copy with symbolic dims applied Shape with_symbolic_dims( - c10::optional> symbolic_dims) const; + std::optional> symbolic_dims) const; size_t numel() const; hash_t hash(bool bakeInSizes) const; @@ -64,7 +64,7 @@ class TORCH_API Shape { // Stores which dimmensions are symbolic // If nullopt, either it hasn't been initialized or the symbolic // dimmensions are not calculatable - c10::optional> is_symbolic_ = c10::nullopt; + std::optional> is_symbolic_ = c10::nullopt; }; TORCH_API std::ostream& 
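Shape::with_symbolic_dims above returns a copy of the shape with only the optional symbolic-dimension mask replaced, leaving the original untouched. A simplified stand-in with plain C++ types (not torch::lazy::Shape):

    // Copy-with-modification sketch for an optional member.
    #include <iostream>
    #include <optional>
    #include <vector>

    struct Shape {
      std::vector<long> sizes;
      std::optional<std::vector<bool>> is_symbolic;

      Shape with_symbolic_dims(std::optional<std::vector<bool>> symbolic_dims) const {
        Shape copy = *this;
        copy.is_symbolic = std::move(symbolic_dims);
        return copy;
      }
    };

    int main() {
      Shape s{{2, 3}, std::nullopt};
      Shape t = s.with_symbolic_dims(std::vector<bool>{true, false});
      std::cout << (t.is_symbolic ? "mask set" : "mask unset") << '\n';  // mask set
      std::cout << (s.is_symbolic ? "mask set" : "mask unset") << '\n';  // mask unset
    }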
operator<<(std::ostream& out, const Shape& shape); diff --git a/torch/csrc/lazy/core/tensor.cpp b/torch/csrc/lazy/core/tensor.cpp index 541a0f6f5a070..ba0571f87df4d 100644 --- a/torch/csrc/lazy/core/tensor.cpp +++ b/torch/csrc/lazy/core/tensor.cpp @@ -197,7 +197,7 @@ Value LazyTensor::GetIrValue() const { AssignIrValue(CreateTensorNode(handle, /*read_only=*/false)); return data()->ir_value; } - c10::optional tensor_data = CurrentTensorData(); + std::optional tensor_data = CurrentTensorData(); TORCH_CHECK(tensor_data); AssignIrValue(GetIrValueForTensor(*tensor_data, GetDevice())); return data()->ir_value; @@ -211,7 +211,7 @@ void LazyTensor::SetTensorData(at::Tensor tensor_data) { data()->tensor_data = std::move(tensor_data); } -c10::optional LazyTensor::CurrentTensorData() const { +std::optional LazyTensor::CurrentTensorData() const { return data()->tensor_data; } @@ -236,7 +236,7 @@ Value LazyTensor::GetIrValueForTensor( at::Tensor LazyTensor::ToTensor(bool detached) { at::Tensor tensor; - c10::optional tensor_data = CurrentTensorData(); + std::optional tensor_data = CurrentTensorData(); if (!tensor_data) { LazyGraphExecutor::Get()->DeviceBarrier(GetDevice()); // The GetDataHandle() call will trigger an ApplyPendingGraph() if an IR @@ -373,7 +373,7 @@ std::vector GetLtcTensors(c10::ArrayRef tensors) { } LazyTensorPtr GetOrCreateLtcTensor( - const c10::optional& tensor, + const std::optional& tensor, const BackendDevice& device) { return GetOrCreateLtcTensor(tensor.value_or(at::Tensor()), device); } diff --git a/torch/csrc/lazy/core/tensor.h b/torch/csrc/lazy/core/tensor.h index 3a15c91c03452..afc52376c5545 100644 --- a/torch/csrc/lazy/core/tensor.h +++ b/torch/csrc/lazy/core/tensor.h @@ -47,7 +47,7 @@ class TORCH_API LazyTensor : public c10::intrusive_ptr_target { BackendDataPtr handle; Value ir_value; - c10::optional tensor_data; + std::optional tensor_data; const BackendDevice device; const int64_t unique_id = 0; size_t generation = 1; @@ -124,7 +124,7 @@ class TORCH_API LazyTensor : public c10::intrusive_ptr_target { void SetIrValue(Value ir_value); void SetInPlaceIrValue(Value ir_value); - c10::optional CurrentTensorData() const; + std::optional CurrentTensorData() const; std::vector MakeOutputTensors(NodePtr node) const; @@ -191,7 +191,7 @@ TORCH_API std::vector GetLtcTensors( // If tensor is a lazy tensor type, returns the LazyTensor embedded within it, // otherwise creates a new lazy tensor type with tensor as data. TORCH_API LazyTensorPtr GetOrCreateLtcTensor( - const c10::optional& tensor, + const std::optional& tensor, const BackendDevice& device); TORCH_API LazyTensorPtr GetLtcTensorOrCreateForWrappedNumber( diff --git a/torch/csrc/lazy/core/tensor_impl.h b/torch/csrc/lazy/core/tensor_impl.h index 6eca2212c08ed..a35c02a7aeac4 100644 --- a/torch/csrc/lazy/core/tensor_impl.h +++ b/torch/csrc/lazy/core/tensor_impl.h @@ -54,7 +54,7 @@ class TORCH_API LTCTensorImpl final : public c10::TensorImpl { void setup_size_properties(); LazyTensorPtr tensor_; - mutable c10::optional> sym_sizes_; + mutable std::optional> sym_sizes_; size_t generation_{0}; }; diff --git a/torch/csrc/lazy/core/tensor_util.h b/torch/csrc/lazy/core/tensor_util.h index e4e6a1b7f0c26..121235ef9d8c0 100644 --- a/torch/csrc/lazy/core/tensor_util.h +++ b/torch/csrc/lazy/core/tensor_util.h @@ -43,7 +43,7 @@ inline at::Tensor CopyTensor( } template -T OptionalOr(const c10::optional& value, T defval) { +T OptionalOr(const std::optional& value, T defval) { return value ? 
static_cast(*value) : defval; } diff --git a/torch/csrc/lazy/core/unique.h b/torch/csrc/lazy/core/unique.h index 0b156a29eb906..fc09c8d71d7d8 100644 --- a/torch/csrc/lazy/core/unique.h +++ b/torch/csrc/lazy/core/unique.h @@ -49,7 +49,7 @@ class Unique { } private: - c10::optional value_; + std::optional value_; }; } // namespace lazy diff --git a/torch/csrc/lazy/core/util.h b/torch/csrc/lazy/core/util.h index a3d35783ae969..e535e5365f227 100644 --- a/torch/csrc/lazy/core/util.h +++ b/torch/csrc/lazy/core/util.h @@ -89,7 +89,7 @@ class MaybeRef { } private: - c10::optional storage_; + std::optional storage_; const T& ref_; }; @@ -109,7 +109,7 @@ std::vector ToVector(const S& input) { } template -c10::optional> ToOptionalVector( +std::optional> ToOptionalVector( c10::OptionalArrayRef arrayRef) { if (arrayRef) { return arrayRef->vec(); diff --git a/torch/csrc/lazy/python/python_util.cpp b/torch/csrc/lazy/python/python_util.cpp index 703d43ca65059..90d9797e3fd35 100644 --- a/torch/csrc/lazy/python/python_util.cpp +++ b/torch/csrc/lazy/python/python_util.cpp @@ -11,7 +11,7 @@ namespace torch { namespace lazy { -c10::optional GetPythonFrameTop() { +std::optional GetPythonFrameTop() { if (!Py_IsInitialized()) { return c10::nullopt; } diff --git a/torch/csrc/lazy/python/python_util.h b/torch/csrc/lazy/python/python_util.h index 8040a023de518..456aafa880971 100644 --- a/torch/csrc/lazy/python/python_util.h +++ b/torch/csrc/lazy/python/python_util.h @@ -7,7 +7,7 @@ namespace torch { namespace lazy { -c10::optional TORCH_PYTHON_API GetPythonFrameTop(); +std::optional TORCH_PYTHON_API GetPythonFrameTop(); std::vector TORCH_PYTHON_API GetPythonFrames(); diff --git a/torch/csrc/lazy/ts_backend/ir_builder.h b/torch/csrc/lazy/ts_backend/ir_builder.h index 1f32a3521ba8a..c538292374434 100644 --- a/torch/csrc/lazy/ts_backend/ir_builder.h +++ b/torch/csrc/lazy/ts_backend/ir_builder.h @@ -33,7 +33,7 @@ struct TorchScriptIrBuilder : IrBuilder { NodePtr MakeCast( const Value& input0, const at::ScalarType& dtype, - const c10::optional& stype = + const std::optional& stype = c10::nullopt) const override { return ReuseOrMakeNode(input0, dtype, stype); } diff --git a/torch/csrc/lazy/ts_backend/ops/to_copy.h b/torch/csrc/lazy/ts_backend/ops/to_copy.h index 4b96b1c389f78..3a5f47411dfdd 100644 --- a/torch/csrc/lazy/ts_backend/ops/to_copy.h +++ b/torch/csrc/lazy/ts_backend/ops/to_copy.h @@ -18,12 +18,12 @@ class ToCopy : public torch::lazy::TsNode { ToCopy( const torch::lazy::Value& self, - const c10::optional& dtype, - const c10::optional& layout, - const c10::optional& device, - const c10::optional& pin_memory, + const std::optional& dtype, + const std::optional& layout, + const std::optional& device, + const std::optional& pin_memory, const bool& non_blocking, - const c10::optional& memory_format, + const std::optional& memory_format, std::vector&& shapes) : torch::lazy::TsNode( ClassOpKind(), @@ -47,12 +47,12 @@ class ToCopy : public torch::lazy::TsNode { bool CanBeReused( const torch::lazy::Value& self, - const c10::optional& dtype, - const c10::optional& layout, - const c10::optional& device, - const c10::optional& pin_memory, + const std::optional& dtype, + const std::optional& layout, + const std::optional& device, + const std::optional& pin_memory, const bool& non_blocking, - const c10::optional& memory_format) const { + const std::optional& memory_format) const { size_t i = 0; return ( operand(i++) == self && this->dtype == dtype && @@ -115,12 +115,12 @@ class ToCopy : public torch::lazy::TsNode { return 
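OptionalOr in tensor_util.h above returns the contained value cast to the result type, or a default when the optional is empty. The template parameters were lost in this rendering, so the self-contained version below assumes separate source and result types; treat it as a sketch rather than the exact header.

    // Hedged sketch of OptionalOr: value_or with a static_cast.
    #include <cstdint>
    #include <iostream>
    #include <optional>

    template <typename T, typename S>
    T OptionalOr(const std::optional<S>& value, T defval) {
      return value ? static_cast<T>(*value) : defval;
    }

    int main() {
      std::optional<int64_t> dim;                      // unset
      std::cout << OptionalOr<int>(dim, -1) << '\n';   // -1
      dim = 2;
      std::cout << OptionalOr<int>(dim, -1) << '\n';   // 2
    }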
_to_copy_out; } - c10::optional dtype; - c10::optional layout; - c10::optional device; - c10::optional pin_memory; + std::optional dtype; + std::optional layout; + std::optional device; + std::optional pin_memory; bool non_blocking; - c10::optional memory_format; + std::optional memory_format; }; } // namespace lazy diff --git a/torch/csrc/lazy/ts_backend/ts_backend_impl.cpp b/torch/csrc/lazy/ts_backend/ts_backend_impl.cpp index 927e2ba62c2de..b0a2d7568aef8 100644 --- a/torch/csrc/lazy/ts_backend/ts_backend_impl.cpp +++ b/torch/csrc/lazy/ts_backend/ts_backend_impl.cpp @@ -81,7 +81,7 @@ class TSBackendImpl : public torch::lazy::BackendImplInterface { at::Tensor MakeTensorFromComputationData( const torch::lazy::BackendDataPtr data, - c10::optional logical_scalar_type) const override { + std::optional logical_scalar_type) const override { const auto ts_data = std::static_pointer_cast(data); return ts_data->data(); } diff --git a/torch/csrc/lazy/ts_backend/ts_backend_impl.h b/torch/csrc/lazy/ts_backend/ts_backend_impl.h index d238e8263e577..0607c3efb5386 100644 --- a/torch/csrc/lazy/ts_backend/ts_backend_impl.h +++ b/torch/csrc/lazy/ts_backend/ts_backend_impl.h @@ -38,7 +38,7 @@ class TORCH_API TSData : public torch::lazy::BackendData { return data_; } - c10::optional scalar; + std::optional scalar; private: at::Tensor data_; diff --git a/torch/csrc/lazy/ts_backend/ts_eager_fallback.cpp b/torch/csrc/lazy/ts_backend/ts_eager_fallback.cpp index e59a665d7bc29..42acc2c5df10a 100644 --- a/torch/csrc/lazy/ts_backend/ts_eager_fallback.cpp +++ b/torch/csrc/lazy/ts_backend/ts_eager_fallback.cpp @@ -69,16 +69,16 @@ std::vector to_eager( return eager_tensors; } -std::vector> to_eager( - const std::vector>& tensors, +std::vector> to_eager( + const std::vector>& tensors, c10::DeviceType device_type) { // We can't just call _to_eager() on the entire list of Tensors because it // will break on undefined tensors. Separate out undefined tensors first. - std::vector> eager_tensors(tensors.size()); + std::vector> eager_tensors(tensors.size()); std::vector valid_tensors; std::vector to_translate(tensors.size()); for (size_t i = 0; i < tensors.size(); ++i) { - const c10::optional& tensor = tensors[i]; + const std::optional& tensor = tensors[i]; // Explicitly handling undefined tensors here instead of letting `_to_eager` // handle it. Otherwise, we'd need to require all backends with their own // implementation of _to_eager to properly handle undefined tensors. @@ -112,10 +112,10 @@ c10::DispatchKey dispatch_key(c10::DeviceType device_type) { } } -c10::optional compute_target_device( +std::optional compute_target_device( std::vector& t_args, std::vector> tlist_args, - std::vector>> opt_tlist_args) { + std::vector>> opt_tlist_args) { // Decide what device to move the output tensor(s) to. // The current convention is that we use the first tensor arg to pick the // device Barring that, we take the first tensor from a TensorList arg. @@ -217,7 +217,7 @@ void ts_eager_fallback( std::vector tensor_args_indices; std::vector> tensorlist_args; - std::vector>> opt_tensorlist_args; + std::vector>> opt_tensorlist_args; // Step 1: Convert all non-eager tensor inputs into eager tensors and put them // on the stack at the correct indices. 
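The to_eager() overload for optional tensors above cannot run the batch conversion over undefined entries, so it gathers the engaged values, converts them in one batch, and scatters the results back into their original slots. A minimal sketch of that gather/convert/scatter pattern with placeholder types (ints instead of tensors):

    #include <iostream>
    #include <optional>
    #include <vector>

    std::vector<int> batch_convert(const std::vector<int>& xs) {
      std::vector<int> out;
      out.reserve(xs.size());
      for (int x : xs) out.push_back(x * 10);  // stands in for the eager copy
      return out;
    }

    std::vector<std::optional<int>> to_eager(const std::vector<std::optional<int>>& in) {
      std::vector<std::optional<int>> out(in.size());
      std::vector<int> valid;
      std::vector<size_t> slots;
      for (size_t i = 0; i < in.size(); ++i) {
        if (in[i]) {            // skip "undefined" entries entirely
          valid.push_back(*in[i]);
          slots.push_back(i);
        }
      }
      auto converted = batch_convert(valid);
      for (size_t j = 0; j < slots.size(); ++j) out[slots[j]] = converted[j];
      return out;
    }

    int main() {
      auto r = to_eager({1, std::nullopt, 3});
      std::cout << r[0].value() << " - " << r[2].value() << '\n';  // 10 - 30
    }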
@@ -236,7 +236,7 @@ void ts_eager_fallback( (*stack)[arguments_begin + idx] = std::move(eager_ivalue); tensorlist_args.push_back(ivalue.toTensorList()); } else if (ivalue.isOptionalTensorList()) { - auto eager_ivalue = c10::IValue(c10::List>( + auto eager_ivalue = c10::IValue(c10::List>( to_eager(ivalue.toOptionalTensorVector(), device_type))); (*stack)[arguments_begin + idx] = std::move(eager_ivalue); opt_tensorlist_args.push_back(ivalue.toOptionalTensorList()); @@ -323,7 +323,7 @@ void ts_eager_fallback( "mutable alias: ", schema_returns[idx]); } else { - c10::optional tgt_device = compute_target_device( + std::optional tgt_device = compute_target_device( tensor_args, tensorlist_args, opt_tensorlist_args); if (alias_info != nullptr && !alias_info->isWrite()) { // immutable alias (view) case: Warn here, since we're copying and diff --git a/torch/csrc/lazy/ts_backend/ts_native_functions.cpp b/torch/csrc/lazy/ts_backend/ts_native_functions.cpp index 456ff4211ac1a..78ae6a6f6e2e5 100644 --- a/torch/csrc/lazy/ts_backend/ts_native_functions.cpp +++ b/torch/csrc/lazy/ts_backend/ts_native_functions.cpp @@ -28,7 +28,7 @@ namespace { at::Tensor CreateLtcTensor( const at::Tensor& tensor, - const c10::optional& device) { + const std::optional& device) { if (tensor.defined() && device) { return torch::lazy::CreateAtenFromLtcTensor( torch::lazy::LazyTensor::Create(tensor, *device)); @@ -36,8 +36,8 @@ at::Tensor CreateLtcTensor( return tensor; } -c10::optional GetLtcDevice( - const c10::optional& device) { +std::optional GetLtcDevice( + const std::optional& device) { if (!device) { return c10::nullopt; } @@ -53,7 +53,7 @@ c10::optional GetLtcDevice( // This should be safe to do, because every operator in the LT is functional. at::Tensor LazyNativeFunctions::clone( const at::Tensor& self, - c10::optional memory_format) { + std::optional memory_format) { auto self_lt = torch::lazy::TryGetLtcTensor(self); return torch::lazy::CreateAtenFromLtcTensor( self_lt->Create(self_lt->GetIrValue(), self_lt->GetDevice())); @@ -138,12 +138,12 @@ at::Tensor LazyNativeFunctions::_copy_from_and_resize( at::Tensor LazyNativeFunctions::_to_copy( const at::Tensor& self, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, bool non_blocking, - c10::optional memory_format) { + std::optional memory_format) { if (force_eager_fallback(at::aten::_to_copy)) { TORCH_INTERNAL_ASSERT( false, @@ -270,11 +270,11 @@ at::Tensor LazyNativeFunctions::_to_copy( at::Tensor LazyNativeFunctions::empty_symint( at::SymIntArrayRef sym_size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional memory_format) { // TODO: support this directly auto size = C10_AS_INTARRAYREF_SLOW(sym_size); const auto device_type = torch::lazy::getBackend()->EagerFallbackDeviceType(); @@ -301,10 +301,10 @@ at::Tensor LazyNativeFunctions::empty_symint( at::Tensor LazyNativeFunctions::empty_strided_symint( at::SymIntArrayRef sym_size, at::SymIntArrayRef sym_stride, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { TORCH_LAZY_FN_COUNTER("lazy::"); at::Tensor t = empty_symint(sym_size, 
dtype, layout, device, pin_memory, c10::nullopt); @@ -406,10 +406,10 @@ at::Tensor LazyNativeFunctions::new_empty_strided_symint( const at::Tensor& self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return at::functionalization:: functionalize_aten_op_symint::call( self, size, stride, dtype, layout, device, pin_memory); @@ -457,8 +457,8 @@ at::Tensor LazyNativeFunctions::_trilinear( } at::Tensor LazyNativeFunctions::linalg_pinv( const at::Tensor& self, - const c10::optional& atol, - const c10::optional& rtol, + const std::optional& atol, + const std::optional& rtol, bool hermitian) { return at::functionalization::functionalize_aten_op::call(self, atol, rtol, hermitian); @@ -525,8 +525,8 @@ at::Tensor LazyNativeFunctions::slice_backward_symint( // backwards formula for native_group_norm std::tuple LazyNativeFunctions::native_group_norm( const at::Tensor& input, - const c10::optional& weight, - const c10::optional& bias, + const std::optional& weight, + const std::optional& bias, int64_t N, int64_t C, int64_t HxW, diff --git a/torch/csrc/onnx/init.cpp b/torch/csrc/onnx/init.cpp index 825ed46e11a50..b8bef342323c5 100644 --- a/torch/csrc/onnx/init.cpp +++ b/torch/csrc/onnx/init.cpp @@ -293,10 +293,6 @@ void initONNXBindings(PyObject* module) { onnx.attr("PRODUCER_VERSION") = py::str(TORCH_VERSION); -#ifdef BUILD_CAFFE2 - onnx.attr("_CAFFE2_ATEN_FALLBACK") = true; -#else onnx.attr("_CAFFE2_ATEN_FALLBACK") = false; -#endif } } // namespace torch::onnx diff --git a/torch/csrc/profiler/collection.h b/torch/csrc/profiler/collection.h index 3a129b3118d86..6822d39c225ac 100644 --- a/torch/csrc/profiler/collection.h +++ b/torch/csrc/profiler/collection.h @@ -61,9 +61,9 @@ struct TORCH_API RawTensorMetadata : RawTensorMetadataBase { RawTensorMetadata& operator=(RawTensorMetadata&&) noexcept = default; explicit RawTensorMetadata(const at::Tensor& t); - // Wrap `weak_self_` in `c10::optional` and split device into components to + // Wrap `weak_self_` in `std::optional` and split device into components to // keep struct default constructable. (which the std::array initializer needs) - c10::optional weak_self_; + std::optional weak_self_; c10::DeviceType device_type_{c10::DeviceType::CPU}; c10::DeviceIndex device_index_{-1}; }; @@ -85,8 +85,8 @@ struct TORCH_API TensorMetadata : public RawTensorMetadataBase { std::vector strides_; // Set during `calculateUniqueTensorIDs`. 
- c10::optional id_; - c10::optional allocation_id_; + std::optional id_; + std::optional allocation_id_; }; using op_input_t = std::variant< @@ -207,8 +207,8 @@ struct ExtraFields : RawAllocation { return {device_type_, device_index_}; } - c10::optional id_; - c10::optional allocation_id_; + std::optional id_; + std::optional allocation_id_; }; template <> @@ -246,7 +246,7 @@ struct NNModuleInfo { struct ParameterInfo { std::string name_; TensorMetadata metadata_; - c10::optional grad_metadata_; + std::optional grad_metadata_; }; PyModuleSelf self_; @@ -261,7 +261,7 @@ struct NNModuleInfo { struct OptimizerInfo { struct ParameterInfo { TensorMetadata metadata_; - c10::optional grad_metadata_; + std::optional grad_metadata_; std::vector> state_; }; @@ -293,8 +293,8 @@ template <> struct ExtraFields : public PyExtraFieldsBase { struct args_t { PyFrameState frame_state_; - c10::optional module_info_; - c10::optional optimizer_info_; + std::optional module_info_; + std::optional optimizer_info_; }; ExtraFields( @@ -308,8 +308,8 @@ struct ExtraFields : public PyExtraFieldsBase { optimizer_{std::move(args.optimizer_info_)} {} PyFrameState callsite_; - c10::optional module_; - c10::optional optimizer_; + std::optional module_; + std::optional optimizer_; }; template <> diff --git a/torch/csrc/profiler/combined_traceback.cpp b/torch/csrc/profiler/combined_traceback.cpp index 1cae103efc77c..c727f58d5284e 100644 --- a/torch/csrc/profiler/combined_traceback.cpp +++ b/torch/csrc/profiler/combined_traceback.cpp @@ -1,4 +1,5 @@ #include +#include namespace torch { @@ -77,7 +78,7 @@ SymbolizedTracebacks symbolize( } // gather symbol names for C++ frames if (!all_cpp_ips.empty()) { - r.all_frames = unwind::symbolize(all_cpp_ips); + r.all_frames = unwind::symbolize(all_cpp_ips, torch::get_symbolize_mode()); } // batch symbolization requests so we dedup frame objects diff --git a/torch/csrc/profiler/data_flow.cpp b/torch/csrc/profiler/data_flow.cpp index e719835d7c2c1..9ea79cdbdb27d 100644 --- a/torch/csrc/profiler/data_flow.cpp +++ b/torch/csrc/profiler/data_flow.cpp @@ -18,8 +18,8 @@ struct RawTensorInfo { bool is_free_; // Used to assign back to the original structs. 
- std::reference_wrapper> allocation_id_ref_; - std::reference_wrapper> id_ref_; + std::reference_wrapper> allocation_id_ref_; + std::reference_wrapper> id_ref_; }; struct RawTensors { @@ -32,7 +32,7 @@ struct RawTensors { t.impl(), t.data_, t.device_, false, t.allocation_id_, t.id_}); } - void operator()(c10::optional& t) { + void operator()(std::optional& t) { if (t.has_value()) { (*this)(*t); } diff --git a/torch/csrc/profiler/orchestration/observer.h b/torch/csrc/profiler/orchestration/observer.h index 4230851607608..b77febb2784ee 100644 --- a/torch/csrc/profiler/orchestration/observer.h +++ b/torch/csrc/profiler/orchestration/observer.h @@ -27,6 +27,7 @@ enum class C10_API_ENUM ProfilerState { CUDA, // CPU + CUDA events NVTX, // only emit NVTX markers ITT, // only emit ITT markers + PRIVATEUSE1, // only emit PRIVATEUSE1 markers KINETO, // use libkineto KINETO_GPU_FALLBACK, // use CUDA events when CUPTI is not available KINETO_PRIVATEUSE1_FALLBACK, // use PrivateUse1 events @@ -39,7 +40,8 @@ enum class C10_API_ENUM ActiveProfilerType { LEGACY, KINETO, NVTX, - ITT + ITT, + PRIVATEUSE1 }; struct TORCH_API ExperimentalConfig { diff --git a/torch/csrc/profiler/python/init.cpp b/torch/csrc/profiler/python/init.cpp index 966bf68d3ee42..9ecfe5824a385 100644 --- a/torch/csrc/profiler/python/init.cpp +++ b/torch/csrc/profiler/python/init.cpp @@ -79,8 +79,7 @@ PyTypeObject THPCapturedTracebackType = { nullptr, /* tp_new */ }; -namespace pybind11 { -namespace detail { +namespace pybind11::detail { template <> struct type_caster> { @@ -107,11 +106,9 @@ struct type_caster> { } }; -} // namespace detail -} // namespace pybind11 +} // namespace pybind11::detail -namespace torch { -namespace profiler { +namespace torch::profiler { /* [NOTE: RecordFunctionFast] * This is an alternate way to call record_function from python. 
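RawTensorInfo in data_flow.cpp above keeps std::reference_wrapper members pointing at optional id fields so they can be "assigned back to the original structs" later. A simplified sketch of that write-back-through-a-reference pattern, with ints in place of the profiler's id types:

    #include <functional>
    #include <iostream>
    #include <optional>
    #include <vector>

    struct Record {
      std::optional<int> id;  // filled in during post-processing
    };

    int main() {
      std::vector<Record> records(3);
      std::vector<std::reference_wrapper<std::optional<int>>> ids;
      for (auto& r : records) ids.emplace_back(r.id);

      int next = 0;
      for (auto& ref : ids) ref.get() = next++;  // assign back via the reference

      std::cout << *records[2].id << '\n';  // 2
    }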
@@ -308,6 +305,7 @@ void initPythonBindings(PyObject* module) { .value("CUDA", ProfilerState::CUDA) .value("NVTX", ProfilerState::NVTX) .value("ITT", ProfilerState::ITT) + .value("PRIVATEUSE1", ProfilerState::PRIVATEUSE1) .value("KINETO", ProfilerState::KINETO) .value("KINETO_GPU_FALLBACK", ProfilerState::KINETO_GPU_FALLBACK) .value( @@ -319,7 +317,8 @@ void initPythonBindings(PyObject* module) { .value("LEGACY", ActiveProfilerType::LEGACY) .value("KINETO", ActiveProfilerType::KINETO) .value("NVTX", ActiveProfilerType::NVTX) - .value("ITT", ActiveProfilerType::ITT); + .value("ITT", ActiveProfilerType::ITT) + .value("PRIVATEUSE1", ActiveProfilerType::PRIVATEUSE1); py::enum_(m, "ProfilerActivity") .value("CPU", ActivityType::CPU) @@ -606,6 +605,33 @@ void initPythonBindings(PyObject* module) { } return py_symbolize(tb_ptrs); }); + // directly convert address pointers to frames, used for testing symbolize + m.def( + "symbolize_addresses", + [](const std::vector& frames, const std::string& mode_s) { + std::vector> frames_out; + torch::unwind::Mode mode = torch::unwind::Mode::addr2line; + if (mode_s == "fast") { + mode = torch::unwind::Mode::fast; + } else if (mode_s == "addr2line") { + mode = torch::unwind::Mode::addr2line; + } else if (mode_s == "dladdr") { + mode = torch::unwind::Mode::dladdr; + } else { + TORCH_CHECK(false, "unexpected mode ", mode_s); + } + std::vector frames_p; + frames_p.reserve(frames.size()); + for (auto f : frames) { + frames_p.push_back((void*)f); // NOLINT + } + auto frame_objects = unwind::symbolize(frames_p, mode); + frames_out.reserve(frame_objects.size()); + for (auto& frame : frame_objects) { + frames_out.emplace_back(frame.filename, frame.lineno, frame.funcname); + } + return frames_out; + }); installCapturedTracebackPython(); // NOLINTNEXTLINE(*-c-arrays*) @@ -639,5 +665,4 @@ void initPythonBindings(PyObject* module) { throw python_error(); } } -} // namespace profiler -} // namespace torch +} // namespace torch::profiler diff --git a/torch/csrc/profiler/standalone/privateuse1_observer.cpp b/torch/csrc/profiler/standalone/privateuse1_observer.cpp new file mode 100644 index 0000000000000..81eb3074fb3ae --- /dev/null +++ b/torch/csrc/profiler/standalone/privateuse1_observer.cpp @@ -0,0 +1,11 @@ +#include + +namespace torch { +namespace profiler { +namespace impl { + +PushPRIVATEUSE1CallbacksStub pushPRIVATEUSE1CallbacksStub; + +} // namespace impl +} // namespace profiler +} // namespace torch diff --git a/torch/csrc/profiler/standalone/privateuse1_observer.h b/torch/csrc/profiler/standalone/privateuse1_observer.h new file mode 100644 index 0000000000000..39259b7444cfb --- /dev/null +++ b/torch/csrc/profiler/standalone/privateuse1_observer.h @@ -0,0 +1,46 @@ +#pragma once +#include + +namespace torch { +namespace profiler { +namespace impl { + +using CallBackFnPtr = void (*)( + const ProfilerConfig& config, + const std::unordered_set& scopes); + +struct PushPRIVATEUSE1CallbacksStub { + PushPRIVATEUSE1CallbacksStub() = default; + PushPRIVATEUSE1CallbacksStub(const PushPRIVATEUSE1CallbacksStub&) = delete; + PushPRIVATEUSE1CallbacksStub& operator=(const PushPRIVATEUSE1CallbacksStub&) = + delete; + + template + void operator()(ArgTypes&&... 
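The new symbolize_addresses binding above maps a mode string onto torch::unwind::Mode, accepting "fast", "addr2line", and "dladdr" and failing hard on anything else. The same dispatch as a standalone helper; the enum here is a stand-in for torch::unwind::Mode and a plain exception replaces TORCH_CHECK.

    #include <iostream>
    #include <stdexcept>
    #include <string>

    enum class Mode { addr2line, fast, dladdr };

    Mode parseMode(const std::string& s) {
      if (s == "fast") return Mode::fast;
      if (s == "addr2line") return Mode::addr2line;
      if (s == "dladdr") return Mode::dladdr;
      throw std::runtime_error("unexpected mode " + s);  // TORCH_CHECK in the binding
    }

    int main() {
      std::cout << static_cast<int>(parseMode("fast")) << '\n';  // 1
    }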
args) { + return (*push_privateuse1_callbacks_fn)(std::forward(args)...); + } + + void set_privateuse1_dispatch_ptr(CallBackFnPtr fn_ptr) { + push_privateuse1_callbacks_fn = fn_ptr; + } + + private: + CallBackFnPtr push_privateuse1_callbacks_fn = nullptr; +}; + +extern TORCH_API struct PushPRIVATEUSE1CallbacksStub + pushPRIVATEUSE1CallbacksStub; + +struct RegisterPRIVATEUSE1Observer { + RegisterPRIVATEUSE1Observer( + PushPRIVATEUSE1CallbacksStub& stub, + CallBackFnPtr value) { + stub.set_privateuse1_dispatch_ptr(value); + } +}; + +#define REGISTER_PRIVATEUSE1_OBSERVER(name, fn) \ + static RegisterPRIVATEUSE1Observer name##__register(name, fn); +} // namespace impl +} // namespace profiler +} // namespace torch diff --git a/torch/csrc/profiler/unwind/action.h b/torch/csrc/profiler/unwind/action.h index e1ed407384fc9..672fffad8c917 100644 --- a/torch/csrc/profiler/unwind/action.h +++ b/torch/csrc/profiler/unwind/action.h @@ -2,6 +2,8 @@ #include #include +namespace torch::unwind { + enum { A_UNDEFINED = 0x0, A_REG_PLUS_DATA = 0x1, // exp = REG[reg] + data0 @@ -53,3 +55,5 @@ struct Action { return out; } }; + +} // namespace torch::unwind diff --git a/torch/csrc/profiler/unwind/communicate.h b/torch/csrc/profiler/unwind/communicate.h index 79c27eaeba7fa..063fe542a3419 100644 --- a/torch/csrc/profiler/unwind/communicate.h +++ b/torch/csrc/profiler/unwind/communicate.h @@ -5,6 +5,7 @@ #include #include +namespace torch::unwind { // helper to open a process with stdin/stdout/stderr streams. struct Communicate { Communicate(const char* command, const char** args) { @@ -63,3 +64,5 @@ struct Communicate { std::unique_ptr out_; std::unique_ptr err_; }; + +} // namespace torch::unwind diff --git a/torch/csrc/profiler/unwind/debug_info.h b/torch/csrc/profiler/unwind/debug_info.h new file mode 100644 index 0000000000000..35c770c24e0c9 --- /dev/null +++ b/torch/csrc/profiler/unwind/debug_info.h @@ -0,0 +1,279 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include + +namespace torch::unwind { + +struct DebugInfo { + DebugInfo(Sections& s) : s_(s) {} + + void parse(uint64_t offset) { + auto L = parseHeader(offset); + parseCompileUnit(L); + } + unwind::optional lineNumberProgramOffset() { + return line_number_program_offset_; + } + uint64_t nextOffset() { + return end_ - s_.debug_info.data; + } + std::vector> ranges() { + if (range_ptr_) { + auto offset = range_ptr_->first; + if (range_ptr_->second == DW_FORM_rnglistx) { + UNWIND_CHECK(rnglists_base_, "rnglistx but not rnglists_base_ set"); + LOG_INFO("index for rnglistx {:x} + {:x}\n", *rnglists_base_, offset); + CheckedLexer L = s_.debug_rnglists.lexer( + *rnglists_base_ + offset * sec_offset_size_); + auto read = readSegmentOffset(L); + offset = *rnglists_base_ + read; + } + return version_ == 4 ? readRanges4(offset) : readRanges5(offset); + } + if (!highpc_) { + return {}; + } + return {{lowpc_, lowpc_ + *highpc_}}; + } + + bool is64bit() { + return is_64bit_; + } + + private: + CheckedLexer parseHeader(uint64_t offset) { + offset_ = offset; + CheckedLexer L = s_.debug_info.lexer(offset_); + std::tie(length_, is_64bit_) = L.readSectionLength(); + sec_offset_size_ = is_64bit_ ? 
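The new privateuse1_observer header above follows a stub-and-register pattern: a global stub object holds a callback pointer, and a small registration struct (wrapped by REGISTER_PRIVATEUSE1_OBSERVER) fills it in at static-initialization time. The sketch below reproduces that shape with placeholder types; it is not the torch::profiler::impl code and the config type is simplified to an int.

    #include <iostream>

    using CallBackFnPtr = void (*)(int config);

    struct PushCallbacksStub {
      void operator()(int config) {
        (*push_fn_)(config);  // assumes a backend registered during static init
      }
      void set(CallBackFnPtr fn) { push_fn_ = fn; }
     private:
      CallBackFnPtr push_fn_ = nullptr;
    };

    PushCallbacksStub pushCallbacksStub;

    struct RegisterObserver {
      RegisterObserver(PushCallbacksStub& stub, CallBackFnPtr fn) { stub.set(fn); }
    };

    // What a PrivateUse1 backend would do via the REGISTER_* macro:
    static void myBackendPushCallbacks(int config) {
      std::cout << "profiler callbacks pushed, config=" << config << '\n';
    }
    static RegisterObserver myBackendObserver(pushCallbacksStub, &myBackendPushCallbacks);

    int main() {
      pushCallbacksStub(42);  // dispatches to myBackendPushCallbacks
    }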
8 : 4; + end_ = (const char*)L.loc() + length_; + version_ = L.read(); + UNWIND_CHECK( + version_ == 5 || version_ == 4, + "unexpected dwarf version {}", + version_); + uint8_t address_size = 0; + if (version_ == 5) { + auto unit_type = L.read(); + UNWIND_CHECK(unit_type == 0x1, "unexpected unit type {}", unit_type); + address_size = L.read(); + debug_abbrev_offset_ = + is_64bit_ ? L.read() : L.read(); + } else { + debug_abbrev_offset_ = + is_64bit_ ? L.read() : L.read(); + address_size = L.read(); + } + LOG_INFO( + "compilation unit at offset {:x} with length {:x} and debug_abbrev_offset {:x}\n", + offset, + length_, + debug_abbrev_offset_); + UNWIND_CHECK( + address_size == 8, + "expected 64-bit dwarf but found address size {}", + address_size); + return L; + } + + uint64_t readSegmentOffset(CheckedLexer& L) { + return s_.readSegmentOffset(L, is_64bit_); + } + + uint64_t readEncoded(CheckedLexer& L, uint64_t encoding) { + switch (encoding) { + case DW_FORM_data8: + case DW_FORM_addr: + return L.read(); + case DW_FORM_data4: + return L.read(); + case DW_FORM_addrx: { + auto idx = L.readULEB128(); + return s_.debug_addr.lexer(address_base_ + sizeof(uint64_t) * idx) + .read(); + } + case DW_FORM_sec_offset: + return readSegmentOffset(L); + case DW_FORM_rnglistx: { + return L.readULEB128(); + } + default: + UNWIND_CHECK(false, "unexpected encoding"); + } + } + + void parseCompileUnit(CheckedLexer& L) { + auto entry = L.readULEB128(); + auto A = findAbbrev(debug_abbrev_offset_, entry); + while (true) { + auto attr = A.readULEB128(); + auto form = A.readULEB128(); + if (attr == 0 && form == 0) { + break; + } + if (form == DW_FORM_implicit_const) { + A.readSLEB128(); + } + if (attr == DW_AT_low_pc) { + lowpc_ = readEncoded(L, form); + LOG_INFO(" lowpc {:x}\n", lowpc_); + } else if (attr == DW_AT_high_pc) { + highpc_ = readEncoded(L, form); + range_ptr_ = std::nullopt; + LOG_INFO(" highpc {:x}\n", *highpc_); + } else if (attr == DW_AT_addr_base) { + UNWIND_CHECK(form == DW_FORM_sec_offset, "unexpected addr_base form"); + address_base_ = readSegmentOffset(L); + LOG_INFO(" address base {:x}\n", address_base_); + } else if (attr == DW_AT_rnglists_base) { + UNWIND_CHECK( + form == DW_FORM_sec_offset, "unexpected rnglists_base form"); + rnglists_base_ = readSegmentOffset(L); + LOG_INFO(" range base {:x}\n", *rnglists_base_); + } else if (form == DW_FORM_string) { + L.readCString(); + } else if (attr == DW_AT_stmt_list) { + UNWIND_CHECK(form == DW_FORM_sec_offset, "unexpected stmt_list form"); + LOG_INFO(" program table offset {:x}\n", *line_number_program_offset_); + line_number_program_offset_ = readSegmentOffset(L); + } else if (form == DW_FORM_exprloc) { + auto sz = L.readULEB128(); + L.skip(int64_t(sz)); + } else if (form == DW_FORM_block1) { + auto sz = L.read(); + L.skip(int64_t(sz)); + } else if (attr == DW_AT_ranges) { + auto range_offset = readEncoded(L, form); + LOG_INFO("setting range_ptr to {:x} {:x}\n", range_offset, form); + range_ptr_.emplace(range_offset, form); + } else if ( + form == DW_FORM_udata || form == DW_FORM_rnglistx || + form == DW_FORM_strx || form == DW_FORM_loclistx || + form == DW_FORM_addrx) { + L.readULEB128(); + } else if (form == DW_FORM_sdata) { + L.readSLEB128(); + } else { + auto sz = formSize(form, sec_offset_size_); + UNWIND_CHECK(sz, "unsupported form in compilation unit {:x}", form); + L.skip(int64_t(*sz)); + } + } + } + + std::vector> readRanges4(uint64_t offset) { + CheckedLexer L = s_.debug_ranges.lexer(offset); + std::vector> ranges; + uint64_t base = 
lowpc_; + while (true) { + auto start = L.read(); + auto end = L.read(); + if (start == 0 && end == 0) { + break; + } + if (start == std::numeric_limits::max()) { + base = end; + } else { + ranges.emplace_back(base + start, base + end); + } + } + return ranges; + } + + std::vector> readRanges5(uint64_t offset) { + CheckedLexer L = s_.debug_rnglists.lexer(offset); + uint64_t base = 0; + LOG_INFO("BEGIN RANGES {:x}\n", offset); + std::vector> ranges; + while (true) { + auto op = L.read(); + switch (op) { + case DW_RLE_end_of_list: + LOG_INFO("END RANGES\n"); + return ranges; + case DW_RLE_base_addressx: { + base = readEncoded(L, DW_FORM_addrx); + LOG_INFO("BASE ADDRX {:x}\n", base); + } break; + case DW_RLE_startx_length: { + auto s = readEncoded(L, DW_FORM_addrx); + auto e = L.readULEB128(); + LOG_INFO("startx_length {:x} {:x}\n", s, e); + ranges.emplace_back(s, s + e); + } break; + case DW_RLE_base_address: + base = L.read(); + LOG_INFO("BASE ADDR {:x}\n", base); + break; + case DW_RLE_offset_pair: { + auto s = L.readULEB128(); + auto e = L.readULEB128(); + LOG_INFO("offset_pair {:x} {:x}\n", s, e); + ranges.emplace_back(base + s, base + e); + } break; + case DW_RLE_start_length: { + auto s = L.read(); + auto e = L.readULEB128(); + LOG_INFO("start_length {:x} {:x}\n", s, e); + ranges.emplace_back(s, s + e); + } break; + default: + UNWIND_CHECK(false, "unknown range op: {}", op); + } + } + } + + CheckedLexer findAbbrev(uint64_t offset, uint64_t entry) { + CheckedLexer L = s_.debug_abbrev.lexer(offset); + while (true) { + auto abbrev_code = L.readULEB128(); + UNWIND_CHECK( + abbrev_code != 0, + "could not find entry {} at offset {:x}", + entry, + offset); + auto tag = L.readULEB128(); + L.read(); // has children + if (abbrev_code == entry) { + UNWIND_CHECK( + tag == DW_TAG_compile_unit, + "first entry was not a compile unit but {}", + tag); + return L; + } + while (true) { + auto attr = L.readULEB128(); + auto form = L.readULEB128(); + if (attr == 0 && form == 0) { + break; + } + if (form == DW_FORM_implicit_const) { + L.readSLEB128(); + } + } + } + } + + Sections& s_; + optional line_number_program_offset_; + uint64_t offset_ = 0; + uint8_t sec_offset_size_ = 0; + uint64_t length_ = 0; + const char* end_ = nullptr; + uint64_t debug_abbrev_offset_ = 0; + bool is_64bit_ = false; + + std::optional> range_ptr_; + uint64_t lowpc_ = 0; + optional highpc_; + uint16_t version_ = 0; + uint64_t address_base_ = 0; + optional rnglists_base_; +}; + +} // namespace torch::unwind diff --git a/torch/csrc/profiler/unwind/dwarf_symbolize_enums.h b/torch/csrc/profiler/unwind/dwarf_symbolize_enums.h new file mode 100644 index 0000000000000..2c229823027d3 --- /dev/null +++ b/torch/csrc/profiler/unwind/dwarf_symbolize_enums.h @@ -0,0 +1,181 @@ +#pragma once +#include +#include +#include + +enum { + DW_TAG_subprogram = 0x2e, + DW_TAG_inlined_subroutine = 0x1d, + DW_TAG_compile_unit = 0x11, + DW_AT_sibling = 0x1, // reference + DW_AT_name = 0x3, // string + DW_AT_stmt_list = 0x10, // lineptr + DW_AT_addr_base = 0x73, // sec_offset + DW_AT_rnglists_base = 0x74, // sec_offset + DW_AT_low_pc = 0x11, // address + DW_AT_high_pc = 0x12, // address + DW_AT_specification = 0x47, // reference + DW_AT_abstract_origin = 0x31, // reference + DW_AT_linkage_name = 0x6e, // string + DW_AT_ranges = 0x55, // rnglist + DW_AT_str_offsets_base = 0x72, // sec_offset + DW_FORM_addr = 0x01, + DW_FORM_block2 = 0x03, + DW_FORM_block4 = 0x04, + DW_FORM_data2 = 0x05, + DW_FORM_data4 = 0x06, + DW_FORM_data8 = 0x07, + DW_FORM_string = 0x08, + 
DW_FORM_block = 0x09, + DW_FORM_block1 = 0x0a, + DW_FORM_data1 = 0x0b, + DW_FORM_flag = 0x0c, + DW_FORM_sdata = 0x0d, + DW_FORM_strp = 0x0e, + DW_FORM_udata = 0x0f, + DW_FORM_ref_addr = 0x10, + DW_FORM_ref1 = 0x11, + DW_FORM_ref2 = 0x12, + DW_FORM_ref4 = 0x13, + DW_FORM_ref8 = 0x14, + DW_FORM_ref_udata = 0x15, + DW_FORM_indirect = 0x16, + DW_FORM_sec_offset = 0x17, + DW_FORM_exprloc = 0x18, + DW_FORM_flag_present = 0x19, + DW_FORM_strx = 0x1a, + DW_FORM_addrx = 0x1b, + DW_FORM_ref_sup4 = 0x1c, + DW_FORM_strp_sup = 0x1d, + DW_FORM_data16 = 0x1e, + DW_FORM_line_strp = 0x1f, + DW_FORM_ref_sig8 = 0x20, + DW_FORM_implicit_const = 0x21, + DW_FORM_loclistx = 0x22, + DW_FORM_rnglistx = 0x23, + DW_FORM_ref_sup8 = 0x24, + DW_FORM_strx1 = 0x25, + DW_FORM_strx2 = 0x26, + DW_FORM_strx3 = 0x27, + DW_FORM_strx4 = 0x28, + DW_FORM_addrx1 = 0x29, + DW_FORM_addrx2 = 0x2a, + DW_FORM_addrx3 = 0x2b, + DW_FORM_addrx4 = 0x2c, + /* GNU Debug Fission extensions. */ + DW_FORM_GNU_addr_index = 0x1f01, + DW_FORM_GNU_str_index = 0x1f02, + DW_FORM_GNU_ref_alt = 0x1f20, /* offset in alternate .debuginfo. */ + DW_FORM_GNU_strp_alt = 0x1f21, /* offset in alternate .debug_str. */ + DW_LNCT_path = 0x1, + DW_LNCT_directory_index = 0x2, + DW_LNS_extended_op = 0x00, + DW_LNE_end_sequence = 0x01, + DW_LNE_set_address = 0x02, + DW_LNS_copy = 0x01, + DW_LNS_advance_pc = 0x02, + DW_LNS_advance_line = 0x03, + DW_LNS_set_file = 0x04, + DW_LNS_const_add_pc = 0x08, + DW_LNS_fixed_advance_pc = 0x09, + DW_RLE_end_of_list = 0x0, + DW_RLE_base_addressx = 0x1, + DW_RLE_startx_endx = 0x2, + DW_RLE_startx_length = 0x3, + DW_RLE_offset_pair = 0x4, + DW_RLE_base_address = 0x5, + DW_RLE_start_end = 0x6, + DW_RLE_start_length = 0x7 +}; + +static torch::unwind::optional formSize( + uint64_t form, + uint8_t sec_offset_size) { + switch (form) { + case DW_FORM_addr: + return sizeof(void*); + case DW_FORM_block2: + case DW_FORM_block4: + return std::nullopt; + case DW_FORM_data2: + return 2; + case DW_FORM_data4: + return 4; + case DW_FORM_data8: + return 8; + case DW_FORM_string: + case DW_FORM_block: + case DW_FORM_block1: + return std::nullopt; + case DW_FORM_data1: + case DW_FORM_flag: + return 1; + case DW_FORM_sdata: + return std::nullopt; + case DW_FORM_strp: + return sec_offset_size; + case DW_FORM_udata: + return std::nullopt; + case DW_FORM_ref_addr: + return sec_offset_size; + case DW_FORM_ref1: + return 1; + case DW_FORM_ref2: + return 2; + case DW_FORM_ref4: + return 4; + case DW_FORM_ref8: + return 8; + case DW_FORM_ref_udata: + case DW_FORM_indirect: + return std::nullopt; + case DW_FORM_sec_offset: + return sec_offset_size; + case DW_FORM_exprloc: + return std::nullopt; + case DW_FORM_flag_present: + return 0; + case DW_FORM_strx: + case DW_FORM_addrx: + return std::nullopt; + case DW_FORM_ref_sup4: + return 4; + case DW_FORM_strp_sup: + return sec_offset_size; + case DW_FORM_data16: + return 16; + case DW_FORM_line_strp: + return sec_offset_size; + case DW_FORM_ref_sig8: + return 8; + case DW_FORM_implicit_const: + return 0; + case DW_FORM_loclistx: + case DW_FORM_rnglistx: + return std::nullopt; + case DW_FORM_ref_sup8: + return 8; + case DW_FORM_strx1: + return 1; + case DW_FORM_strx2: + return 2; + case DW_FORM_strx3: + return 3; + case DW_FORM_strx4: + return 4; + case DW_FORM_addrx1: + return 1; + case DW_FORM_addrx2: + return 2; + case DW_FORM_addrx3: + return 3; + case DW_FORM_addrx4: + return 4; + case DW_FORM_GNU_addr_index: + case DW_FORM_GNU_str_index: + case DW_FORM_GNU_ref_alt: + case DW_FORM_GNU_strp_alt: + default: + 
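A formSize-style table is used to skip attribute values whose DWARF form has a fixed size, while variable-length forms (ULEB/SLEB, blocks, strings) return nullopt and must be parsed instead. Illustrative use only, covering a tiny subset of the form codes listed above:

    #include <cstdint>
    #include <iostream>
    #include <optional>

    std::optional<size_t> formSize(uint64_t form, uint8_t sec_offset_size) {
      switch (form) {
        case 0x05: return 2;                // DW_FORM_data2
        case 0x06: return 4;                // DW_FORM_data4
        case 0x0b: return 1;                // DW_FORM_data1
        case 0x17: return sec_offset_size;  // DW_FORM_sec_offset
        default:   return std::nullopt;     // variable-length: parse, don't skip
      }
    }

    int main() {
      uint64_t skipped = 0;
      for (uint64_t form : {0x05ULL, 0x17ULL, 0x0bULL}) {
        if (auto sz = formSize(form, 8)) {
          skipped += *sz;  // a real lexer would do L.skip(*sz)
        }
      }
      std::cout << "would skip " << skipped << " bytes\n";  // 11
    }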
return std::nullopt; + } +} diff --git a/torch/csrc/profiler/unwind/eh_frame_hdr.h b/torch/csrc/profiler/unwind/eh_frame_hdr.h index 9800166675093..c69c066dae68f 100644 --- a/torch/csrc/profiler/unwind/eh_frame_hdr.h +++ b/torch/csrc/profiler/unwind/eh_frame_hdr.h @@ -7,6 +7,7 @@ // Overview of the format described in // https://refspecs.linuxfoundation.org/LSB_1.3.0/gLSB/gLSB/ehframehdr.html +namespace torch::unwind { struct EHFrameHdr { EHFrameHdr(void* base) : base_(base) { @@ -93,3 +94,5 @@ struct EHFrameHdr { int64_t fde_count_; uint32_t table_size_; }; + +} // namespace torch::unwind diff --git a/torch/csrc/profiler/unwind/fast_symbolizer.h b/torch/csrc/profiler/unwind/fast_symbolizer.h new file mode 100644 index 0000000000000..2c79ed81f5076 --- /dev/null +++ b/torch/csrc/profiler/unwind/fast_symbolizer.h @@ -0,0 +1,108 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace torch::unwind { + +#define UNWIND_WARN(w, ...) \ + do { \ + w.emplace_back(fmt::format(__VA_ARGS__)); \ + LOG_INFO("WARNING: {}\n", w.back()); \ + } while (0); + +struct FastSymbolizer { + FastSymbolizer() = default; + Frame symbolize(const std::string& library, uint64_t offset) { + LOG_INFO("symbolizing {} + 0x{:x}\n", library, offset); + Frame frame; + frame.funcname = "??"; + frame.filename = library; + frame.lineno = offset; + auto s = getOrCreateSections(library); + if (auto e = s->findSubprogramName(offset)) { + frame.funcname = *e; + } else { + UNWIND_WARN( + warnings_, + "failed to find subprogram name for {} 0x{:x}", + library, + offset); + } + if (auto e = findLine(s, offset)) { + frame.filename = e->first; + frame.lineno = e->second; + } else { + UNWIND_WARN( + warnings_, "failed to find file/line for {} 0x{:x}", library, offset); + } + return frame; + } + const std::vector& warnings() { + return warnings_; + } + + private: + void parseDebugInfo(Sections* s) { + uint64_t offset = 0; + while (offset < s->debug_info.size) { + DebugInfo info(*s); + info.parse(offset); + if (auto lnp_offset = info.lineNumberProgramOffset()) { + for (auto r : info.ranges()) { + s->addDebugInfoRange(r.first, r.second, line_number_programs_.size()); + } + line_number_programs_.emplace_back( + std::make_unique(*s, *lnp_offset)); + } + offset = info.nextOffset(); + } + } + Sections* getOrCreateSections(const std::string& library) { + auto it = libraries_.find(library); + if (it == libraries_.end()) { + it = libraries_.insert({library, std::make_unique()}).first; + try { + Sections* s = it->second.get(); + s->parse(library.c_str()); + parseDebugInfo(s); + } catch (UnwindError& err) { + UNWIND_WARN( + warnings_, "failed to parse library {}: {}", library, err.what()); + } + } + return it->second.get(); + } + optional> findLine( + Sections* s, + uint64_t offset) { + if (auto idx = s->findDebugInfoOffset(offset)) { + auto r = line_number_programs_.at(*idx).get(); + try { + r->parse(); + } catch (UnwindError& err) { + UNWIND_WARN( + warnings_, + "failed to read line number program [{:x}] {}", + r->offset(), + err.what()); + } + if (auto e = r->find(offset)) { + return std::make_pair(r->filename(e->file), e->line); + } + } + return std::nullopt; + } + std::unordered_map> libraries_; + std::vector> line_number_programs_; + std::vector warnings_; +}; + +} // namespace torch::unwind diff --git a/torch/csrc/profiler/unwind/fde.h b/torch/csrc/profiler/unwind/fde.h index 5e8cc0baee18f..ea8b4ca94eaea 100644 --- a/torch/csrc/profiler/unwind/fde.h +++ 
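For context, a hypothetical caller of the FastSymbolizer added above: the interface (symbolize(library, offset) returning a Frame with funcname/filename/lineno, plus warnings()) is taken from the new header, but the include path assumes the in-tree location, the library path and offset are made up, and building this standalone outside the torch tree is not guaranteed.

    #include <torch/csrc/profiler/unwind/fast_symbolizer.h>

    #include <cstdint>
    #include <iostream>

    int main() {
      torch::unwind::FastSymbolizer symbolizer;
      // Placeholder (module, offset); the profiler derives these from unwound
      // instruction pointers.
      auto frame = symbolizer.symbolize("/usr/lib/libexample.so", 0x1234);
      std::cout << frame.funcname << " (" << frame.filename << ":" << frame.lineno << ")\n";
      for (const auto& warning : symbolizer.warnings()) {
        std::cout << "warning: " << warning << '\n';
      }
    }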
b/torch/csrc/profiler/unwind/fde.h @@ -7,6 +7,8 @@ #include #include +namespace torch::unwind { + struct TableState { Action cfa; std::array registers; @@ -398,3 +400,5 @@ struct FDE { return strstr(augmentation_string_, s) != nullptr; } }; + +} // namespace torch::unwind diff --git a/torch/csrc/profiler/unwind/lexer.h b/torch/csrc/profiler/unwind/lexer.h index 0c1d33abe4e9e..117df6b9b0286 100644 --- a/torch/csrc/profiler/unwind/lexer.h +++ b/torch/csrc/profiler/unwind/lexer.h @@ -1,19 +1,31 @@ #pragma once -#include -#include +#include +#include +#include #include #include -struct Lexer { - Lexer(void* data, void* base = nullptr) - : next_((const char*)data), base_((int64_t)base) {} +namespace torch::unwind { + +template +struct LexerImpl { + LexerImpl(void* data, void* base = nullptr, void* end = nullptr) + : next_((const char*)data), + base_((int64_t)base), + end_((const char*)end) {} template T read() { T result; + auto end = next_ + sizeof(T); + UNWIND_CHECK( + !checked || end <= end_, + "read out of bounds {} >= {}", + (void*)end, + (void*)end_); memcpy(&result, next_, sizeof(T)); - next_ += sizeof(T); + next_ = end; return result; } @@ -21,7 +33,7 @@ struct Lexer { int64_t readSLEB128() { int64_t Value = 0; unsigned Shift = 0; - uint8_t Byte; + uint8_t Byte = 0; do { Byte = read(); uint64_t Slice = Byte & 0x7f; @@ -29,12 +41,12 @@ struct Lexer { (Shift == 63 && Slice != 0 && Slice != 0x7f)) { throw UnwindError("sleb128 too big for int64"); } - Value |= Slice << Shift; + Value |= int64_t(Slice << Shift); Shift += 7; } while (Byte >= 128); // Sign extend negative numbers if needed. if (Shift < 64 && (Byte & 0x40)) { - Value |= (-1ULL) << Shift; + Value |= int64_t((-1ULL) << Shift); } return Value; } @@ -42,7 +54,7 @@ struct Lexer { uint64_t readULEB128() { uint64_t Value = 0; unsigned Shift = 0; - uint8_t p; + uint8_t p = 0; do { p = read(); uint64_t Slice = p & 0x7f; @@ -56,8 +68,17 @@ struct Lexer { } const char* readCString() { auto result = next_; - next_ += strlen(next_) + 1; - return result; + if (!checked) { + next_ += strlen(next_) + 1; + return result; + } + while (next_ < end_) { + if (*next_++ == '\0') { + return result; + } + } + UNWIND_CHECK( + false, "string is out of bounds {} >= {}", (void*)next_, (void*)end_); } int64_t readEncoded(uint8_t enc) { int64_t r = 0; @@ -81,20 +102,27 @@ struct Lexer { } return readEncoded(enc); } + int64_t read4or8Length() { + return readSectionLength().first; + } + + std::pair readSectionLength() { int64_t length = read(); if (length == 0xFFFFFFFF) { - length = read(); + return std::make_pair(read(), true); } - return length; + return std::make_pair(length, false); } + void* loc() const { return (void*)next_; } - Lexer& skip(int64_t bytes) { + LexerImpl& skip(int64_t bytes) { next_ += bytes; return *this; } + int64_t readEncodedValue(uint8_t enc) { switch (enc & 0xF) { case DW_EH_PE_udata2: @@ -121,4 +149,11 @@ struct Lexer { private: const char* next_; int64_t base_; + const char* end_; }; + +// using Lexer = LexerImpl; +using CheckedLexer = LexerImpl; +using Lexer = LexerImpl; + +} // namespace torch::unwind diff --git a/torch/csrc/profiler/unwind/line_number_program.h b/torch/csrc/profiler/unwind/line_number_program.h new file mode 100644 index 0000000000000..4a1ea281e27d0 --- /dev/null +++ b/torch/csrc/profiler/unwind/line_number_program.h @@ -0,0 +1,325 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace torch::unwind { + +struct LineNumberProgram { + LineNumberProgram(Sections& s, uint64_t 
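readULEB128/readSLEB128 in the lexer above implement standard LEB128 decoding: 7 payload bits per byte, high bit set means another byte follows. A self-contained unsigned decoder using the classic DWARF example value:

    #include <cstdint>
    #include <iostream>
    #include <vector>

    uint64_t readULEB128(const uint8_t*& p) {
      uint64_t value = 0;
      unsigned shift = 0;
      uint8_t byte = 0;
      do {
        byte = *p++;
        value |= uint64_t(byte & 0x7f) << shift;  // low 7 bits are payload
        shift += 7;
      } while (byte & 0x80);                      // high bit: continue
      return value;
    }

    int main() {
      std::vector<uint8_t> bytes = {0xe5, 0x8e, 0x26};  // encodes 624485
      const uint8_t* p = bytes.data();
      std::cout << readULEB128(p) << '\n';              // 624485
    }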
offset) : s_(s), offset_(offset) {} + + uint64_t offset() { + return offset_; + } + void parse() { + if (parsed_) { + return; + } + parsed_ = true; + CheckedLexer L = s_.debug_line.lexer(offset_); + std::tie(length_, is_64bit_) = L.readSectionLength(); + program_end_ = (char*)L.loc() + length_; + auto version = L.read(); + UNWIND_CHECK( + version == 5 || version == 4, + "expected version 4 or 5 but found {}", + version); + if (version == 5) { + auto address_size = L.read(); + UNWIND_CHECK( + address_size == 8, + "expected 64-bit dwarf but found address size {}", + address_size); + segment_selector_size_ = L.read(); + } + header_length_ = is_64bit_ ? L.read() : L.read(); + program_ = L; + program_.skip(int64_t(header_length_)); + minimum_instruction_length_ = L.read(); + maximum_operations_per_instruction_ = L.read(); + default_is_stmt_ = L.read(); + line_base_ = L.read(); + line_range_ = L.read(); + opcode_base_ = L.read(); + UNWIND_CHECK(line_range_ != 0, "line_range_ must be non-zero"); + standard_opcode_lengths_.resize(opcode_base_); + for (size_t i = 1; i < opcode_base_; i++) { + standard_opcode_lengths_[i] = L.read(); + } + // fmt::print("{:x} {:x} {} {} {} {} {}\n", offset_, header_length_, + // minimum_instruction_length_, maximum_operations_per_instruction_, + // line_base_, line_range_, opcode_base_); + uint8_t directory_entry_format_count = L.read(); + + if (version == 5) { + struct Member { + uint64_t content_type; + uint64_t form; + }; + std::vector directory_members; + for (size_t i = 0; i < directory_entry_format_count; i++) { + directory_members.push_back({L.readULEB128(), L.readULEB128()}); + } + uint64_t directories_count = L.readULEB128(); + for (size_t i = 0; i < directories_count; i++) { + for (auto& member : directory_members) { + switch (member.content_type) { + case DW_LNCT_path: { + include_directories_.emplace_back( + s_.readString(L, member.form, is_64bit_, 0)); + } break; + default: { + skipForm(L, member.form); + } break; + } + } + } + + for (auto i : c10::irange(directories_count)) { + (void)i; + LOG_INFO("{} {}\n", i, include_directories_[i]); + } + auto file_name_entry_format_count = L.read(); + std::vector file_members; + for (size_t i = 0; i < file_name_entry_format_count; i++) { + file_members.push_back({L.readULEB128(), L.readULEB128()}); + } + auto files_count = L.readULEB128(); + for (size_t i = 0; i < files_count; i++) { + for (auto& member : file_members) { + switch (member.content_type) { + case DW_LNCT_path: { + file_names_.emplace_back( + s_.readString(L, member.form, is_64bit_, 0)); + } break; + case DW_LNCT_directory_index: { + file_directory_index_.emplace_back(readData(L, member.form)); + UNWIND_CHECK( + file_directory_index_.back() < include_directories_.size(), + "directory index out of range"); + } break; + default: { + skipForm(L, member.form); + } break; + } + } + } + for (auto i : c10::irange(files_count)) { + (void)i; + LOG_INFO("{} {} {}\n", i, file_names_[i], file_directory_index_[i]); + } + } else { + include_directories_.emplace_back(""); // implicit cwd + while (true) { + auto str = L.readCString(); + if (*str == '\0') { + break; + } + include_directories_.emplace_back(str); + } + file_names_.emplace_back(""); + file_directory_index_.emplace_back(0); + while (true) { + auto str = L.readCString(); + if (*str == '\0') { + break; + } + auto directory_index = L.readULEB128(); + L.readULEB128(); // mod_time + L.readULEB128(); // file_length + file_names_.emplace_back(str); + file_directory_index_.push_back(directory_index); + } + } + 
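For context on the readULEB128()/readSLEB128() calls that dominate the header parsing above: DWARF stores most integers as LEB128, a little-endian base-128 varint in which the high bit of each byte marks continuation and signed values are sign-extended from the last group. A minimal Python sketch of the decoding (illustrative only; the byte strings in the asserts are textbook LEB128 examples, not values taken from this diff):

```python
def read_uleb128(buf: bytes, pos: int = 0):
    """Decode an unsigned LEB128 value; returns (value, next_position)."""
    value = 0
    shift = 0
    while True:
        byte = buf[pos]
        pos += 1
        value |= (byte & 0x7F) << shift
        shift += 7
        if byte < 0x80:  # high bit clear terminates the sequence
            return value, pos


def read_sleb128(buf: bytes, pos: int = 0):
    """Decode a signed LEB128 value; returns (value, next_position)."""
    value = 0
    shift = 0
    while True:
        byte = buf[pos]
        pos += 1
        value |= (byte & 0x7F) << shift
        shift += 7
        if byte < 0x80:
            if shift < 64 and (byte & 0x40):  # sign-extend negative values
                value |= -(1 << shift)
            return value, pos


assert read_uleb128(b"\xe5\x8e\x26") == (624485, 3)
assert read_sleb128(b"\x7f") == (-1, 1)
```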
UNWIND_CHECK( + maximum_operations_per_instruction_ == 1, + "maximum_operations_per_instruction_ must be 1"); + UNWIND_CHECK( + minimum_instruction_length_ == 1, + "minimum_instruction_length_ must be 1"); + readProgram(); + } + struct Entry { + uint32_t file = 1; + int64_t line = 1; + }; + unwind::optional find(uint64_t address) { + auto e = program_index_.find(address); + if (!e) { + return std::nullopt; + } + return all_programs_.at(*e).find(address); + } + std::string filename(uint64_t index) { + return fmt::format( + "{}/{}", + include_directories_.at(file_directory_index_.at(index)), + file_names_.at(index)); + } + + private: + void skipForm(CheckedLexer& L, uint64_t form) { + auto sz = formSize(form, is_64bit_ ? 8 : 4); + UNWIND_CHECK(sz, "unsupported form {}", form); + L.skip(int64_t(*sz)); + } + + uint64_t readData(CheckedLexer& L, uint64_t encoding) { + switch (encoding) { + case DW_FORM_data1: + return L.read(); + case DW_FORM_data2: + return L.read(); + case DW_FORM_data4: + return L.read(); + case DW_FORM_data8: + return L.read(); + case DW_FORM_udata: + return L.readULEB128(); + default: + UNWIND_CHECK(false, "unsupported data encoding {}", encoding); + } + } + + void produceEntry() { + if (shadow_) { + return; + } + if (ranges_.size() == 1) { + start_address_ = address_; + } + PRINT_LINE_TABLE( + "{:x}\t{}\t{}\n", address_, filename(entry_.file), entry_.line); + UNWIND_CHECK( + entry_.file < file_names_.size(), + "file index {} > {} entries", + entry_.file, + file_names_.size()); + ranges_.add(address_, entry_, true); + } + void endSequence() { + if (shadow_) { + return; + } + PRINT_LINE_TABLE( + "{:x}\tEND\n", address_, filename(entry_.file), entry_.line); + program_index_.add(start_address_, all_programs_.size(), false); + program_index_.add(address_, std::nullopt, false); + all_programs_.emplace_back(std::move(ranges_)); + ranges_ = RangeTable(); + } + void readProgram() { + while (program_.loc() < program_end_) { + PRINT_INST("{:x}: ", (char*)program_.loc() - (s_.debug_line.data)); + uint8_t op = program_.read(); + if (op >= opcode_base_) { + auto op2 = int64_t(op - opcode_base_); + address_ += op2 / line_range_; + entry_.line += line_base_ + (op2 % line_range_); + PRINT_INST( + "address += {}, line += {}\n", + op2 / line_range_, + line_base_ + (op2 % line_range_)); + produceEntry(); + } else { + switch (op) { + case DW_LNS_extended_op: { + auto len = program_.readULEB128(); + auto extended_op = program_.read(); + switch (extended_op) { + case DW_LNE_end_sequence: { + PRINT_INST("end_sequence\n"); + endSequence(); + entry_ = Entry{}; + } break; + case DW_LNE_set_address: { + address_ = program_.read(); + if (!shadow_) { + PRINT_INST( + "set address {:x} {:x} {:x}\n", + address_, + min_address_, + max_address_); + } + shadow_ = address_ == 0; + } break; + default: { + PRINT_INST("skip extended op {}\n", extended_op); + program_.skip(int64_t(len - 1)); + } break; + } + } break; + case DW_LNS_copy: { + PRINT_INST("copy\n"); + produceEntry(); + } break; + case DW_LNS_advance_pc: { + PRINT_INST("advance pc\n"); + address_ += program_.readULEB128(); + } break; + case DW_LNS_advance_line: { + entry_.line += program_.readSLEB128(); + PRINT_INST("advance line {}\n", entry_.line); + + } break; + case DW_LNS_set_file: { + PRINT_INST("set file\n"); + entry_.file = program_.readULEB128(); + } break; + case DW_LNS_const_add_pc: { + PRINT_INST("const add pc\n"); + address_ += (255 - opcode_base_) / line_range_; + } break; + case DW_LNS_fixed_advance_pc: { + PRINT_INST("fixed advance 
pc\n"); + address_ += program_.read(); + } break; + default: { + PRINT_INST("other {}\n", op); + auto n = standard_opcode_lengths_[op]; + for (int i = 0; i < n; ++i) { + program_.readULEB128(); + } + } break; + } + } + } + PRINT_INST( + "{:x}: end {:x}\n", + ((char*)program_.loc() - s_.debug_line.data), + program_end_ - s_.debug_line.data); + } + + uint64_t address_ = 0; + bool shadow_ = false; + bool parsed_ = false; + Entry entry_ = {}; + std::vector include_directories_; + std::vector file_names_; + std::vector file_directory_index_; + uint8_t segment_selector_size_ = 0; + uint8_t minimum_instruction_length_ = 0; + uint8_t maximum_operations_per_instruction_ = 0; + int8_t line_base_ = 0; + uint8_t line_range_ = 0; + uint8_t opcode_base_ = 0; + bool default_is_stmt_ = false; + CheckedLexer program_ = {nullptr}; + char* program_end_ = nullptr; + uint64_t header_length_ = 0; + uint64_t length_ = 0; + bool is_64bit_ = false; + std::vector standard_opcode_lengths_; + Sections& s_; + uint64_t offset_; + uint64_t start_address_ = 0; + RangeTable program_index_; + std::vector> all_programs_; + RangeTable ranges_; +}; + +} // namespace torch::unwind diff --git a/torch/csrc/profiler/unwind/mem_file.h b/torch/csrc/profiler/unwind/mem_file.h new file mode 100644 index 0000000000000..e82ffeb2cde98 --- /dev/null +++ b/torch/csrc/profiler/unwind/mem_file.h @@ -0,0 +1,150 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace torch::unwind { + +struct Section { + char* data = nullptr; + size_t size = 0; + const char* string(size_t offset) { + return lexer(offset).readCString(); + } + CheckedLexer lexer(size_t offset) { + return CheckedLexer(data + offset, data, data + size); + } +}; + +/// Memory maps a file into the address space read-only, and manages the +/// lifetime of the mapping. Here are a few use cases: +/// 1. Used in the loader to read in initial image, and to inspect +// ELF files for dependencies before callling dlopen. +/// +/// 2. Used in unity to load the elf file. 
+struct MemFile { + explicit MemFile(const char* filename_) + : fd_(open(filename_, O_RDONLY)), + mem_(nullptr), + n_bytes_(0), + name_(filename_) { + UNWIND_CHECK( + fd_ != -1, "failed to open {}: {}", filename_, strerror(errno)); + // NOLINTNEXTLINE + struct stat s; + if (-1 == fstat(fd_, &s)) { + close(fd_); // destructors don't run during exceptions + UNWIND_CHECK(false, "failed to stat {}: {}", filename_, strerror(errno)); + } + n_bytes_ = s.st_size; + UNWIND_CHECK( + n_bytes_ > sizeof(Elf64_Ehdr), "empty shared library: {}", filename_); + mem_ = (char*)mmap(nullptr, n_bytes_, PROT_READ, MAP_SHARED, fd_, 0); + if (MAP_FAILED == mem_) { + close(fd_); + UNWIND_CHECK(false, "failed to mmap {}: {}", filename_, strerror(errno)); + } + ehdr_ = (Elf64_Ehdr*)mem_; +#define ELF_CHECK(cond) UNWIND_CHECK(cond, "not an ELF file: {}", filename_) + ELF_CHECK(ehdr_->e_ident[EI_MAG0] == ELFMAG0); + ELF_CHECK(ehdr_->e_ident[EI_MAG1] == ELFMAG1); + ELF_CHECK(ehdr_->e_ident[EI_MAG2] == ELFMAG2); + ELF_CHECK(ehdr_->e_ident[EI_MAG3] == ELFMAG3); + ELF_CHECK(ehdr_->e_ident[EI_CLASS] == ELFCLASS64); + ELF_CHECK(ehdr_->e_ident[EI_VERSION] == EV_CURRENT); + ELF_CHECK(ehdr_->e_version == EV_CURRENT); + ELF_CHECK(ehdr_->e_machine == EM_X86_64); +#undef ELF_CHECK + UNWIND_CHECK( + ehdr_->e_shoff + sizeof(Elf64_Shdr) * ehdr_->e_shnum <= n_bytes_, + "invalid section header table {} {} {}", + ehdr_->e_shoff + sizeof(Elf64_Shdr) * ehdr_->e_shnum, + n_bytes_, + ehdr_->e_shnum); + shdr_ = (Elf64_Shdr*)(mem_ + ehdr_->e_shoff); + UNWIND_CHECK( + ehdr_->e_shstrndx < ehdr_->e_shnum, "invalid strtab section offset"); + auto& strtab_hdr = shdr_[ehdr_->e_shstrndx]; + strtab_ = getSection(strtab_hdr); + } + + MemFile(const MemFile&) = delete; + MemFile& operator=(const MemFile&) = delete; + [[nodiscard]] const char* data() const { + return (const char*)mem_; + } + + /// Returns whether or not the file descriptor + /// of the underlying file is valid. 
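The constructor above is essentially "mmap the shared object read-only and refuse anything that is not a well-formed x86-64 ELF64 image". The same sanity checks can be sketched in a few lines of Python; the fixed offsets below come from the standard Elf64_Ehdr layout, and a little-endian target is assumed to match the EM_X86_64 restriction:

```python
import mmap
import struct

def elf_section_count(path: str) -> int:
    """Map an ELF64 file read-only and return e_shnum after the same sanity
    checks MemFile performs (magic, 64-bit class, section table in bounds)."""
    with open(path, "rb") as f, mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as m:
        if m[:4] != b"\x7fELF" or m[4] != 2:            # ELFMAG + ELFCLASS64
            raise ValueError(f"not a 64-bit ELF file: {path}")
        (e_shoff,) = struct.unpack_from("<Q", m, 0x28)  # section header table offset
        (e_shnum,) = struct.unpack_from("<H", m, 0x3C)  # number of section headers
        if e_shoff + 64 * e_shnum > len(m):             # sizeof(Elf64_Shdr) == 64
            raise ValueError("section header table out of range")
        return e_shnum
```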
+ int valid() { + return fcntl(fd_, F_GETFD) != -1 || errno != EBADF; + } + + ~MemFile() { + if (mem_) { + munmap((void*)mem_, n_bytes_); + } + if (fd_) { + close(fd_); + } + } + + /// Returns the size of the underlying file defined by the `MemFile` + size_t size() { + return n_bytes_; + } + [[nodiscard]] int fd() const { + return fd_; + } + + Section getSection(const Elf64_Shdr& shdr) { + UNWIND_CHECK(shdr.sh_offset + shdr.sh_size <= n_bytes_, "invalid section"); + return Section{mem_ + shdr.sh_offset, shdr.sh_size}; + } + + Section getSection(const char* name, bool optional) { + for (int i = 0; i < ehdr_->e_shnum; i++) { + if (strcmp(strtab_.string(shdr_[i].sh_name), name) == 0) { + return getSection(shdr_[i]); + } + } + UNWIND_CHECK(optional, "{} has no section {}", name_, name); + return Section{nullptr, 0}; + } + + Section strtab() { + return strtab_; + } + + private: + template + T* load(size_t offset) { + UNWIND_CHECK(offset < n_bytes_, "out of range"); + return (T*)(mem_ + offset); + } + int fd_; + char* mem_; + size_t n_bytes_; + std::string name_; + Elf64_Ehdr* ehdr_; + Elf64_Shdr* shdr_; + Section strtab_ = {nullptr, 0}; +}; + +} // namespace torch::unwind diff --git a/torch/csrc/profiler/unwind/range_table.h b/torch/csrc/profiler/unwind/range_table.h new file mode 100644 index 0000000000000..08cb4f492fb6c --- /dev/null +++ b/torch/csrc/profiler/unwind/range_table.h @@ -0,0 +1,74 @@ +#pragma once +#include +#include +#include +#include +#include +#include + +namespace torch::unwind { +template +struct RangeTable { + RangeTable() { + // guarentee that lower_bound[-1] is always valid + addresses_.push_back(0); + payloads_.emplace_back(std::nullopt); + } + void add(uint64_t address, unwind::optional payload, bool sorted) { + if (addresses_.back() > address) { + UNWIND_CHECK(!sorted, "expected addresses to be sorted"); + sorted_ = false; + } + addresses_.push_back(address); + payloads_.emplace_back(std::move(payload)); + } + unwind::optional find(uint64_t address) { + maybeSort(); + auto it = std::upper_bound(addresses_.begin(), addresses_.end(), address); + return payloads_.at(it - addresses_.begin() - 1); + } + void dump() { + for (size_t i = 0; i < addresses_.size(); i++) { + fmt::print("{} {:x}: {}\n", i, addresses_[i], payloads_[i] ? 
"" : "END"); + } + } + size_t size() const { + return addresses_.size(); + } + uint64_t back() { + maybeSort(); + return addresses_.back(); + } + + private: + void maybeSort() { + if (sorted_) { + return; + } + std::vector indices; + indices.reserve(addresses_.size()); + for (size_t i = 0; i < addresses_.size(); i++) { + indices.push_back(i); + } + std::sort(indices.begin(), indices.end(), [&](uint64_t a, uint64_t b) { + return addresses_[a] < addresses_[b] || + (addresses_[a] == addresses_[b] && + bool(payloads_[a]) < bool(payloads_[b])); + }); + std::vector addresses; + std::vector> payloads; + addresses.reserve(addresses_.size()); + payloads.reserve(addresses_.size()); + for (auto i : indices) { + addresses.push_back(addresses_[i]); + payloads.push_back(payloads_[i]); + } + addresses_ = std::move(addresses); + payloads_ = std::move(payloads); + sorted_ = true; + } + bool sorted_ = true; + std::vector addresses_; + std::vector> payloads_; +}; +} // namespace torch::unwind diff --git a/torch/csrc/profiler/unwind/sections.h b/torch/csrc/profiler/unwind/sections.h new file mode 100644 index 0000000000000..bb984cde9b397 --- /dev/null +++ b/torch/csrc/profiler/unwind/sections.h @@ -0,0 +1,124 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include + +namespace torch::unwind { + +static std::string demangle(const std::string& mangled_name) { + int status = 0; + char* realname = + abi::__cxa_demangle(mangled_name.c_str(), nullptr, nullptr, &status); + if (status == 0) { + std::string demangled_name(realname); + // NOLINTNEXTLINE + free(realname); + return demangled_name; + } else { + return mangled_name; + } +} + +struct Sections { + Sections() = default; + void parse(const char* name) { + library_ = std::make_unique(name); + strtab = library_->getSection(".strtab", false); + + symtab = library_->getSection(".symtab", true); + debug_info = library_->getSection(".debug_info", true); + if (debug_info.size > 0) { + debug_abbrev = library_->getSection(".debug_abbrev", false); + debug_str = library_->getSection(".debug_str", false); + debug_line = library_->getSection(".debug_line", false); + // dwarf 5 + debug_line_str = library_->getSection(".debug_line_str", true); + debug_rnglists = library_->getSection(".debug_rnglists", true); + debug_addr = library_->getSection(".debug_addr", true); + // dwarf 4 + debug_ranges = library_->getSection(".debug_ranges", true); + } + parseSymtab(); + } + + Section debug_info; + Section debug_abbrev; + Section debug_str; + Section debug_line; + Section debug_line_str; + Section debug_rnglists; + Section debug_ranges; + Section debug_addr; + Section symtab; + Section strtab; + + const char* readString( + CheckedLexer& data, + uint64_t encoding, + bool is_64bit, + uint64_t str_offsets_base) { + switch (encoding) { + case DW_FORM_string: { + return data.readCString(); + } + case DW_FORM_strp: { + return debug_str.string(readSegmentOffset(data, is_64bit)); + } + case DW_FORM_line_strp: { + return debug_line_str.string(readSegmentOffset(data, is_64bit)); + } + default: + UNWIND_CHECK(false, "unsupported string encoding {:x}", encoding); + } + } + + uint64_t readSegmentOffset(CheckedLexer& data, bool is_64bit) { + return is_64bit ? 
data.read() : data.read(); + } + + unwind::optional findDebugInfoOffset(uint64_t address) { + return debug_info_offsets_.find(address); + } + size_t compilationUnitCount() { + return debug_info_offsets_.size() / 2; + } + void addDebugInfoRange( + uint64_t start, + uint64_t end, + uint64_t debug_info_offset) { + debug_info_offsets_.add(start, debug_info_offset, false); + debug_info_offsets_.add(end, std::nullopt, false); + } + optional findSubprogramName(uint64_t address) { + if (auto e = symbol_table_.find(address)) { + return demangle(strtab.string(*e)); + } + return std::nullopt; + } + + private: + void parseSymtab() { + auto L = symtab.lexer(0); + char* end = symtab.data + symtab.size; + while (L.loc() < end) { + auto symbol = L.read(); + if (symbol.st_shndx == SHN_UNDEF || + ELF64_ST_TYPE(symbol.st_info) != STT_FUNC) { + continue; + } + symbol_table_.add(symbol.st_value, symbol.st_name, false); + symbol_table_.add(symbol.st_value + symbol.st_size, std::nullopt, false); + } + } + + std::unique_ptr library_; + RangeTable debug_info_offsets_; + RangeTable symbol_table_; +}; + +} // namespace torch::unwind diff --git a/torch/csrc/profiler/unwind/unwind.cpp b/torch/csrc/profiler/unwind/unwind.cpp index f3fbde151b775..74d7877edadf1 100644 --- a/torch/csrc/profiler/unwind/unwind.cpp +++ b/torch/csrc/profiler/unwind/unwind.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #if !defined(__linux__) || !defined(__x86_64__) || !defined(__has_include) || \ !__has_include("ext/stdio_filebuf.h") @@ -11,14 +12,14 @@ std::vector unwind() { "record_context_cpp is not support on non-linux non-x86_64 platforms"); } -c10::optional> libraryFor(void* addr) { +std::optional> libraryFor(void* addr) { TORCH_CHECK( false, "record_context_cpp is not support on non-linux non-x86_64 platforms"); } #ifndef FBCODE_CAFFE2 -std::vector symbolize(const std::vector& frames) { +std::vector symbolize(const std::vector& frames, Mode mode) { TORCH_CHECK( false, "record_context_cpp is not support on non-linux non-x86_64 platforms"); @@ -48,10 +49,15 @@ Stats stats() { #include #include #include +#include #include #include #include +extern "C" void unwind_c(std::vector* result, int64_t rsp, int64_t rbp); +extern "C" void unwind_entry(std::vector* result); + +namespace torch::unwind { struct UpgradeExclusive { UpgradeExclusive(std::shared_lock& rdlock) : rdlock_(rdlock) { @@ -197,7 +203,7 @@ struct UnwindCache { Unwinder unwinder = Unwinder::unknown(); try { unwinder = libraryFor(addr).unwinderFor(addr); - } catch (UnwindError& err) { + } catch (unwind::UnwindError& err) { // because unwinders are cached this will only print // once per frame that cannot be unwound. 
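The RangeTable filled by parseSymtab() above is the workhorse for all of these address lookups: each start address maps to a payload, and a nullopt payload closes the preceding range. A small Python analogue of its upper_bound-minus-one lookup, assuming entries are added in sorted order so the lazy re-sort in the C++ version can be ignored:

```python
import bisect

class RangeTable:
    """Rough Python analogue of torch::unwind::RangeTable: sorted start
    addresses map to payloads, and a None payload marks the end of a range."""

    def __init__(self):
        self.addresses = [0]   # sentinel so find() never underflows
        self.payloads = [None]

    def add(self, address, payload):
        self.addresses.append(address)
        self.payloads.append(payload)

    def find(self, address):
        i = bisect.bisect_right(self.addresses, address) - 1
        return self.payloads[i]


# A symbol covering [0x1000, 0x1080) followed by an unmapped gap:
table = RangeTable()
table.add(0x1000, "my_function")
table.add(0x1080, None)
assert table.find(0x1010) == "my_function"
assert table.find(0x2000) is None
```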
TORCH_WARN("Unsupported unwinding pattern: ", err.what()); @@ -276,53 +282,13 @@ struct UnwindCache { static UnwindCache unwind_cache; static std::shared_timed_mutex cache_mutex_; -extern "C" void unwind_c(std::vector* result, int64_t rsp, int64_t rbp); -extern "C" void unwind_c(std::vector* result, int64_t rsp, int64_t rbp) { - std::shared_lock lock(cache_mutex_); - UnwindState state{}; - // NOLINTNEXTLINE(performance-no-int-to-ptr) - state.rip = *(int64_t*)(rsp); - // +8 because we saved rsp after the return address was already pushed - // to the stack - state.rsp = rsp + 8; - state.rbp = rbp; - unwind_cache.checkRefresh(lock); - while (true) { // unwind for _start sets rip as being undefined - // NOLINTNEXTLINE(performance-no-int-to-ptr) - result->push_back((void*)state.rip); - const Unwinder& uw = unwind_cache.unwinderFor(state.rip, lock); - if (uw.terminator()) { - if (uw.isUnknown()) { - result->push_back(nullptr); - } - break; - } - state = uw.run(state); - } -} - -extern "C" void unwind_entry(std::vector* result); - -// calling convention puts the first three pointer/int64_t arguments in -// rdi rsi rdx (all caller-saved) -// rdi already holds the pointer to the result vector -// we add arguments for current rsp and rbp and then tail call -// into unwind_c -__asm__( - ".global unwind_entry\n" - "unwind_entry:\n" - "mov %rsp, %rsi;\n" - "mov %rbp, %rdx;\n" - "jmp unwind_c;\n"); - -namespace torch::unwind { std::vector unwind() { std::vector frames; unwind_entry(&frames); return frames; } -c10::optional> libraryFor(void* addr) { +std::optional> libraryFor(void* addr) { if (!addr) { return c10::nullopt; } @@ -335,6 +301,15 @@ c10::optional> libraryFor(void* addr) { library_info->name(), (uint64_t)addr - library_info->load_bias()); } +static std::string dladdr_lookup(void* addr) { + Dl_info dlinfo; + std::string funcname = "??"; + if (dladdr(addr, &dlinfo) && dlinfo.dli_sname) { + funcname = demangle(dlinfo.dli_sname); + } + return funcname; +} + struct Symbolizer { Symbolizer() { auto envar = std::getenv("TORCH_ADDR2LINE_BINARY"); @@ -345,9 +320,6 @@ struct Symbolizer { } else { addr2line_binary_ = "addr2line"; // default } - if (torch::get_disable_addr2line()) { - addr2line_binary_ = nullptr; - } } static std::lock_guard guard() { static std::mutex mutex; @@ -367,16 +339,6 @@ struct Symbolizer { frame_map_[addr] = Frame{"??", "", 0}; return; } - if (addr2line_binary_ == nullptr) { - Dl_info dlinfo; - std::string funcname = "??"; - if (dladdr(addr, &dlinfo) && dlinfo.dli_sname) { - funcname = demangle(dlinfo.dli_sname); - } - frame_map_[addr] = Frame{ - maybe_library->first, std::move(funcname), maybe_library->second - 1}; - return; - } has_pending_results_ = true; auto& entry = getOrCreate(maybe_library->first); entry.queried.push_back(addr); @@ -448,23 +410,59 @@ struct Symbolizer { frame_map_[e.queried[e.completed]] = std::move(frame); } } - std::string demangle(const std::string& mangled_name) { - int status = 0; - char* realname = - abi::__cxa_demangle(mangled_name.c_str(), nullptr, nullptr, &status); - if (status == 0) { - std::string demangled_name(realname); - // NOLINTNEXTLINE - free(realname); - return demangled_name; - } else { - return mangled_name; +}; + +static std::vector symbolize_fast( + const std::vector& frames, + Mode mode) { + static std::mutex cache_mutex; + static std::array, 2> frame_maps; + auto& frame_map = frame_maps[mode == Mode::fast ? 
0 : 1]; + + std::vector indices_to_lookup; + std::vector results; + results.reserve(frames.size()); + { + std::lock_guard lock(cache_mutex); + for (auto i : c10::irange(frames.size())) { + void* f = frames.at(i); + auto it = frame_map.find(f); + if (it == frame_map.end()) { + indices_to_lookup.push_back(i); + results.emplace_back(Frame{"??", "??", 0}); + } else { + results.emplace_back(it->second); + } } } -}; + if (!indices_to_lookup.empty()) { + // do symbolizer work + FastSymbolizer symbolizer; + for (auto i : indices_to_lookup) { + void* addr = frames.at(i); + Frame& f = results.at(i); + auto library = libraryFor(frames.at(i)); + if (library) { + if (mode == Mode::fast) { + f = symbolizer.symbolize(library->first, library->second - 1); + } else { + f = Frame{library->first, "??", library->second - 1}; + } + } + if (f.funcname == "??") { + f.funcname = dladdr_lookup(addr); + } + } + std::lock_guard lock(cache_mutex); + for (auto i : indices_to_lookup) { + frame_map.emplace(frames.at(i), results.at(i)); + } + } + return results; +} -#ifndef FBCODE_CAFFE2 -std::vector symbolize(const std::vector& frames) { +static std::vector symbolize_addr2line( + const std::vector& frames) { auto guard = Symbolizer::guard(); Symbolizer& s = Symbolizer::get(); for (auto f : frames) { @@ -477,6 +475,16 @@ std::vector symbolize(const std::vector& frames) { } return results; } + +// fbcode will use llvm symbolize since there is an llvm dependency already +#ifndef FBCODE_CAFFE2 +std::vector symbolize(const std::vector& frames, Mode mode) { + if (mode == Mode::addr2line) { + return symbolize_addr2line(frames); + } else { + return symbolize_fast(frames, mode); + } +} #endif Stats stats() { @@ -484,4 +492,42 @@ Stats stats() { } } // namespace torch::unwind + +extern "C" void unwind_c(std::vector* result, int64_t rsp, int64_t rbp) { + std::shared_lock lock(torch::unwind::cache_mutex_); + torch::unwind::UnwindState state{}; + // NOLINTNEXTLINE(performance-no-int-to-ptr) + state.rip = *(int64_t*)(rsp); + // +8 because we saved rsp after the return address was already pushed + // to the stack + state.rsp = rsp + 8; + state.rbp = rbp; + torch::unwind::unwind_cache.checkRefresh(lock); + while (true) { // unwind for _start sets rip as being undefined + // NOLINTNEXTLINE(performance-no-int-to-ptr) + result->push_back((void*)state.rip); + const torch::unwind::Unwinder& uw = + torch::unwind::unwind_cache.unwinderFor(state.rip, lock); + if (uw.terminator()) { + if (uw.isUnknown()) { + result->push_back(nullptr); + } + break; + } + state = uw.run(state); + } +} + +// calling convention puts the first three pointer/int64_t arguments in +// rdi rsi rdx (all caller-saved) +// rdi already holds the pointer to the result vector +// we add arguments for current rsp and rbp and then tail call +// into unwind_c +__asm__( + ".global unwind_entry\n" + "unwind_entry:\n" + "mov %rsp, %rsi;\n" + "mov %rbp, %rdx;\n" + "jmp unwind_c;\n"); + #endif diff --git a/torch/csrc/profiler/unwind/unwind.h b/torch/csrc/profiler/unwind/unwind.h index 69b27f49e5b79..1c302dfca445f 100644 --- a/torch/csrc/profiler/unwind/unwind.h +++ b/torch/csrc/profiler/unwind/unwind.h @@ -1,11 +1,11 @@ #pragma once #include #include +#include #include #include -namespace torch { -namespace unwind { +namespace torch::unwind { // gather current stack, relatively fast. // gets faster once the cache of program counter locations is warm. 
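The caching in symbolize_fast() above follows the hit/miss split sketched below in Python; resolve() is a hypothetical stand-in for the FastSymbolizer / dladdr work done on a miss, not an API introduced by this diff:

```python
_frame_cache: dict = {}

def symbolize_cached(frames, resolve):
    """Serve symbolization hits from a cache and resolve only the misses,
    backfilling the cache afterwards (the miss path is the expensive one)."""
    results = [_frame_cache.get(addr) for addr in frames]
    for i, hit in enumerate(results):
        if hit is None:                      # cache miss: do the slow lookup
            results[i] = resolve(frames[i])
            _frame_cache[frames[i]] = results[i]
    return results


# Toy resolver standing in for the per-library symbolizer:
print(symbolize_cached([0x1000, 0x2000, 0x1000],
                       lambda addr: ("libfoo.so", hex(addr))))
```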
TORCH_API std::vector unwind(); @@ -16,16 +16,20 @@ struct Frame { uint64_t lineno; }; +enum class Mode { addr2line, fast, dladdr }; + // note: symbolize is really slow // it will launch an addr2line process that has to parse dwarf // information from the libraries that frames point into. // Callers should first batch up all the unique void* pointers // across a number of unwind states and make a single call to // symbolize. -TORCH_API std::vector symbolize(const std::vector& frames); +TORCH_API std::vector symbolize( + const std::vector& frames, + Mode mode); // returns path to the library, and the offset of the addr inside the library -TORCH_API c10::optional> libraryFor( +TORCH_API std::optional> libraryFor( void* addr); struct Stats { @@ -36,5 +40,4 @@ struct Stats { }; Stats stats(); -} // namespace unwind -} // namespace torch +} // namespace torch::unwind diff --git a/torch/csrc/profiler/unwind/unwind_error.h b/torch/csrc/profiler/unwind/unwind_error.h index af2e4dff01090..229c5182c4159 100644 --- a/torch/csrc/profiler/unwind/unwind_error.h +++ b/torch/csrc/profiler/unwind/unwind_error.h @@ -1,6 +1,31 @@ #pragma once +#include +#include #include +namespace torch::unwind { + struct UnwindError : public std::runtime_error { using std::runtime_error::runtime_error; }; + +#define UNWIND_CHECK(cond, fmtstring, ...) \ + do { \ + if (!(cond)) { \ + throw unwind::UnwindError(fmt::format( \ + "{}:{}: " fmtstring, __FILE__, __LINE__, ##__VA_ARGS__)); \ + } \ + } while (0) + +// #define LOG_INFO(...) fmt::print(__VA_ARGS__) +#define LOG_INFO(...) + +// #define PRINT_INST(...) LOG_INFO(__VA_ARGS__) +#define PRINT_INST(...) + +// #define PRINT_LINE_TABLE(...) LOG_INFO(__VA_ARGS__) +#define PRINT_LINE_TABLE(...) + +using c10::optional; // NOLINT + +} // namespace torch::unwind diff --git a/torch/csrc/profiler/unwind/unwind_fb.cpp b/torch/csrc/profiler/unwind/unwind_fb.cpp index 22a805036f699..f40005adae829 100644 --- a/torch/csrc/profiler/unwind/unwind_fb.cpp +++ b/torch/csrc/profiler/unwind/unwind_fb.cpp @@ -5,10 +5,9 @@ #include #include -namespace torch { -namespace unwind { +namespace torch::unwind { -std::vector symbolize(const std::vector& frames) { +std::vector symbolize(const std::vector& frames, Mode mode) { static std::mutex symbolize_mutex; static llvm::symbolize::LLVMSymbolizer symbolizer; static ska::flat_hash_map frame_map_; @@ -38,7 +37,6 @@ std::vector symbolize(const std::vector& frames) { return results; } -} // namespace unwind -} // namespace torch +} // namespace torch::unwind #endif diff --git a/torch/csrc/profiler/unwind/unwinder.h b/torch/csrc/profiler/unwind/unwinder.h index 1d0a30e2f919f..d673f47af8db2 100644 --- a/torch/csrc/profiler/unwind/unwinder.h +++ b/torch/csrc/profiler/unwind/unwinder.h @@ -4,6 +4,8 @@ #include #include +namespace torch::unwind { + struct UnwindState { int64_t rip, rbp, rsp; }; @@ -75,3 +77,5 @@ struct Unwinder { int64_t rbp_off_; bool deref_{false}; }; + +} // namespace torch::unwind diff --git a/torch/csrc/profiler/util.cpp b/torch/csrc/profiler/util.cpp index 22b645c168673..f301596fca813 100644 --- a/torch/csrc/profiler/util.cpp +++ b/torch/csrc/profiler/util.cpp @@ -10,9 +10,7 @@ #include #endif #ifdef USE_DISTRIBUTED -#ifdef USE_C10D #include -#endif // USE_C10D #endif // USE_DISTRIBUTED namespace torch { @@ -20,10 +18,10 @@ namespace profiler { namespace impl { namespace { -c10::optional soft_assert_raises_; +std::optional soft_assert_raises_; } // namespace -void setSoftAssertRaises(c10::optional value) { +void 
setSoftAssertRaises(std::optional value) { soft_assert_raises_ = value; } @@ -337,7 +335,6 @@ std::vector inputTypes(const at::RecordFunction& fn) { // -- NCCL Metadata ----------------------------------------------------------- // ---------------------------------------------------------------------------- #ifdef USE_DISTRIBUTED -#ifdef USE_C10D static constexpr auto kCommsName = "Collective name"; static constexpr auto kDtype = "dtype"; static constexpr auto kInMsgNelems = "In msg nelems"; @@ -352,14 +349,12 @@ static constexpr auto kProcessGroupDesc = "Process Group Description"; static constexpr auto kGroupRanks = "Process Group Ranks"; static constexpr int32_t kTruncatLength = 30; -#endif // USE_C10D #endif // USE_DISTRIBUTED std::unordered_map saveNcclMeta( const at::RecordFunction& fn) { std::unordered_map map; #ifdef USE_DISTRIBUTED -#ifdef USE_C10D auto debugInfo = dynamic_cast( c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::PARAM_COMMS_INFO)); if (debugInfo == nullptr) { @@ -434,7 +429,6 @@ std::unordered_map saveNcclMeta( ", "), groupRanks.back())); } -#endif // USE_C10D #endif // USE_DISTRIBUTED return map; } diff --git a/torch/csrc/profiler/util.h b/torch/csrc/profiler/util.h index e27d4084412c8..c8216c93f41c5 100644 --- a/torch/csrc/profiler/util.h +++ b/torch/csrc/profiler/util.h @@ -38,7 +38,7 @@ namespace torch { namespace profiler { namespace impl { TORCH_API bool softAssertRaises(); -TORCH_API void setSoftAssertRaises(c10::optional value); +TORCH_API void setSoftAssertRaises(std::optional value); TORCH_API void logSoftAssert( const char* func, const char* file, diff --git a/torch/csrc/tensor/python_tensor.cpp b/torch/csrc/tensor/python_tensor.cpp index 4ea523cedc942..8d18180ed9195 100644 --- a/torch/csrc/tensor/python_tensor.cpp +++ b/torch/csrc/tensor/python_tensor.cpp @@ -314,8 +314,8 @@ static void set_default_storage_type(Backend backend, ScalarType dtype) { } static void set_default_tensor_type( - c10::optional backend, - c10::optional dtype) { + std::optional backend, + std::optional dtype) { if (backend.has_value()) { TORCH_CHECK_TYPE( *backend != Backend::Undefined, "default type cannot be undefined"); diff --git a/torch/csrc/utils.h b/torch/csrc/utils.h index 5a610c28d2b1e..7552f6d0c028a 100644 --- a/torch/csrc/utils.h +++ b/torch/csrc/utils.h @@ -206,7 +206,7 @@ bool maybeThrowBackCompatKeepdimWarn(char* func); // NB: This is in torch/csrc/cuda/utils.cpp, for whatever reason #ifdef USE_CUDA -std::vector> +std::vector> THPUtils_PySequence_to_CUDAStreamList(PyObject* obj); #endif diff --git a/torch/csrc/utils/cpp_stacktraces.cpp b/torch/csrc/utils/cpp_stacktraces.cpp index a04342976e613..715271d76c826 100644 --- a/torch/csrc/utils/cpp_stacktraces.cpp +++ b/torch/csrc/utils/cpp_stacktraces.cpp @@ -47,9 +47,31 @@ bool get_cpp_stacktraces_enabled() { return enabled; } -bool get_disable_addr2line() { - static bool disabled = compute_disable_addr2line(); - return disabled; +static torch::unwind::Mode compute_symbolize_mode() { + auto envar_c = std::getenv("TORCH_SYMBOLIZE_MODE"); + if (envar_c) { + std::string envar = envar_c; + if (envar == "dladdr") { + return unwind::Mode::dladdr; + } else if (envar == "addr2line") { + return unwind::Mode::addr2line; + } else if (envar == "fast") { + return unwind::Mode::fast; + } else { + TORCH_CHECK( + false, + "expected {dladdr, addr2line, fast} for TORCH_SYMBOLIZE_MODE, got ", + envar); + } + } else { + return compute_disable_addr2line() ? 
unwind::Mode::dladdr + : unwind::Mode::addr2line; + } +} + +unwind::Mode get_symbolize_mode() { + static unwind::Mode mode = compute_symbolize_mode(); + return mode; } } // namespace torch diff --git a/torch/csrc/utils/cpp_stacktraces.h b/torch/csrc/utils/cpp_stacktraces.h index 30602b0c9b731..8c38e972faf71 100644 --- a/torch/csrc/utils/cpp_stacktraces.h +++ b/torch/csrc/utils/cpp_stacktraces.h @@ -1,8 +1,9 @@ #pragma once #include +#include namespace torch { TORCH_API bool get_cpp_stacktraces_enabled(); -TORCH_API bool get_disable_addr2line(); +TORCH_API torch::unwind::Mode get_symbolize_mode(); } // namespace torch diff --git a/torch/csrc/utils/device_lazy_init.h b/torch/csrc/utils/device_lazy_init.h index b290ae04d792e..4d736898e5359 100644 --- a/torch/csrc/utils/device_lazy_init.h +++ b/torch/csrc/utils/device_lazy_init.h @@ -33,7 +33,7 @@ static inline void maybe_initialize_device(at::Device& device) { } } -static inline void maybe_initialize_device(c10::optional& device) { +static inline void maybe_initialize_device(std::optional& device) { if (!device.has_value()) { return; } diff --git a/torch/csrc/utils/out_types.cpp b/torch/csrc/utils/out_types.cpp index 3d55b9caaf1ca..7e712f2087169 100644 --- a/torch/csrc/utils/out_types.cpp +++ b/torch/csrc/utils/out_types.cpp @@ -7,10 +7,10 @@ namespace utils { // consistent with the out tensor's options void check_out_type_matches( const at::Tensor& result, - c10::optional scalarType, + std::optional scalarType, bool scalarType_is_none, - c10::optional layout, - c10::optional device, + std::optional layout, + std::optional device, bool device_is_none) { if (scalarType_is_none && !layout && device_is_none) { // common case return; diff --git a/torch/csrc/utils/out_types.h b/torch/csrc/utils/out_types.h index 1cab00bc270f2..68bf759f30038 100644 --- a/torch/csrc/utils/out_types.h +++ b/torch/csrc/utils/out_types.h @@ -7,10 +7,10 @@ namespace utils { TORCH_API void check_out_type_matches( const at::Tensor& result, - c10::optional scalarType, + std::optional scalarType, bool scalarType_is_none, - c10::optional layout, - c10::optional device, + std::optional layout, + std::optional device, bool device_is_none); } diff --git a/torch/csrc/utils/python_arg_parser.cpp b/torch/csrc/utils/python_arg_parser.cpp index 9ea90e8911dbd..90c331488e0c9 100644 --- a/torch/csrc/utils/python_arg_parser.cpp +++ b/torch/csrc/utils/python_arg_parser.cpp @@ -267,7 +267,7 @@ static py::object dispatch_on_subclass( PyObject* torch_api_function, bool is_torch_function, const char* torch_function_name_str, - c10::optional maybe_mode_key = + std::optional maybe_mode_key = c10::nullopt) { py::object ret; for (auto& arg : overloaded_args) { @@ -1003,13 +1003,13 @@ std::string FunctionParameter::type_name() const { } } -static inline c10::optional parse_as_integer(const std::string& s) { +static inline std::optional parse_as_integer(const std::string& s) { if (s.empty()) return c10::nullopt; char* str_end = nullptr; long ans = strtol(s.c_str(), &str_end, 0); // *str_end == 0 if the entire string was parsed as an integer. - return (*str_end == 0) ? c10::optional(ans) : c10::nullopt; + return (*str_end == 0) ? 
std::optional(ans) : c10::nullopt; } /* diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h index 7bbef2f622ad6..06c32d52f0172 100644 --- a/torch/csrc/utils/python_arg_parser.h +++ b/torch/csrc/utils/python_arg_parser.h @@ -231,12 +231,12 @@ struct PythonArgs { inline bool has_torch_function(); inline std::string get_func_name(); inline at::Tensor tensor(int i); - inline c10::optional optionalTensor(int i); + inline std::optional optionalTensor(int i); inline at::Scalar scalar(int i); inline at::Scalar scalarWithDefault(int i, const at::Scalar& default_scalar); inline std::vector scalarlist(int i); inline std::vector tensorlist(int i); - inline torch::List> list_of_optional_tensors(int i); + inline torch::List> list_of_optional_tensors(int i); template inline std::array tensorlist_n(int i); inline std::vector intlist(int i); @@ -246,7 +246,7 @@ struct PythonArgs { inline std::vector intlistWithDefault( int i, std::vector default_intlist); - inline c10::optional generator(int i); + inline std::optional generator(int i); inline at::Storage storage(int i); inline at::Storage storage( int i, @@ -257,35 +257,35 @@ struct PythonArgs { inline at::ScalarType scalartypeWithDefault( int i, at::ScalarType default_scalartype); - inline c10::optional scalartypeOptional(int i); - inline c10::optional scalarOptional(int i); - inline c10::optional toInt64Optional(int i); - inline c10::optional toSymIntOptional(int i); - inline c10::optional toBoolOptional(int i); - inline c10::optional toDoubleOptional(int i); + inline std::optional scalartypeOptional(int i); + inline std::optional scalarOptional(int i); + inline std::optional toInt64Optional(int i); + inline std::optional toSymIntOptional(int i); + inline std::optional toBoolOptional(int i); + inline std::optional toDoubleOptional(int i); inline c10::OptionalArray doublelistOptional(int i); inline std::vector doublelist(int i); inline std::vector getDoublelist(int i); inline at::Layout layout(int i); inline at::Layout layoutWithDefault(int i, at::Layout default_layout); - inline c10::optional layoutOptional(int i); + inline std::optional layoutOptional(int i); inline at::Device device(int i); inline at::Device deviceWithDefault(int i, const at::Device& default_device); - inline c10::optional deviceOptional(int i); + inline std::optional deviceOptional(int i); inline at::Dimname dimname(int i); inline std::vector dimnamelist(int i); - inline c10::optional> toDimnameListOptional(int i); + inline std::optional> toDimnameListOptional(int i); inline at::MemoryFormat memoryformat(int i); - inline c10::optional memoryformatOptional(int i); + inline std::optional memoryformatOptional(int i); inline at::QScheme toQScheme(int i); inline std::string string(int i); inline std::string stringWithDefault(int i, const std::string& default_str); - inline c10::optional stringOptional(int i); + inline std::optional stringOptional(int i); inline c10::string_view stringView(int i); inline c10::string_view stringViewWithDefault( int i, const c10::string_view default_str); - inline c10::optional stringViewOptional(int i); + inline std::optional stringViewOptional(int i); inline PyObject* pyobject(int i); inline int64_t toInt64(int i); inline c10::SymInt toSymInt(int i); @@ -300,7 +300,7 @@ struct PythonArgs { inline bool toBool(int i); inline bool toBoolWithDefault(int i, bool default_bool); inline bool isNone(int i); - inline c10::optional toDispatchKeySetOptional(int i); + inline std::optional toDispatchKeySetOptional(int i); private: 
at::Tensor tensor_slow(int i); @@ -393,7 +393,7 @@ inline at::Tensor PythonArgs::tensor(int i) { return tensor_slow(i); } -inline c10::optional PythonArgs::optionalTensor(int i) { +inline std::optional PythonArgs::optionalTensor(int i) { at::Tensor t = tensor(i); // NOLINTNEXTLINE(bugprone-branch-clone) if (t.defined()) { @@ -433,7 +433,7 @@ inline at::Scalar PythonArgs::scalarWithDefault( return scalar_slow(i); } -inline c10::optional PythonArgs::scalarOptional(int i) { +inline std::optional PythonArgs::scalarOptional(int i) { if (!args[i]) return c10::nullopt; return scalar_slow(i); @@ -457,15 +457,15 @@ inline std::vector PythonArgs::tensorlist(int i) { return res; } -inline torch::List> PythonArgs:: +inline torch::List> PythonArgs:: list_of_optional_tensors(int i) { if (!args[i]) - return torch::List>(); + return torch::List>(); auto tuple = six::isTuple(args[i]); THPObjectPtr arg = six::maybeAsTuple(args[i]); // NOLINTNEXTLINE(bugprone-branch-clone) auto size = tuple ? PyTuple_GET_SIZE(arg.get()) : PyList_GET_SIZE(arg.get()); - torch::List> res; + torch::List> res; res.reserve(size); for (const auto idx : c10::irange(size)) { PyObject* obj = tuple ? PyTuple_GET_ITEM(arg.get(), idx) @@ -729,7 +729,7 @@ inline std::vector PythonArgs::doublelist(int i) { return this->getDoublelist(i); } -inline c10::optional PythonArgs::toDispatchKeySetOptional( +inline std::optional PythonArgs::toDispatchKeySetOptional( int i) { if (!args[i]) { return {}; @@ -769,7 +769,7 @@ inline at::ScalarType PythonArgs::scalartype(int i) { return toScalarType(obj); } -inline c10::optional PythonArgs::scalartypeOptional(int i) { +inline std::optional PythonArgs::scalartypeOptional(int i) { if (!args[i]) return c10::nullopt; return scalartype(i); @@ -794,7 +794,7 @@ inline at::Layout PythonArgs::layoutWithDefault( return layout(i); } -inline c10::optional PythonArgs::layoutOptional(int i) { +inline std::optional PythonArgs::layoutOptional(int i) { if (!args[i]) return c10::nullopt; return layout(i); @@ -835,7 +835,7 @@ inline at::Device PythonArgs::deviceWithDefault( return device(i); } -inline c10::optional PythonArgs::deviceOptional(int i) { +inline std::optional PythonArgs::deviceOptional(int i) { if (!args[i]) return c10::nullopt; return device(i); @@ -860,7 +860,7 @@ inline std::vector parseDimnameList(PyObject* arg) { return res; } -inline c10::optional> PythonArgs:: +inline std::optional> PythonArgs:: toDimnameListOptional(int i) { if (!args[i]) return c10::nullopt; @@ -888,7 +888,7 @@ inline at::MemoryFormat PythonArgs::memoryformat(int i) { return memory_format->memory_format; } -inline c10::optional PythonArgs::memoryformatOptional(int i) { +inline std::optional PythonArgs::memoryformatOptional(int i) { if (!args[i]) return c10::nullopt; return memoryformat(i); @@ -916,7 +916,7 @@ inline std::string PythonArgs::stringWithDefault( return THPUtils_unpackString(args[i]); } -inline c10::optional PythonArgs::stringOptional(int i) { +inline std::optional PythonArgs::stringOptional(int i) { if (!args[i]) return c10::nullopt; return THPUtils_unpackString(args[i]); @@ -934,7 +934,7 @@ inline c10::string_view PythonArgs::stringViewWithDefault( return THPUtils_unpackStringView(args[i]); } -inline c10::optional PythonArgs::stringViewOptional(int i) { +inline std::optional PythonArgs::stringViewOptional(int i) { if (!args[i]) return c10::nullopt; return THPUtils_unpackStringView(args[i]); @@ -988,26 +988,26 @@ inline int64_t PythonArgs::toInt64WithDefault(int i, int64_t default_int) { return toInt64(i); } -inline 
c10::optional PythonArgs::toInt64Optional(int i) { +inline std::optional PythonArgs::toInt64Optional(int i) { if (!args[i]) return c10::nullopt; return toInt64(i); } -inline c10::optional PythonArgs::toSymIntOptional(int i) { +inline std::optional PythonArgs::toSymIntOptional(int i) { if (!args[i]) return c10::nullopt; return toSymInt(i); } -inline c10::optional PythonArgs::toBoolOptional(int i) { +inline std::optional PythonArgs::toBoolOptional(int i) { if (!args[i]) { return c10::nullopt; } return toBool(i); } -inline c10::optional PythonArgs::toDoubleOptional(int i) { +inline std::optional PythonArgs::toDoubleOptional(int i) { if (!args[i]) { return c10::nullopt; } @@ -1069,7 +1069,7 @@ inline bool PythonArgs::isNone(int i) { return args[i] == nullptr; } -inline c10::optional PythonArgs::generator(int i) { +inline std::optional PythonArgs::generator(int i) { if (!args[i]) return c10::nullopt; return reinterpret_cast(args[i])->cdata; diff --git a/torch/csrc/utils/python_compat.h b/torch/csrc/utils/python_compat.h index 73b991cf3fbfc..b060db00db733 100644 --- a/torch/csrc/utils/python_compat.h +++ b/torch/csrc/utils/python_compat.h @@ -11,6 +11,7 @@ extern "C" { #define IS_PYTHON_3_11_PLUS PY_VERSION_HEX >= 0x030B00C1 #define IS_PYTHON_3_12_PLUS PY_VERSION_HEX >= 0x030C0000 +#define IS_PYTHON_3_13_PLUS PY_VERSION_HEX >= 0x030D0000 PYCAPI_COMPAT_STATIC_INLINE(int) PyCode_GetNCellvars(PyCodeObject* code) { @@ -32,6 +33,9 @@ PyCode_GetNFreevars(PyCodeObject* code) { #endif } +// Provided by CPython but getting the header for them is very hard +extern void _PyWeakref_ClearRef(PyWeakReference* self); + #ifdef __cplusplus } #endif diff --git a/torch/csrc/utils/python_dispatch.cpp b/torch/csrc/utils/python_dispatch.cpp index a3e71a2542e3d..e370923b398d8 100644 --- a/torch/csrc/utils/python_dispatch.cpp +++ b/torch/csrc/utils/python_dispatch.cpp @@ -826,7 +826,7 @@ void initDispatchBindings(PyObject* module) { m.def( "_parse_dispatch_key", - [](const char* dispatch_key) -> c10::optional { + [](const char* dispatch_key) -> std::optional { try { return c10::parseDispatchKey(dispatch_key); } catch (const c10::Error& err) { diff --git a/torch/csrc/utils/python_raii.h b/torch/csrc/utils/python_raii.h index 70a5ddfeb55ee..411e558715e8b 100644 --- a/torch/csrc/utils/python_raii.h +++ b/torch/csrc/utils/python_raii.h @@ -22,7 +22,7 @@ struct RAIIContextManager { } private: - c10::optional guard_; + std::optional guard_; std::tuple args_; }; @@ -55,7 +55,7 @@ struct DeprecatedRAIIContextManager { } private: - c10::optional guard_; + std::optional guard_; std::tuple args_; }; diff --git a/torch/csrc/utils/python_symnode.h b/torch/csrc/utils/python_symnode.h index c4814930507bf..f8c710cf6579f 100644 --- a/torch/csrc/utils/python_symnode.h +++ b/torch/csrc/utils/python_symnode.h @@ -140,7 +140,7 @@ class PythonSymNodeImpl : public c10::SymNodeImpl { return getPyObj().attr("int_")().cast(); } - c10::optional maybe_as_int() override { + std::optional maybe_as_int() override { py::gil_scoped_acquire acquire; const auto& r = getPyObj().attr("maybe_as_int")(); if (r.is_none()) { diff --git a/torch/csrc/utils/schema_info.cpp b/torch/csrc/utils/schema_info.cpp index 56e1c6b4a6be2..0caa5b254d279 100644 --- a/torch/csrc/utils/schema_info.cpp +++ b/torch/csrc/utils/schema_info.cpp @@ -6,7 +6,7 @@ namespace utils { void SchemaInfo::addArgumentValue( const std::string& name, const at::IValue& value) { - c10::optional index = schema_.argumentIndexWithName(name); + std::optional index = 
schema_.argumentIndexWithName(name); TORCH_INTERNAL_ASSERT( index != c10::nullopt, "Schema has no argument named ", name); value_map_[name] = value; @@ -14,7 +14,7 @@ void SchemaInfo::addArgumentValue( } void SchemaInfo::addArgumentValues( - const std::vector>& value_list) { + const std::vector>& value_list) { TORCH_INTERNAL_ASSERT( value_list.size() <= schema_.arguments().size(), "Schema does not have enough arguments for value list"); @@ -106,7 +106,7 @@ bool SchemaInfo::has_argument(c10::string_view name) { } bool SchemaInfo::is_mutable(c10::string_view name) { - c10::optional index = schema_.argumentIndexWithName(name); + std::optional index = schema_.argumentIndexWithName(name); TORCH_INTERNAL_ASSERT( index.has_value(), "Schema has no argument named ", name); @@ -144,10 +144,10 @@ bool SchemaInfo::may_alias( if (basic_check) { return true; } - c10::optional lhsAliasTypeSet = + std::optional lhsAliasTypeSet = schema_.mapTypeToAliasTypeSet( schema_.getCorrectList(lhs.type)[lhs.index].type()); - c10::optional rhsAliasTypeSet = + std::optional rhsAliasTypeSet = schema_.mapTypeToAliasTypeSet( schema_.getCorrectList(rhs.type)[rhs.index].type()); bool types_can_alias = @@ -205,10 +205,10 @@ bool SchemaInfo::may_contain_alias( bool SchemaInfo::mayContainAliasImpl( const c10::SchemaArgument& lhs, const c10::SchemaArgument& rhs) { - c10::optional lhsContainedAliasTypeSet = + std::optional lhsContainedAliasTypeSet = schema_.getAliasTypeSetContainedTypes(schema_.mapTypeToAliasTypeSet( schema_.getCorrectList(lhs.type)[lhs.index].type())); - c10::optional rhsAliasTypeSet = + std::optional rhsAliasTypeSet = schema_.mapTypeToAliasTypeSet( schema_.getCorrectList(rhs.type)[rhs.index].type()); bool types_can_alias = @@ -339,7 +339,7 @@ void SchemaInfo::initSchemaInfo() { } } } - c10::optional contained_types = + std::optional contained_types = schema_.getAliasTypeSetContainedTypes( schema_.mapTypeToAliasTypeSet(argument.type())); if (contained_types && !contained_types->empty()) { diff --git a/torch/csrc/utils/schema_info.h b/torch/csrc/utils/schema_info.h index 461f5a6f0427b..acda1bffc1538 100644 --- a/torch/csrc/utils/schema_info.h +++ b/torch/csrc/utils/schema_info.h @@ -61,7 +61,7 @@ struct TORCH_API SchemaInfo { void addArgumentValue(const std::string& name, const at::IValue& value); void addArgumentValues( - const std::vector>& value_list); + const std::vector>& value_list); void addArgumentValues( const std::unordered_map& values); diff --git a/torch/csrc/utils/tensor_new.cpp b/torch/csrc/utils/tensor_new.cpp index e1755b5b36248..4fd398d1a8faf 100644 --- a/torch/csrc/utils/tensor_new.cpp +++ b/torch/csrc/utils/tensor_new.cpp @@ -42,7 +42,7 @@ using at::ScalarType; using at::Storage; using at::Tensor; using at::TensorOptions; -using c10::optional; +using std::optional; namespace torch::utils { namespace { @@ -53,7 +53,7 @@ thread_local bool kOnlyLiftCPUTensors = false; TensorOptions build_options( c10::TensorOptions options, at::ScalarType scalar_type, - const c10::optional& device = c10::nullopt) { + const std::optional& device = c10::nullopt) { options = options.dtype(scalar_type); if (device.has_value()) { return options.device(device); @@ -172,7 +172,7 @@ ScalarType infer_scalar_type(PyObject* obj) { Py_TYPE(obj)->tp_name, "'"); if (PySequence_Check(obj)) { - c10::optional scalarType; + std::optional scalarType; auto length = PySequence_Length(obj); if (length < 0) throw python_error(); @@ -290,7 +290,7 @@ void recursive_store( Tensor internal_new_from_data( c10::TensorOptions options, 
at::ScalarType scalar_type, - c10::optional device_opt, + std::optional device_opt, PyObject* data, bool copy_variables, bool copy_numpy, @@ -489,7 +489,7 @@ Tensor internal_new_from_data( Tensor new_from_data_copy( c10::TensorOptions options, at::ScalarType scalar_type, - c10::optional device, + std::optional device, PyObject* data) { return internal_new_from_data( options, @@ -504,7 +504,7 @@ Tensor new_from_data_copy( Tensor legacy_new_from_sequence( c10::TensorOptions options, at::ScalarType scalar_type, - c10::optional device, + std::optional device, PyObject* data) { TORCH_CHECK_TYPE( PySequence_Check(data), @@ -570,7 +570,7 @@ void check_base_legacy_new( // TODO: Make this accept options instead of dispatch key void check_legacy_ctor_device( c10::DispatchKey dispatch_key, - c10::optional device) { + std::optional device) { if (device.has_value()) { TORCH_CHECK( dispatchKeyToDeviceType(dispatch_key) == device.value().type(), @@ -833,7 +833,7 @@ Tensor legacy_tensor_new( Tensor indexing_tensor_from_data( c10::TensorOptions options, at::ScalarType scalar_type, - c10::optional device, + std::optional device, PyObject* data) { // Specific to tensor indexing, converts an indexing list to an // indexing tensor (type Byte or Long) @@ -877,7 +877,7 @@ static Tensor sparse_compressed_tensor_ctor_worker( c10::DispatchKey dispatch_key, at::ScalarType scalar_type, PythonArgs& r, - c10::optional required_layout) { + std::optional required_layout) { TORCH_INTERNAL_ASSERT(!isSparseCsr(dispatchKeyToBackend(dispatch_key))); TORCH_INTERNAL_ASSERT(!isSparse(dispatchKeyToBackend(dispatch_key))); enum { @@ -971,7 +971,7 @@ static Tensor sparse_compressed_tensor_ctor_worker( /*copy_variables=*/false, /*copy_numpy=*/true, /*type_inference=*/true); - c10::optional layout = + std::optional layout = (required_layout ? r.layoutWithDefault(ARG_LAYOUT, required_layout.value()) : r.layoutOptional(ARG_LAYOUT)); @@ -1027,7 +1027,7 @@ static Tensor sparse_compressed_tensor_ctor_worker( /*copy_variables=*/false, /*copy_numpy=*/true, /*type_inference=*/true); - c10::optional layout = + std::optional layout = (required_layout ? 
r.layoutWithDefault(ARG_LAYOUT1, required_layout.value()) : r.layoutOptional(ARG_LAYOUT1)); @@ -1054,7 +1054,7 @@ Tensor sparse_compressed_tensor_ctor( c10::DispatchKey dispatch_key, at::ScalarType scalar_type, PythonArgs& r) { - c10::optional required_layout{}; + std::optional required_layout{}; return sparse_compressed_tensor_ctor_worker( "sparse_compressed_tensor", dispatch_key, @@ -1067,7 +1067,7 @@ Tensor sparse_csr_tensor_ctor( c10::DispatchKey dispatch_key, at::ScalarType scalar_type, PythonArgs& r) { - c10::optional required_layout(c10::Layout::SparseCsr); + std::optional required_layout(c10::Layout::SparseCsr); return sparse_compressed_tensor_ctor_worker( "sparse_csr_tensor", dispatch_key, scalar_type, r, required_layout); } @@ -1076,7 +1076,7 @@ Tensor sparse_csc_tensor_ctor( c10::DispatchKey dispatch_key, at::ScalarType scalar_type, PythonArgs& r) { - c10::optional required_layout(c10::Layout::SparseCsc); + std::optional required_layout(c10::Layout::SparseCsc); return sparse_compressed_tensor_ctor_worker( "sparse_csc_tensor", dispatch_key, scalar_type, r, required_layout); } @@ -1085,7 +1085,7 @@ Tensor sparse_bsr_tensor_ctor( c10::DispatchKey dispatch_key, at::ScalarType scalar_type, PythonArgs& r) { - c10::optional required_layout(c10::Layout::SparseBsr); + std::optional required_layout(c10::Layout::SparseBsr); return sparse_compressed_tensor_ctor_worker( "sparse_bsr_tensor", dispatch_key, scalar_type, r, required_layout); } @@ -1094,7 +1094,7 @@ Tensor sparse_bsc_tensor_ctor( c10::DispatchKey dispatch_key, at::ScalarType scalar_type, PythonArgs& r) { - c10::optional required_layout(c10::Layout::SparseBsc); + std::optional required_layout(c10::Layout::SparseBsc); return sparse_compressed_tensor_ctor_worker( "sparse_bsc_tensor", dispatch_key, scalar_type, r, required_layout); } @@ -1660,9 +1660,9 @@ Tensor tensor_fromDLPack(PyObject* data) { Tensor asarray( PyObject* obj, - c10::optional dtype, - c10::optional device, - c10::optional copy, + std::optional dtype, + std::optional device, + std::optional copy, bool requires_grad) { Tensor tensor; diff --git a/torch/csrc/utils/tensor_new.h b/torch/csrc/utils/tensor_new.h index a1c34bd448882..70a4fbca0bac3 100644 --- a/torch/csrc/utils/tensor_new.h +++ b/torch/csrc/utils/tensor_new.h @@ -44,7 +44,7 @@ at::Tensor legacy_tensor_new( at::Tensor indexing_tensor_from_data( c10::TensorOptions options, at::ScalarType scalar_type, - c10::optional device, + std::optional device, PyObject* data); at::Tensor sparse_coo_tensor_ctor( c10::DispatchKey dispatch_key, @@ -130,9 +130,9 @@ at::Tensor tensor_frombuffer( at::Tensor tensor_fromDLPack(PyObject* data); at::Tensor asarray( PyObject* obj, - c10::optional dtype, - c10::optional device, - c10::optional copy, + std::optional dtype, + std::optional device, + std::optional copy, bool requires_grad); } // namespace utils } // namespace torch diff --git a/torch/csrc/utils/tensor_numpy.cpp b/torch/csrc/utils/tensor_numpy.cpp index a94ed7783dfd5..9b07b9d32f1c0 100644 --- a/torch/csrc/utils/tensor_numpy.cpp +++ b/torch/csrc/utils/tensor_numpy.cpp @@ -473,7 +473,7 @@ at::Tensor tensor_from_cuda_array_interface(PyObject* obj) { } } - const auto target_device = [&]() -> c10::optional { + const auto target_device = [&]() -> std::optional { // note(crcrpar): zero-size arrays come with nullptr. 
// ref: // https://numba.readthedocs.io/en/stable/cuda/cuda_array_interface.html#cuda-array-interface-version-3 diff --git a/torch/csrc/utils/torch_dispatch_mode.h b/torch/csrc/utils/torch_dispatch_mode.h index 79173aeb3e007..d1c1392e37d63 100644 --- a/torch/csrc/utils/torch_dispatch_mode.h +++ b/torch/csrc/utils/torch_dispatch_mode.h @@ -35,7 +35,7 @@ struct StashTorchDispatchModeGuard { private: std::shared_ptr saved_mode_; - c10::optional saved_mode_key_; + std::optional saved_mode_key_; }; struct StashTorchDispatchStackGuard { diff --git a/torch/csrc/utils/variadic.h b/torch/csrc/utils/variadic.h index 9c021d9f5cd3d..78ffe29971423 100644 --- a/torch/csrc/utils/variadic.h +++ b/torch/csrc/utils/variadic.h @@ -18,7 +18,7 @@ struct CountTensors : IterArgs { void operator()(const at::Tensor& x) { out += 1; } - void operator()(const c10::optional& x) { + void operator()(const std::optional& x) { out += x.has_value(); } void operator()(at::ArrayRef xs) { diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py index 1344de8b9fde9..b8929179fd4a3 100644 --- a/torch/cuda/__init__.py +++ b/torch/cuda/__init__.py @@ -154,6 +154,14 @@ def _sleep(cycles): torch._C._cuda_sleep(cycles) +def _extract_arch_version(arch_string: str): + """Extracts the architecture string from a CUDA version""" + base = arch_string.split("_")[1] + if base.endswith("a"): + base = base[:-1] + return int(base) + + def _check_capability(): incorrect_binary_warn = """ Found GPU%d %s which requires CUDA_VERSION >= %d to @@ -177,7 +185,7 @@ def _check_capability(): name = get_device_name(d) current_arch = major * 10 + minor min_arch = min( - (int(arch.split("_")[1]) for arch in torch.cuda.get_arch_list()), + (_extract_arch_version(arch) for arch in torch.cuda.get_arch_list()), default=35, ) if current_arch < min_arch: @@ -198,7 +206,7 @@ def _check_cubins(): arch_list = get_arch_list() if len(arch_list) == 0: return - supported_sm = [int(arch.split("_")[1]) for arch in arch_list if "sm_" in arch] + supported_sm = [_extract_arch_version(arch) for arch in arch_list if "sm_" in arch] for idx in range(device_count()): cap_major, cap_minor = get_device_capability(idx) # NVIDIA GPU compute architectures are backward compatible within major version diff --git a/torch/cuda/streams.py b/torch/cuda/streams.py index 22d541f4e2879..d361213815865 100644 --- a/torch/cuda/streams.py +++ b/torch/cuda/streams.py @@ -36,7 +36,7 @@ def __new__(cls, device=None, priority=0, **kwargs): with torch.cuda.device(device): return super().__new__(cls, priority=priority, **kwargs) - def wait_event(self, event): + def wait_event(self, event) -> None: r"""Make all future work submitted to the stream wait for an event. Args: @@ -53,7 +53,7 @@ def wait_event(self, event): """ event.wait(self) - def wait_stream(self, stream): + def wait_stream(self, stream) -> None: r"""Synchronize with another stream. All future work submitted to this stream will wait until all kernels @@ -82,7 +82,7 @@ def record_event(self, event=None): event.record(self) return event - def query(self): + def query(self) -> bool: r"""Check if all the work submitted has been completed. Returns: @@ -90,7 +90,7 @@ def query(self): """ return super().query() - def synchronize(self): + def synchronize(self) -> None: r"""Wait for all the kernels in this stream to complete. .. 
note:: This is a wrapper around ``cudaStreamSynchronize()``: see @@ -102,7 +102,7 @@ def synchronize(self): def _as_parameter_(self): return ctypes.c_void_p(self.cuda_stream) - def __eq__(self, o): + def __eq__(self, o) -> bool: if isinstance(o, Stream): return super().__eq__(o) return False @@ -128,7 +128,7 @@ class ExternalStream(Stream): stream_ptr(int): Integer representation of the `cudaStream_t` value. allocated externally. device(torch.device or int, optional): the device where the stream - was originally allocated. if device is specified incorrectly, + was originally allocated. If device is specified incorrectly, subsequent launches using this stream may fail. """ @@ -183,7 +183,7 @@ def record(self, stream=None): stream = torch.cuda.current_stream() super().record(stream) - def wait(self, stream=None): + def wait(self, stream=None) -> None: r"""Make all future work submitted to the given stream wait for this event. Use ``torch.cuda.current_stream()`` if no stream is specified. @@ -212,7 +212,7 @@ def elapsed_time(self, end_event): """ return super().elapsed_time(end_event) - def synchronize(self): + def synchronize(self) -> None: r"""Wait for the event to complete. Waits until the completion of all work currently captured in this event. @@ -234,7 +234,7 @@ def ipc_handle(self): def _as_parameter_(self): return ctypes.c_void_p(self.cuda_event) - def __repr__(self): + def __repr__(self) -> str: if self.cuda_event: return f"" else: diff --git a/torch/custom_class_detail.h b/torch/custom_class_detail.h index 736d5aacdaa32..e27721c349864 100644 --- a/torch/custom_class_detail.h +++ b/torch/custom_class_detail.h @@ -61,7 +61,7 @@ struct arg { // IValue's default constructor makes it None, which is not distinguishable // from an actual, user-provided default value that is None. This boolean // helps distinguish between the two cases. - c10::optional value_; + std::optional value_; }; namespace detail { diff --git a/torch/distributed/__init__.py b/torch/distributed/__init__.py index 47e0e78a6be27..eb7a690fa9589 100644 --- a/torch/distributed/__init__.py +++ b/torch/distributed/__init__.py @@ -127,6 +127,7 @@ def breakpoint(rank: int = 0): ) from .remote_device import _remote_device + from .device_mesh import init_device_mesh, DeviceMesh set_debug_level_from_env() diff --git a/torch/distributed/_tensor/__init__.py b/torch/distributed/_tensor/__init__.py index 3e5e628b0522b..f7afe41e753c5 100644 --- a/torch/distributed/_tensor/__init__.py +++ b/torch/distributed/_tensor/__init__.py @@ -10,6 +10,8 @@ from torch.distributed._tensor.ops.utils import normalize_to_torch_size from torch.distributed._tensor.placement_types import Placement, Replicate, Shard from torch.distributed.device_mesh import _mesh_resources, DeviceMesh, init_device_mesh +from torch.optim.optimizer import _foreach_supported_types + # All public APIs from dtensor package __all__ = [ @@ -23,6 +25,12 @@ ] +# Append DTensor to the list of supported types for foreach implementation of optimizer +# so that we will try to use foreach over the for-loop implementation on CUDA. 
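+# (_foreach_supported_types defaults to [torch.Tensor, torch.nn.Parameter]; the optimizer's
+# _default_to_fused_or_foreach helper only auto-selects the multi-tensor path for types in this
+# list, so registering DTensor here is what lets DTensor parameters use the foreach fast path.)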
+if DTensor not in _foreach_supported_types: + _foreach_supported_types.append(DTensor) + + def _dtensor_init_helper( init_op, size: torch.Size, diff --git a/torch/distributed/_tensor/_collective_utils.py b/torch/distributed/_tensor/_collective_utils.py index cd62a76307f26..93052d6ddd622 100644 --- a/torch/distributed/_tensor/_collective_utils.py +++ b/torch/distributed/_tensor/_collective_utils.py @@ -177,6 +177,21 @@ def unpad_tensor(tensor: torch.Tensor, pad_dim: int, pad_size: int) -> torch.Ten ) +def fill_empty_tensor_to_shards( + shards: List[torch.Tensor], shard_dim: int, num_empty_tensors: int +) -> List[torch.Tensor]: + if num_empty_tensors == 0: + return shards + tensor_size = list(shards[0].size()) + tensor_size = [ + size if idx != shard_dim else 0 for idx, size in enumerate(tensor_size) + ] + tensor = shards[0].new_zeros(tensor_size) + for _ in range(num_empty_tensors): + shards.append(tensor) + return shards + + def spec_to_bytes(spec: "placement_types.DTensorSpec") -> int: assert spec.tensor_meta is not None, "spec should have tensor meta defined!" return spec.tensor_meta.dtype.itemsize * math.prod(spec.shape) diff --git a/torch/distributed/_tensor/placement_types.py b/torch/distributed/_tensor/placement_types.py index 6eb19de18abe0..0fb569c4fac20 100644 --- a/torch/distributed/_tensor/placement_types.py +++ b/torch/distributed/_tensor/placement_types.py @@ -7,6 +7,7 @@ import torch.distributed._functional_collectives as funcol from torch.distributed._tensor._collective_utils import ( + fill_empty_tensor_to_shards, mesh_broadcast, mesh_scatter, pad_tensor, @@ -60,9 +61,21 @@ def _split_tensor( self.dim <= tensor.ndim ), f"Sharding dim {self.dim} greater than tensor ndim {tensor.ndim}" - # chunk tensor over dimension `dim` into n slices with padding if necessary + # chunk tensor over dimension `dim` into n slices tensor_list = list(torch.chunk(tensor, num_chunks, dim=self.dim)) - # compute the chunk size inline with ``torch.chunk`` + num_empty_tensors = num_chunks - len(tensor_list) + + # if no need to have padding or tensor dim size is evenly sharded already + # we can return early. + if not with_padding or tensor.size(self.dim) % num_chunks == 0: + if contiguous: + tensor_list = [t.contiguous() for t in tensor_list] + return ( + fill_empty_tensor_to_shards(tensor_list, self.dim, num_empty_tensors), + [], + ) + + # compute the chunk size inline with ``torch.chunk`` to calculate padding full_chunk_size = (tensor.size(self.dim) + num_chunks - 1) // num_chunks # Compute chunk size for each chunk for ``self.dim`` @@ -74,26 +87,17 @@ def _split_tensor( pad_sizes = [full_chunk_size - chunk_size for chunk_size in chunk_sizes] # Reuse tensor to fill empty chunk with empty tensor - num_empty_tensors = num_chunks - len(tensor_list) - tensor_size = list(tensor_list[0].size()) - tensor_size = [ - size if idx != self.dim else 0 for idx, size in enumerate(tensor_size) - ] - tensor = tensor.new_zeros(tensor_size) - for _ in range(num_empty_tensors): - tensor_list.append(tensor) - - if with_padding or contiguous: - shard_list = [] - for shard, pad_size in zip(tensor_list, pad_sizes): - # Fill the empty tensor with zeroes with padding. 
- if with_padding and pad_size > 0: - shard = pad_tensor(shard, self.dim, pad_size) - shard = shard.contiguous() if contiguous else shard - shard_list.append(shard) - return shard_list, pad_sizes - else: - return tensor_list, pad_sizes + tensor_list = fill_empty_tensor_to_shards( + tensor_list, self.dim, num_empty_tensors + ) + shard_list = [] + for shard, pad_size in zip(tensor_list, pad_sizes): + # Fill the empty tensor with zeroes with padding. + if with_padding and pad_size > 0: + shard = pad_tensor(shard, self.dim, pad_size) + shard = shard.contiguous() if contiguous else shard + shard_list.append(shard) + return shard_list, pad_sizes @staticmethod def _local_shard_size_on_dim( @@ -141,13 +145,13 @@ def _shard_tensor( tensor, num_chunks, with_padding=True, contiguous=True ) - output = torch.empty_like(scatter_list[my_coordinate[mesh_dim]]) + mesh_dim_local_rank = my_coordinate[mesh_dim] + output = torch.empty_like(scatter_list[mesh_dim_local_rank]) mesh_scatter(output, scatter_list, mesh, mesh_dim=mesh_dim) # Only unpad if the local_tensor was padded on the dimension. - pad_size = pad_sizes[my_coordinate[mesh_dim]] - if pad_size > 0: - output = unpad_tensor(output, self.dim, pad_size) + if pad_sizes and pad_sizes[mesh_dim_local_rank] > 0: + output = unpad_tensor(output, self.dim, pad_sizes[mesh_dim_local_rank]) return output def _reduce_shard_tensor( diff --git a/torch/distributed/checkpoint/default_planner.py b/torch/distributed/checkpoint/default_planner.py index 0249f4bdf7b19..c9590c38d3e61 100644 --- a/torch/distributed/checkpoint/default_planner.py +++ b/torch/distributed/checkpoint/default_planner.py @@ -258,7 +258,7 @@ def _should_include_key(self, key: str, metadata: Metadata) -> bool: for unflattened_key in planner_data: if unflattened_keys: unflattened_keys.append( - ".".join([unflattened_keys[-1], unflattened_key]) + ".".join([unflattened_keys[-1], str(unflattened_key)]) ) else: diff --git a/torch/distributed/checkpoint/format_utils.py b/torch/distributed/checkpoint/format_utils.py index aca8c454db09e..41ebaf8be61bf 100644 --- a/torch/distributed/checkpoint/format_utils.py +++ b/torch/distributed/checkpoint/format_utils.py @@ -222,8 +222,8 @@ def torch_save_to_dcp( Given the location of a torch save file, converts it into a DCP checkpoint. Args: - torch_save_path: Filename to store the converted Torch save file. - dcp_checkpoint_dir: Directory containing the DCP checkpoint. + torch_save_path: Filename of the Torch save file. + dcp_checkpoint_dir: Directory to store the DCP checkpoint. .. warning:: To avoid OOM, it's recommended to only run this function on a single rank. diff --git a/torch/distributed/elastic/multiprocessing/api.py b/torch/distributed/elastic/multiprocessing/api.py index 72c3955e7d1e5..acfba81899c04 100644 --- a/torch/distributed/elastic/multiprocessing/api.py +++ b/torch/distributed/elastic/multiprocessing/api.py @@ -670,9 +670,13 @@ def _poll(self) -> Optional[RunProcsResult]: if self._is_done(): # we should ALWAYS have ALL the return values when all the processes are done self._worker_finished_event.set() - # Wait untill all processes are finished. At this point workers finished executing - # user function - self._pc.join() + + # At this point workers finished running the user function + # But the child process might still have not exited. Wait for them. + # pc.join() blocks [forever] until "a" proc exits. Loop until all of them exits. 
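+ # (torch.multiprocessing's ProcessContext.join() returns True only once every
+ # child process has exited, and False while some are still running.)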
+ while not self._pc.join(): + logger.debug("entrypoint fn finished, waiting for all child procs to exit...") + _validate_full_rank( self._return_values, self.nprocs, "return_value queue" ) diff --git a/torch/distributed/fsdp/_flat_param.py b/torch/distributed/fsdp/_flat_param.py index 2f344d19e9305..ed141465155c0 100644 --- a/torch/distributed/fsdp/_flat_param.py +++ b/torch/distributed/fsdp/_flat_param.py @@ -1184,12 +1184,14 @@ def init_flat_param_attributes(self) -> None: flat_param._local_shard = flat_param.data if self._offload_params: # Pin the memory for faster H2D transfer - flat_param._local_shard = flat_param._local_shard.pin_memory() + flat_param._local_shard = flat_param._local_shard.pin_memory( + device=self.device + ) # Pre-allocate the sharded gradient on CPU to enable non-blocking # D2H transfer during the backward pass flat_param._cpu_grad = torch.zeros_like( flat_param._local_shard, device=cpu_device - ).pin_memory() + ).pin_memory(device=self.device) if self._uses_param_mixed_precision: # For parameter mixed precision, we maintain a low precision # sharded tensor on the compute device to be all-gathered (for diff --git a/torch/distributed/fsdp/_runtime_utils.py b/torch/distributed/fsdp/_runtime_utils.py index 89d9638217f1d..f1e579adae009 100644 --- a/torch/distributed/fsdp/_runtime_utils.py +++ b/torch/distributed/fsdp/_runtime_utils.py @@ -387,7 +387,7 @@ def _pre_forward( if handle and handle._offload_params and handle.flat_param._cpu_grad is None: handle.flat_param._cpu_grad = torch.zeros_like( handle.flat_param._local_shard, device=torch.device("cpu") - ).pin_memory() + ).pin_memory(device=state.compute_device) should_cast_forward_inputs = ( state._handle and not state._handle._force_full_precision diff --git a/torch/distributed/optim/functional_adagrad.py b/torch/distributed/optim/functional_adagrad.py index dfd50db175913..96e075c8216ca 100644 --- a/torch/distributed/optim/functional_adagrad.py +++ b/torch/distributed/optim/functional_adagrad.py @@ -30,7 +30,6 @@ def __init__( eps: float = 1e-10, coalesce_grad: bool = True, foreach: bool = False, - fused: bool = False, maximize: bool = False, _allow_empty_param_list: bool = False, ): @@ -45,7 +44,6 @@ def __init__( } self.coalesce_grad = coalesce_grad self.foreach = foreach - self.fused = fused self.maximize = maximize self.state = torch.jit.annotate(Dict[torch.Tensor, Dict[str, torch.Tensor]], {}) @@ -103,7 +101,4 @@ def step(self, gradients: List[Optional[Tensor]]): foreach=self.foreach, maximize=self.maximize, has_complex=has_complex, - fused=self.fused, - grad_scale=None, - found_inf=None, ) diff --git a/torch/distributed/pipelining/_PipelineStage.py b/torch/distributed/pipelining/_PipelineStage.py index b30d99366caf5..db0340677b172 100644 --- a/torch/distributed/pipelining/_PipelineStage.py +++ b/torch/distributed/pipelining/_PipelineStage.py @@ -7,6 +7,7 @@ import torch import torch.distributed as dist import torch.fx as fx +import torch.nn as nn from torch._subclasses.fake_tensor import FakeTensor from torch.distributed._composable.fsdp.fully_shard import FSDPModule from torch.fx.node import map_aggregate @@ -55,11 +56,11 @@ def __repr__(self): def _make_tensor_from_meta( - example: FakeTensor, + example: Union[torch.Tensor, FakeTensor], device: torch.device, ) -> torch.Tensor: """ - Create a real tensor from a fake tensor. + Create a real tensor from a tensor. 
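+ Only the example's metadata (shape, dtype, etc.) is read; the returned tensor's
+ memory is uninitialized (it comes from ``torch.empty``).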
""" return torch.empty( example.size(), @@ -142,7 +143,7 @@ def __init__( self.log_prefix = f"[Stage {self.stage_index}]" # Forward infra - self.args_recv_info: Dict[int, Tuple[InputInfo]] = {} + self.args_recv_info: Dict[int, Tuple[InputInfo, ...]] = {} self.set_requires_grad: Dict[int, bool] = {} self.act_send_info: Dict[int, List] = {} @@ -211,7 +212,7 @@ def _create_grad_recv_info( def _get_recv_ops( self, - recv_infos: Tuple[InputInfo], + recv_infos: Tuple[InputInfo, ...], ) -> List[dist.P2POp]: """ Helper function shared by `get_fwd_recv_ops` and `get_bwd_recv_ops`. @@ -239,7 +240,7 @@ def get_fwd_recv_ops(self) -> List[dist.P2POp]: Returns a list of ops that are needed to receive the input arguments for this stage. """ - recv_infos: Tuple[InputInfo] = self.args_recv_info[self.fwd_chunk_id] + recv_infos: Tuple[InputInfo, ...] = self.args_recv_info[self.fwd_chunk_id] # In case there is backward pass, set requires_grad for receive buffers # before first forward @@ -360,7 +361,7 @@ def clear_runtime_states(self) -> None: def _map_tensor_from_recv_info( self, - recv_infos: Tuple[InputInfo], + recv_infos: Tuple[InputInfo, ...], ): """ Map tensors from recv infos to a list. @@ -819,3 +820,399 @@ def __init__( # Get my pipe info pipe_info = pipe.info() super().__init__(stage_module, stage_index, pipe_info, device, group) + + +# Manual PipelineStage functions and definition + +METADATA_TENSOR_LEN = 100 +PLACEHOLDER_VAL = -1 + + +def create_empty_tensors( + tensor: Union[torch.Tensor, List[torch.Tensor]], device: torch.device +) -> List[torch.Tensor]: + """ + Creates a list of empty tensors with the same properties (like shape and dtype) as the input tensor(s), + and places them on the specified device. + Args: + tensor (Union[torch.Tensor, List[torch.tensor]]): The input tensor(s). + device (torch.device): The device where the new tensors will be placed. + Returns: + List[torch.Tensor]: A list of empty tensors with the same properties as the input tensor(s). + """ + if isinstance(tensor, torch.Tensor): + return [torch.empty_like(tensor, device=device)] + elif isinstance(tensor, (list, tuple)): + return [torch.empty_like(t, device=device) for t in tensor] + raise TypeError(f"Unsupported type {type(tensor)} cannot create empty tensors") + + +def create_metadata_tensor( + tensors: Optional[List[torch.Tensor]] = None, + device: Optional[torch.device] = torch.device("cpu"), +) -> torch.Tensor: + """ + Create a metadata tensor that can be sent over the wire. + This tensor contains the number of dimensions and the shape of each tensor being sent. + + The data is of format [num_dims, dim1, dim2, ...]. + If the tensor is None, a tensor of only placeholder values will be returned. + + Inputs: + tensors: A list of tensors, the tensors will converted into its shape dimensions and + these dimensions will be concatenated. + device: The device where the metadata tensor will be created. + If the tensor is None, then this tensor will contain PLACEHOLDER_VALs. + + """ + metadata_tensor = torch.full( + (METADATA_TENSOR_LEN,), + PLACEHOLDER_VAL, + dtype=torch.int32, + device=device, + ) + if tensors: + # Create a list of tensors containing the number of dimensions and the shape of each tensor + data = [ + # data is of format [num_dims, dim1, dim2, ...] 
+ torch.tensor( + [len(tensor.shape)] + list(tensor.shape), + dtype=torch.int32, + device=device, + ) + for tensor in tensors + ] + # Concatenate the data into a single tensor + data_tensor = torch.cat(data) + dt_shape = data_tensor.shape[0] + if dt_shape > METADATA_TENSOR_LEN: + raise ValueError( + f"Metadata tensor size ({dt_shape}) exceeds maximum allowed length ({METADATA_TENSOR_LEN})." + ) + metadata_tensor[:dt_shape] = data_tensor + return metadata_tensor + + +def extract_metadata_from_tensor(tensor: torch.Tensor) -> List[torch.Size]: + """ + Extract the number of dimensions and the shape of each tensor from a metadata tensor. + """ + metadata: List[torch.Size] = [] + i = 0 + while i < len(tensor) and tensor[i] != PLACEHOLDER_VAL: + num_dims = int(tensor[i].item()) + shape = torch.Size(tensor[i + 1 : i + 1 + num_dims].tolist()) + metadata.append(shape) + i += num_dims + 1 + return metadata + + +def get_stage_shapes( + stage_modules: List[nn.Module], + stage_ids: List[int], + num_stages: int, + rank: int, + world_size: int, + device: torch.device, + microbatch: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None, +): + """ + Performs a dry run through all the pipeline stages (a rank can have multiple pipeline stages in the case of + virtual pipelining) and returns the shape of the inputs and outputs of the module. + Only the first stage must pass in a microbatch. + + Each rank must call get_stage_shapes or the program will hang. + + Args: + stage_modules: The chunks assigned to this rank. Rhe length should be 1 for any + non-interleaved schedules and >1 for any interleaved schedules. + stage_ids: The id of the stages assigned to this rank. + num_stages: Total number of stages. + rank: Rank of the current process. + world_size: Number of processes participating in the pipeline. + device: Device where the tensors are allocated. 
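+ microbatch: Example input for the first stage; only ranks that own stage 0 need to provide it.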
+ + Returns a dictionary containing the following keys: + "inputs": Shape of the inputs to the module + "outputs": Shape of the outputs of the module + """ + + stage_id_to_shapes: Dict[int, Dict[str, list[torch.Size]]] = {} + for stage_id, model in zip(stage_ids, stage_modules): + input_shape_metadata_tensor = create_metadata_tensor(device=device) + # TODO: Assumes prev_stage == rank - 1 and next_stage == rank + 1 + prev_rank = (rank - 1) % world_size + next_rank = (rank + 1) % world_size + shapes = {} + + # first stage doesn't receive anything and uses a microbatch + if stage_id == 0: + if microbatch is None: + raise RuntimeError("Microbatch is required for first stage") + example_fwd_inputs = microbatch + if isinstance(example_fwd_inputs, torch.Tensor): + example_fwd_inputs = [example_fwd_inputs] + else: + # other stages must receive shape information + # TODO: send/recv should take a group, rather than use the default group + dist.recv(input_shape_metadata_tensor, prev_rank) + metadata = extract_metadata_from_tensor(input_shape_metadata_tensor) + example_fwd_inputs = [ + torch.empty(shape_list, device=device) for shape_list in metadata + ] + shapes["inputs"] = [fwd_input.shape for fwd_input in example_fwd_inputs] + + # perform forward + # TODO: if forward fails raise a more descriptive error explaining which stage failed + fwd_outputs = model(*example_fwd_inputs) + fwd_outputs = create_empty_tensors(fwd_outputs, device) + shapes["outputs"] = [fwd_output.shape for fwd_output in fwd_outputs] + + # send shape dims + if stage_id != num_stages - 1: + output_shape_metadata_tensor = create_metadata_tensor( + fwd_outputs, device=device + ) + dist.send(output_shape_metadata_tensor, next_rank) + stage_id_to_shapes[stage_id] = shapes + logger.info(stage_id_to_shapes) + return stage_id_to_shapes + + +class ManualPipelineStage(PipelineStageBase): + """ + A class representing a pipeline stage in a pipeline parallelism setup. + This class is created manually by providing a example input (and optionally output) + as opposed to the PipelineStage class that is outputed from pipeline(). + This class extends the `PipelineStageBase` class and can similarly be used + in `PipelineScheule`. + Args: + submodule (nn.Module): The PyTorch module wrapped by this stage. + stage_index (int): The ID of this stage. + num_stages (int): The total number of stages. + device (torch.device): The device where this stage is located. + num_microbatches (int): The number of microbatches to use. + input_args (Union[torch.Tensor, List[torch.tensor]], optional): The input arguments for the submodule. + output_args (Union[torch.Tensor, List[torch.tensor]], optional): The output arguments for the submodule. + group (dist.ProcessGroup, optional): The process group for distributed training. If None, default group. 
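+ Example (illustrative sketch; the variable names are placeholders, not part of this PR)::
+ >>> stage = ManualPipelineStage(
+ ...     submodule, stage_index=rank, num_stages=world_size,
+ ...     device=device, num_microbatches=4, input_args=example_microbatch,
+ ... )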
+ """ + + def __init__( + self, + submodule: nn.Module, + stage_index: int, + num_stages: int, + device: torch.device, + num_microbatches: int, + input_args: Union[torch.Tensor, List[torch.Tensor]], + output_args: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None, + group: Optional[dist.ProcessGroup] = None, + ): + super().__init__( + submodule, stage_index, num_stages, device, num_microbatches, group + ) + self.submod.to(self.device) + # When we materialize the model partition on cuda, we call reset_parameters() if it is available + # logger.info(f"input args {input_args=}") + self.inputs: List[torch.Tensor] = [] + self.outputs: List[torch.Tensor] = [] + + self.inputs = create_empty_tensors(input_args, device) + + if output_args is None: + logger.info("output_args not provided, performing forward using input_args") + self.outputs = self.submod(*self.inputs) + # create buffers for the output so that the data is in the correct + # shape in order to use in p2p op (send) + self.outputs = create_empty_tensors(self.outputs, device) + else: + self.outputs = create_empty_tensors(output_args, device) + + # these are the buffers used in backwards send/recv, they are allocated later + self.outputs_grad: List[torch.Tensor] = [] + + def stage_global_rank(peer_rank): + return ( + peer_rank + if self.group is None + else dist.get_global_rank(self.group, peer_rank) + ) + + self.prev_stage = stage_global_rank((self.group_rank - 1) % self.group_size) + self.next_stage = stage_global_rank((self.group_rank + 1) % self.group_size) + + # Receive info during forward + # TODO: create args_recv_info lazily? (same needed for PipelineStage) + for chunk_id in range(self.chunks): + self.set_requires_grad[chunk_id] = False + if not self.is_first: + # We assume that we always receive from stage - 1 + recv_infos = tuple( + [ + RecvInfo( + f"recv_for_{self.stage_index}_from_{self.stage_index - 1}", + self.stage_index - 1, + _make_tensor_from_meta(inp, self.device), + ) + for inp in self.inputs + ] + ) + + self.args_recv_info[chunk_id] = recv_infos + else: + self.args_recv_info[chunk_id] = tuple( + [RootArgPlaceholder() for _ in self.inputs] + ) + + # Send info during forward for each activation + # only need the rank that is being sent to + self.act_send_info: Dict[int, List] = {} + for idx in range(len(self.outputs)): + # We assume we always send to stage + 1 + if not self.is_last: + self.act_send_info[idx] = [self.stage_index + 1] + else: + self.act_send_info[idx] = [] + + logger.debug( + f"finished pipeline stage init, {self.stage_index=}, {self.is_first=}, " # noqa: G004 + f"{self.is_last=}, {self.num_stages=}, " + f"inputs: {[inp.shape for inp in self.inputs]}, " + f"output: {[output.shape for output in self.outputs]}" + ) + + def _create_grad_recv_info( + self, + act_send_info: Dict, + ) -> Tuple[RecvInfo, ...]: + grad_recv_info: Tuple[RecvInfo, ...] = () + if not self.is_last: + # Receiving gradients from multiple sources is not supported + # hence we only take the first destination + grad_recv_info = tuple( + [ + RecvInfo( + f"recv_grad_for_{self.stage_index}_from_{dst_list[0]}", + dst_list[0], + _make_tensor_from_meta(self.outputs[idx], self.device), + ) + for idx, dst_list in act_send_info.items() + ] + ) + return grad_recv_info + + def init_p2p_neighbors(self): + """ + Set up p2p communitors between previous and next stages + by sending a dummy tensor. + + If this is used, must be called for all pipeline stages. 
+ """ + ops = [] + recv_tensor = torch.zeros(1, device="cuda") + send_tensor = torch.ones(1, device="cuda") + # forward + if not self.is_first: + ops.append(dist.P2POp(dist.irecv, recv_tensor, self.prev_stage, self.group)) + if not self.is_last: + ops.append(dist.P2POp(dist.isend, send_tensor, self.next_stage, self.group)) + + # backward + if not self.is_first: + ops.append(dist.P2POp(dist.isend, send_tensor, self.prev_stage, self.group)) + if not self.is_last: + ops.append(dist.P2POp(dist.irecv, recv_tensor, self.next_stage, self.group)) + + return True + + +def validate_stage_shapes(pipeline_stages: List[ManualPipelineStage]): + """ + Check that the buffer shapes match between stages was expected by performing an all_gather between + all stages. + """ + if len(pipeline_stages) == 0: + raise ValueError("No pipeline stages provided.") + + virtual_pipeline_size = len(pipeline_stages) + all_inputs = [] + all_outputs = [] + world_size = pipeline_stages[0].group_size + num_stages = pipeline_stages[0].num_stages + + # perform all gathers between all stages + for virtual_id, stage in enumerate(pipeline_stages): + world_size = stage.group_size + stage_id: int = stage.stage_index + rank = stage.group_rank + # check that world_size and num_stages are consistent across all stages + if stage.group_size != world_size: + raise ValueError( + f"Stage id {stage_id} has world size ({stage.group_size}) \ + which does not match world size ({world_size}) of other stages." + ) + if stage.num_stages != num_stages: + raise ValueError( + f"Stage id {stage_id} has num stages ({stage.num_stages}) \ + which does not match num stages ({num_stages}) of other stages." + ) + + pg_rank = dist.get_rank(stage.group) + if rank != pg_rank: + raise ValueError( + f"Rank {rank} is not equal to process group rank {pg_rank}" + ) + + if (num_stages := stage.num_stages) % world_size != 0: + raise ValueError( + f"Number of stages ({num_stages}) must be a multiple of the world_size ({world_size})" + ) + + # all gather each ranks inputs + tensor_list = [ + create_metadata_tensor(device=stage.device) for _ in range(stage.group_size) + ] + expected_inputs = stage.inputs + stage_input = create_metadata_tensor(expected_inputs, device=stage.device) + dist.all_gather(tensor_list, stage_input) + stage_input_shapes = [ + extract_metadata_from_tensor(tensor) for tensor in tensor_list + ] + + # all gather each ranks outputs + tensor_list = [ + create_metadata_tensor(device=stage.device) for _ in range(stage.group_size) + ] + expected_outputs = stage.outputs + stage_output = create_metadata_tensor(expected_outputs, device=stage.device) + dist.all_gather(tensor_list, stage_output) + stage_output_shapes = [ + extract_metadata_from_tensor(tensor) for tensor in tensor_list + ] + + logger.debug( + f"Rank: {pg_rank}" # noqa: G004 + f"Stage id: {stage_id}" + f"Stage num stages: {stage.num_stages}" + f"Stage rank: {rank}" + f"Stage world size: {world_size}" + f"Stage {virtual_id * world_size}-{(virtual_id + 1) * world_size - 1} input shapes: {stage_input_shapes}" # noqa: G003 + f"Stage {virtual_id * world_size}-{(virtual_id + 1) * world_size - 1} output shapes: {stage_output_shapes}" # noqa: G003 + ) + + all_inputs.extend(stage_input_shapes) + all_outputs.extend(stage_output_shapes) + + # log only rank 0's view, they will all be equivalent + if pg_rank == 0: + logger.info( + f"all stage inputs: {all_inputs}" # noqa: G004 + f"all stage outputs: {all_outputs}" + ) + + # Check if the output for stage 0 matches the input at stage 1, and so forth + for i in 
range(virtual_pipeline_size * world_size - 1): + if (out := all_outputs[i]) != (inp := all_inputs[i + 1]): + raise ValueError( + f"Stage_id {i} output shape {out} at does not match stage_id {i + 1} input shape {inp}." + ) diff --git a/torch/export/_trace.py b/torch/export/_trace.py index 1ac7fd6b5e9e3..728bdf25a981f 100644 --- a/torch/export/_trace.py +++ b/torch/export/_trace.py @@ -246,6 +246,12 @@ def _get_param_buffer_mapping( for name, buffer in original_module.named_buffers(remove_duplicate=False): buffer_lookup.setdefault(id(buffer), []).append(name) + # reverse lists so FQN assignment is FIFO wrt model structure + for name, fqns in param_lookup.items(): + param_lookup[name] = fqns[::-1] + for name, fqns in buffer_lookup.items(): + buffer_lookup[name] = fqns[::-1] + param_buffer_table: Dict[str, str] = {} for dynamo_name, dynamo_param in traced_module.named_parameters( remove_duplicate=False diff --git a/torch/export/unflatten.py b/torch/export/unflatten.py index 14f91ee64679d..31701d9fb685f 100644 --- a/torch/export/unflatten.py +++ b/torch/export/unflatten.py @@ -172,32 +172,93 @@ def __init__( self.range_constraints = export_module.range_constraints self.equality_constraints: List = [] + # aliasing/unused param or buffer issues: + # in strict-mode export, dynamo export will deduplicate aliased tensors, + # and ignore unused tensors. For aliasing, this causes issues when some aliases + # are unused, and we're unable to match the placeholder node to the correct FQN. + # This leads to the graph signature potentially having the wrong target FQN, + # and downstream issues where parameters are assigned to the wrong target attribute, + # mismatching the relevant placeholder node in the unflattened module. + # To resolve this we restore (_assign_attr) all aliased/unused tensors in + # the state_dict as module attributes, but only keep the used tensors in the + # graph's forward pass (_sink_params). 
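+ # e.g. with weight tying (`self.tied = self.weight`), dynamo keeps a single placeholder
+ # for the shared tensor; the loops below first attach the FQNs named in the graph
+ # signature, then restore the remaining aliased/unused FQNs as plain module attributes
+ # so the unflattened module still exposes both names.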
state_dict = export_module.state_dict - for name in self.graph_signature.parameters: - cloned = torch.nn.Parameter(state_dict[name].clone()) + assigned_params: Set[str] = set() # tracking unused params + id_to_param: Dict[int, torch.nn.Parameter] = {} # handling weight-sharing + for name in self.graph_signature.parameters: # this loop adds used params + param = state_dict[name] + if id(param) not in id_to_param: + id_to_param[id(param)] = torch.nn.Parameter(param.clone()) + _assign_attr( - cloned, + id_to_param[id(param)], self, name, attr_kind=_AttrKind.PARAMETER, ) + assigned_params.add(name) non_persistent_buffers = set(self.graph_signature.non_persistent_buffers) - for name in self.graph_signature.buffers: + assigned_buffers: Set[str] = set() # tracking unused buffers + id_to_buffer: Dict[ + int, Tuple[torch.nn.Parameter, bool] + ] = {} # handle weight-sharing + for name in self.graph_signature.buffers: # this loop adds used buffers if name in non_persistent_buffers: persistent = False - cloned = export_module.constants[name].clone() + buffer = export_module.constants[name] else: persistent = True - cloned = state_dict[name].clone() + buffer = state_dict[name] + + if id(buffer) not in id_to_buffer: + id_to_buffer[id(buffer)] = (buffer.clone(), persistent) _assign_attr( - cloned, + id_to_buffer[id(buffer)][0], self, name, attr_kind=_AttrKind.BUFFER, persistent=persistent, ) + assigned_buffers.add(name) + + # restore aliased/unused params and buffers + # these appear in state dict but not graph signature + for name, tensor in state_dict.items(): + if name in assigned_params or name in assigned_buffers: # already assigned + continue + + is_buffer = False + if id(tensor) in id_to_buffer or not isinstance( + tensor, torch.nn.Parameter + ): # aliased buffer + is_buffer = True + + if is_buffer: + if ( + id(tensor) not in id_to_buffer + ): # this is completely unused (not weight-sharing) + id_to_buffer[id(tensor)] = ( + tensor, + True, + ) # assign to respect original model + _assign_attr( + id_to_buffer[id(tensor)][0], + self, + name, + attr_kind=_AttrKind.BUFFER, + persistent=True, + ) + else: + if id(tensor) not in id_to_param: # this is unused + id_to_param[id(tensor)] = tensor + _assign_attr( + id_to_param[id(tensor)], + self, + name, + attr_kind=_AttrKind.PARAMETER, + ) # use id map so we don't double-clone aliased constants id_to_const: Dict[int, Union[torch.Tensor, torch._C.ScriptObject]] = {} @@ -223,6 +284,7 @@ def add_to_consts_map(obj_id, node_name, target_name): name_list = consts_map[obj_id] name_list.append((node_name, target_name)) + added_params_buffers: Set[str] = set() # track aliased/unused params, buffers for s in self.graph_signature.input_specs: if s.kind == InputKind.PARAMETER or ( s.kind == InputKind.BUFFER and s.persistent @@ -233,6 +295,7 @@ def add_to_consts_map(obj_id, node_name, target_name): id(export_module.state_dict[s.target]), s.arg.name, s.target ) consts_targets.add(s.target) + added_params_buffers.add(s.target) elif ( (s.kind == InputKind.BUFFER and not s.persistent) or s.kind == InputKind.CONSTANT_TENSOR @@ -253,6 +316,18 @@ def add_to_consts_map(obj_id, node_name, target_name): ), "Constants should be either aliased or appear in graph signature" ph_name, _ = consts_map[id(const)][0] add_to_consts_map(id(const), ph_name, const_name) + added_params_buffers.add(s.target) + + # add aliased/unused params and buffers that don't appear in graph signature + for fqn, tensor in export_module.state_dict.items(): + if fqn not in added_params_buffers: + if id(tensor) 
not in consts_map: + # completely unused (no weight-sharing), ignore. + # this weight doesn't appear in graph module, + # so won't cause FQN assignment issues + continue + ph_name, _ = consts_map[id(tensor)][0] + add_to_consts_map(id(tensor), ph_name, fqn) # node name -> list of possible targets inputs_to_state: Dict[str, List[str]] = {} diff --git a/torch/fx/experimental/proxy_tensor.py b/torch/fx/experimental/proxy_tensor.py index 07905b0348473..9976c4e9beca2 100644 --- a/torch/fx/experimental/proxy_tensor.py +++ b/torch/fx/experimental/proxy_tensor.py @@ -59,6 +59,7 @@ CONSTANT_NUMEL_LIMIT = 1 +null_ctx_type = type(nullcontext) # We currently convert all SymInt to proxies before we use them. # This could plausibly be handled at the Dynamo level. pytree.register_pytree_node( @@ -214,6 +215,8 @@ def try_set_proxy_slot(outer_s, proxy_callable, *args): set_proxy_slot(tensor, tracer, _ProxyTensor(proxy, constant)) def track_tensor_tree(inner_res, proxy_res, *, constant, tracer): + _set_unbacked_bindings(inner_res, proxy_res) + def wrap_with_proxy(e, proxy, constant): if isinstance(e, torch.Tensor): track_tensor(e, proxy, tracer=tracer, constant=constant) @@ -521,21 +524,6 @@ def can_handle_tensor(x): else: constant = None - from .symbolic_shapes import compute_unbacked_bindings - # Can't use detect_fake_mode here, - # - # python test/distributed/_tensor/test_dtensor_compile.py -k - # test_tp_compile_fullgraph_is_seq_parallel_False - # - # will fail. Very strange, it probably isn't right for them to be using - # two fake modes there... - fake_mode = torch._C._get_dispatch_mode( - torch._C._TorchDispatchModeKey.FAKE - ) - if fake_mode and fake_mode.shape_env: - if symbol_to_path := compute_unbacked_bindings(fake_mode.shape_env, out): - proxy_out.node.meta["unbacked_bindings"] = symbol_to_path - track_tensor_tree(out, proxy_out, constant=constant, tracer=tracer) return out @@ -1138,145 +1126,283 @@ def create_node(self, *args, **kwargs): return node +class _MakefxTracer: + + def __init__( + self, + decomposition_table: Optional[Dict[Callable, Callable]], + tracing_mode: str, + _allow_non_fake_inputs: bool, + pre_dispatch: bool, + record_module_stack: bool, + _allow_fake_constant: bool, + _error_on_data_dependent_ops: bool + ): + # Configurations that are used to initialize the context managers and their states. + # Should not modify them during tracing. + self.decomposition_table: Dict[Callable, Callable] = decomposition_table or {} + self.decomposition_table.setdefault(torch.ops.aten.sym_numel.default, torch._decomp.decompositions.sym_numel) + self.tracing_mode: str = tracing_mode + self._allow_non_fake_inputs: bool = _allow_non_fake_inputs + self.pre_dispatch: bool = pre_dispatch + self.record_module_stack: bool = record_module_stack + self._allow_fake_constant: bool = _allow_fake_constant + self._error_on_data_dependent_ops: bool = _error_on_data_dependent_ops + + # All context managers and their states should be initialized before tracing based on the inputs + # and configurations. After tracing, their states should be cleaned except for shape_env. + # Remember to specify how to intialize it from user inputs and from parent tracer whenever + # adding new modes in _MakefxTracer. 
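+ # A plain nullcontext() doubles as the "not enabled" placeholder for each of these,
+ # so _trace_inner can enter every mode unconditionally in a single `with` block.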
+ self.fake_tensor_mode: Union[null_ctx_type, FakeTensorMode] = nullcontext() + self.proxy_mode: Union[null_ctx_type, ProxyTorchDispatchMode] = nullcontext() + self.proxy_function_mode: Union[null_ctx_type, PreDispatchTorchFunctionMode] = nullcontext() + self.fx_tracer: Union[null_ctx_type, Tracer] = nullcontext() + self.python_dispatcher_mode: Union[null_ctx_type, Any] = nullcontext() + self.torch_fn_metadata_mode: Union[null_ctx_type, TorchFunctionMetadataMode] = nullcontext() + + def _checkpoint_modes(self) -> List[Any]: + return [ + self.fake_tensor_mode, + self.proxy_mode, + self.proxy_function_mode, + self.fx_tracer, + self.python_dispatcher_mode, + self.torch_fn_metadata_mode + ] + + def _restore_modes( + self, + prev_fake_tensor_mode: Union[null_ctx_type, FakeTensorMode], + prev_proxy_mode: Union[null_ctx_type, ProxyTorchDispatchMode], + prev_proxy_function_mode: Union[null_ctx_type, PreDispatchTorchFunctionMode], + prev_fx_tracer: Union[null_ctx_type, Tracer], + prev_python_dispatcher_mode: Union[null_ctx_type, Any], + prev_torch_fn_metadata_mode : Union[null_ctx_type, TorchFunctionMetadataMode], + ) -> None: + self.fake_tensor_mode = prev_fake_tensor_mode + self.proxy_mode = prev_proxy_mode + self.proxy_function_mode = prev_proxy_function_mode + self.fx_tracer = prev_fx_tracer + self.python_dispatcher_mode = prev_python_dispatcher_mode + self.torch_fn_metadata_mode = prev_torch_fn_metadata_mode -def make_fx(f, - decomposition_table=None, - tracing_mode="real", - _allow_non_fake_inputs=False, - *, - pre_dispatch=False, - record_module_stack=False, - _allow_fake_constant=False, - _error_on_data_dependent_ops=True): - assert tracing_mode in ["real", "fake", "symbolic"] + @contextmanager + def _init_modes_from_inputs(self, f, args): + prev_modes = self._checkpoint_modes() + try: + # Avoid importing sympy at a module level + from .symbolic_shapes import ShapeEnv + if hasattr(f, "_orig_mod") and self.record_module_stack: + scope_root = f._orig_mod + self.fx_tracer = _ModuleStackTracer(scope_root) + else: + self.fx_tracer = PythonKeyTracer() + + if self.tracing_mode == "fake": + import torch._dynamo + fake_tensor_mode = torch._dynamo.utils.detect_fake_mode(args) + if fake_tensor_mode is None: + import torch._functorch.config as _config + with _config.patch(fake_tensor_allow_unsafe_data_ptr_access=False): + fake_tensor_mode = FakeTensorMode( + allow_fallback_kernels=True, + allow_non_fake_inputs=self._allow_non_fake_inputs, + shape_env=ShapeEnv(), + static_shapes=True, + ) + self.fake_tensor_mode = fake_tensor_mode + elif self.tracing_mode == "symbolic": + import torch._dynamo + fake_tensor_mode = torch._dynamo.utils.detect_fake_mode(args) + if fake_tensor_mode is None: + shape_env = ShapeEnv() + import torch._functorch.config as _config + with _config.patch(fake_tensor_allow_unsafe_data_ptr_access=False): + fake_tensor_mode = FakeTensorMode( + allow_fallback_kernels=False, + allow_non_fake_inputs=self._allow_non_fake_inputs, + shape_env=shape_env) + assert fake_tensor_mode.shape_env is not None, "shape_env should be set if tracing with 'symbolic'" + self.fake_tensor_mode = fake_tensor_mode + else: + if not self.tracing_mode == "real": + raise AssertionError(f"Unexpected tracing type: {self.tracing_mode}") - if decomposition_table is None: - decomposition_table = {} + self._construct_modes_with_fx_tracer(self.fx_tracer) + yield + finally: + self._restore_modes(*prev_modes) + + def _construct_modes_with_fx_tracer(self, fx_tracer): + self.proxy_mode = ProxyTorchDispatchMode( + fx_tracer, + 
self.tracing_mode, + pre_dispatch=self.pre_dispatch, + _allow_fake_constant=self._allow_fake_constant, + _error_on_data_dependent_ops=self._error_on_data_dependent_ops + ) - if torch.ops.aten.sym_numel.default not in decomposition_table: - decomposition_table = { - **decomposition_table, - torch.ops.aten.sym_numel.default: torch._decomp.decompositions.sym_numel - } + if self.pre_dispatch: + self.proxy_function_mode = PreDispatchTorchFunctionMode(fx_tracer) - @functools.wraps(f) - def wrapped(*args): - # Avoid importing sympy at a module level - from .symbolic_shapes import ShapeEnv + # pre-autograd tracing uses per-dispatch-key modes, + # which requires the python dispatcher + if self.tracing_mode == "symbolic" or self.pre_dispatch: + self.python_dispatcher_mode = enable_python_dispatcher() - phs = pytree.tree_map(lambda _: fx.PH, args) # type: ignore[attr-defined] + self.torch_fn_metadata_mode = TorchFunctionMetadataMode(fx_tracer) - if hasattr(f, "_orig_mod") and record_module_stack: - scope_root = f._orig_mod - fx_tracer = _ModuleStackTracer(scope_root) - else: - fx_tracer = PythonKeyTracer() - fake_tensor_mode: Any = nullcontext() - if tracing_mode == "real": - fake_tensor_mode = nullcontext() - elif tracing_mode == "fake": - import torch._dynamo - fake_tensor_mode = torch._dynamo.utils.detect_fake_mode(args) - if fake_tensor_mode is None: - import torch._functorch.config as _config - with _config.patch(fake_tensor_allow_unsafe_data_ptr_access=False): - fake_tensor_mode = FakeTensorMode( - allow_fallback_kernels=True, - allow_non_fake_inputs=_allow_non_fake_inputs, - shape_env=ShapeEnv(), - static_shapes=True, - ) - elif tracing_mode == "symbolic": - import torch._dynamo - fake_tensor_mode = torch._dynamo.utils.detect_fake_mode(args) - if fake_tensor_mode is None: - shape_env = ShapeEnv() - import torch._functorch.config as _config - with _config.patch(fake_tensor_allow_unsafe_data_ptr_access=False): - fake_tensor_mode = FakeTensorMode( - allow_fallback_kernels=False, - allow_non_fake_inputs=_allow_non_fake_inputs, - shape_env=shape_env) - else: - shape_env = fake_tensor_mode.shape_env - assert shape_env is not None, "shape_env should be set if tracing with 'symbolic'" + @contextmanager + def _init_modes_from_parent(self, parent_tracer): + # By default, subtracer creates new modes based on parent tracer's config. + # However, there are cases where we want to share the same modes with parent tracer + # For example, fake_tensor_mode, we want the example value's fake_mode of parent graph and subgraphs to be the same. 
+ prev_modes = self._checkpoint_modes() + try: + self.fake_tensor_mode = parent_tracer.fake_tensor_mode - else: - raise AssertionError(f"Unexpected tracing type: {tracing_mode}") + def _create_sub_fx_tracer(parent_tracer): + if type(parent_tracer) == PythonKeyTracer: + sub_tracer = PythonKeyTracer() + elif type(parent_tracer) == _ModuleStackTracer: + sub_tracer = _ModuleStackTracer(parent_tracer.scope_root) + else: + raise RuntimeError(f"Unexpected tracer type: {type(parent_tracer)}.") - python_dispatcher_mode: Any = nullcontext() - # pre-autograd tracing uses per-dispatch-key modes, - # which requires the python dispatcher - if tracing_mode == "symbolic" or pre_dispatch: - python_dispatcher_mode = enable_python_dispatcher() - - proxy_function_mode: Any = nullcontext() - if pre_dispatch: - proxy_function_mode = PreDispatchTorchFunctionMode(fx_tracer) - - proxy_mode = ProxyTorchDispatchMode(fx_tracer, - tracing_mode, - pre_dispatch=pre_dispatch, - _allow_fake_constant=_allow_fake_constant, - _error_on_data_dependent_ops=_error_on_data_dependent_ops) - - arg_count = 0 - - def wrap_fake(x): - nonlocal arg_count - # TODO: it would be nice to line these up with the names - # FX will choose for the placeholders, but we don't - # actually know what the names will be at this point yet - # NB: the Source here is actually meaningless - from torch._dynamo.source import ConstantSource - source = ConstantSource(f"input{arg_count}") - if isinstance(x, torch.Tensor): - arg_count += 1 - return fake_tensor_mode.from_tensor(x, source=source) # type: ignore[attr-defined] - # NB: don't match on bools - elif type(x) is int and tracing_mode == "symbolic": - return shape_env.create_symintnode(shape_env.create_symbol(x, source, positive=None), hint=x, source=source) - elif isinstance(x, torch.ScriptObject): - return torch._library.fake_class_registry.to_fake_obj(fake_tensor_mode, x) - - assert not isinstance(x, FakeScriptObject), f"ScriptObject {x} has been fakified. Cannot wrap_fake it again." - return x - - sym_mode = proxy_mode.sym_mode - - wrap_fn_map = { - "real": lambda x: x, - "fake": wrap_fake, - "symbolic": wrap_fake, - } - args = pytree.tree_map(wrap_fn_map[tracing_mode], args) - - if not hasattr(inspect.unwrap(f), '__code__') or inspect.unwrap(f).__code__.co_flags & inspect.CO_VARARGS: - # FX doesn't support varargs, so we gotta fake up a wrapper - # TODO: Would be nice to fix this at the source... 
- func = fake_signature(f, len(phs)) - else: - func = f + return sub_tracer + + self.fx_tracer = _create_sub_fx_tracer(parent_tracer.fx_tracer) + self._construct_modes_with_fx_tracer(self.fx_tracer) + yield + finally: + self._restore_modes(*prev_modes) - torch_fn_metadata_mode = TorchFunctionMetadataMode(fx_tracer) + def _trace_inner(self, f, *args): + phs = pytree.tree_map(lambda _: fx.PH, args) # type: ignore[attr-defined] + + def _wrap_fake(args: Tuple[Any]) -> Tuple[Any]: + arg_count = 0 + + def inner_wrap_fake(x): + nonlocal arg_count + # TODO: it would be nice to line these up with the names + # FX will choose for the placeholders, but we don't + # actually know what the names will be at this point yet + # NB: the Source here is actually meaningless + from torch._dynamo.source import ConstantSource + source = ConstantSource(f"input{arg_count}") + if isinstance(x, torch.Tensor): + arg_count += 1 + return self.fake_tensor_mode.from_tensor(x, source=source) # type: ignore[attr-defined] + # NB: don't match on bools + elif type(x) is int and self.tracing_mode == "symbolic": + return self.fake_tensor_mode.shape_env.create_symintnode( + self.fake_tensor_mode.shape_env.create_symbol(x, source, positive=None), + hint=x, + source=source + ) + elif isinstance(x, torch.ScriptObject): + return torch._library.fake_class_registry.to_fake_obj(self.fake_tensor_mode, x) + + assert not isinstance(x, FakeScriptObject), f"ScriptObject {x} has been fakified. Cannot wrap_fake it again." + return x + + wrap_fn_map = { + "real": lambda x: x, + "fake": inner_wrap_fake, + "symbolic": inner_wrap_fake, + } + return pytree.tree_map(wrap_fn_map[self.tracing_mode], args) + + def _wrap_func(f, phs): + if not hasattr(inspect.unwrap(f), '__code__') or inspect.unwrap(f).__code__.co_flags & inspect.CO_VARARGS: + # FX doesn't support varargs, so we gotta fake up a wrapper + # TODO: Would be nice to fix this at the source... + return fake_signature(f, len(phs)) + return f + + args = _wrap_fake(args) + func = _wrap_func(f, phs) # We disable the autocast cache as the autocast cache causes type conversions on parameters to # check a cache, which introduces untracked tensors into the graph # # We also disable tracing by any other tensor proxy-based tracers except the current. The # purpose of `make_fx` is to produce graphmodules as a side effect; its internal execution is # thus irrelevant to any external functional trace. 
- with decompose(decomposition_table), fake_tensor_mode, python_dispatcher_mode, proxy_function_mode, \ - sym_mode, torch_fn_metadata_mode, proxy_mode, disable_autocast_cache(): - t = dispatch_trace(wrap_key(func, args, fx_tracer, pre_dispatch), tracer=fx_tracer, concrete_args=tuple(phs)) + with decompose(self.decomposition_table), self.fake_tensor_mode, self.python_dispatcher_mode, self.proxy_function_mode, \ + self.proxy_mode.sym_mode, self.torch_fn_metadata_mode, \ + self.proxy_mode, disable_autocast_cache(), _set_make_fx_tracer(self): + t = dispatch_trace( + wrap_key(func, args, self.fx_tracer, self.pre_dispatch), + tracer=self.fx_tracer, + concrete_args=tuple(phs) + ) # TODO: kind of a bad way to do it, should maybe figure out a better way - if tracing_mode == "symbolic": - t.shape_env = shape_env # type: ignore[assignment] + if self.tracing_mode == "symbolic": + t.shape_env = self.fake_tensor_mode.shape_env # type: ignore[assignment] return t - return wrapped + def trace(self, f, *args) -> torch.fx.GraphModule: + with self._init_modes_from_inputs(f, args): + return self._trace_inner(f, *args) + + def trace_subgraph(self, f, *args): + # Create a new tracer based on parent's config + sub_tracer = _MakefxTracer( + self.decomposition_table, + self.tracing_mode, + self._allow_non_fake_inputs, + self.pre_dispatch, + self.record_module_stack, + self._allow_fake_constant, + self._error_on_data_dependent_ops + ) + with sub_tracer._init_modes_from_parent(self): + return sub_tracer._trace_inner(f, *args) + +_CURRENT_MAKE_FX_TRACER : Optional[_MakefxTracer] = None + +@contextmanager +def _set_make_fx_tracer(tracer: _MakefxTracer) -> None: + global _CURRENT_MAKE_FX_TRACER + prev_tracer = _CURRENT_MAKE_FX_TRACER + try: + _CURRENT_MAKE_FX_TRACER = tracer + yield + finally: + _CURRENT_MAKE_FX_TRACER = prev_tracer + +def make_fx( + f, + decomposition_table=None, + tracing_mode="real", + _allow_non_fake_inputs=False, + *, + pre_dispatch=False, + record_module_stack=False, + _allow_fake_constant=False, + _error_on_data_dependent_ops=True): + + assert tracing_mode in ["real", "fake", "symbolic"] + + make_fx_tracer = _MakefxTracer( + decomposition_table, + tracing_mode, + _allow_non_fake_inputs, + pre_dispatch, + record_module_stack, + _allow_fake_constant, + _error_on_data_dependent_ops + ) + + @functools.wraps(f) + def wrapped(*args): + return make_fx_tracer.trace(f, *args) + + return wrapped def get_torch_dispatch_modes(): return torch.utils._python_dispatch._get_current_dispatch_mode_stack() @@ -1310,3 +1436,22 @@ def get_isolated_graphmodule(func, args, kwargs, tracing_mode="real"): with disable_proxy_modes_tracing(): gm = make_fx(wrapped, tracing_mode=tracing_mode)(all_args) return gm + + +def _set_unbacked_bindings(out, out_proxy): + """A helper function for setting up unbacked_bindings on the destination FX graph.""" + from .symbolic_shapes import compute_unbacked_bindings + + # Can't use detect_fake_mode here, + # + # python test/distributed/_tensor/test_dtensor_compile.py -k + # test_tp_compile_fullgraph_is_seq_parallel_False + # + # will fail. Very strange, it probably isn't right for them to be using + # two fake modes there... 
+ fake_mode = torch._C._get_dispatch_mode( + torch._C._TorchDispatchModeKey.FAKE + ) + if fake_mode and fake_mode.shape_env: + if symbol_to_path := compute_unbacked_bindings(fake_mode.shape_env, out): + out_proxy.node.meta["unbacked_bindings"] = symbol_to_path diff --git a/torch/fx/experimental/sym_node.py b/torch/fx/experimental/sym_node.py index 8ec9b816beac9..98cba67a73a18 100644 --- a/torch/fx/experimental/sym_node.py +++ b/torch/fx/experimental/sym_node.py @@ -164,6 +164,25 @@ def maybe_as_int(self): else: return None + # NB: This does conversions, not sure if this is good or not + def maybe_as_float(self): + import sympy + + if isinstance(self.expr, sympy.Float): + return float(self.expr) + else: + return None + + def maybe_as_bool(self): + import sympy + + if self.expr is sympy.true: + return True + elif self.expr is sympy.false: + return False + else: + return None + def is_int(self): return self.pytype is int diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py index e44350276cbe9..918707399270d 100644 --- a/torch/fx/experimental/symbolic_shapes.py +++ b/torch/fx/experimental/symbolic_shapes.py @@ -83,6 +83,9 @@ class GuardOnDataDependentSymNode(RuntimeError): pass +class PendingUnbackedSymbolNotFound(RuntimeError): + pass + import sympy from sympy.printing.str import StrPrinter from sympy.printing.precedence import precedence, PRECEDENCE @@ -602,14 +605,19 @@ def free_unbacked_symbols_with_path( return r symbol_to_path = free_unbacked_symbols_with_path(example_value, ()) - assert not pending, ( - f"pending {pending} not in {example_value} " + - ( + if not peek and pending: + extra = ( repr((example_value.stride(), example_value.storage_offset())) if isinstance(example_value, torch.Tensor) else "" ) - ) + raise PendingUnbackedSymbolNotFound( + f"Pending unbacked symbols {pending} not in returned outputs {example_value} {extra}.\n" + "Did you accidentally call new_dynamic_size() or item() more times " + "than you needed to in your fake implementation?\n" + "For more help, see https://docs.google.com/document/d/1RWrH-3wLEpzR9kCS6gGBNen_-Fs-8PVbWWFE5AcgeWE/edit" + ) + # Why do we have to do some rebinding here? If the original FX node # wasn't a binding site because you had a memo hit, but post # translation you aren't a memo hit anymore, there's now a new binding @@ -3016,6 +3024,38 @@ def create_symintnode( out = SymInt(SymNode(sym, self, int, hint, fx_node=fx_node)) return out + @record_shapeenv_event() + def create_symfloatnode( + self, + sym: "sympy.Expr", + *, + hint: Optional[int], + source: Optional[Source] = None, + ): + """Create a SymFloat value from a symbolic expression""" + source_name = source.name() if source else None + + if self._translation_validation_enabled and source is not None: + # Create a new symbol for this source. + symbol = self._create_symbol_for_source(source) + assert symbol is not None + + # Create a new FX placeholder and Z3 variable for 'symbol'. + fx_node = self._create_fx_placeholder_and_z3var(symbol, float) + + # Add an equality assertion for the newly created symbol and 'sym'. 
+ self._add_assertion(sympy.Eq(symbol, sym)) + else: + fx_node = None + + if isinstance(sym, sympy.Float): + if hint is not None: + assert float(sym) == hint + out = float(sym) + else: + out = SymFloat(SymNode(sym, self, float, hint, fx_node=fx_node)) + return out + @record_shapeenv_event() def create_unspecified_symint_and_symbol(self, value, source, dynamic_dim): """Create a SymInt wrapping a new unspecified symbol""" @@ -3200,10 +3240,15 @@ def create_symbol( # If we're not duck shaping, we always create a new symbol # Even if we're duck shaping, if we haven't seen this particular # value before, we also create a new symbol - sympy_expr = make_symbol(SymT.SIZE, len(self.var_to_val), positive=positive, integer=True) + if type(val) is int: + sympy_expr = make_symbol(SymT.SIZE, len(self.var_to_val), positive=positive, integer=True) + else: + sympy_expr = make_symbol(SymT.FLOAT, len(self.var_to_val), positive=positive, real=True) # We always associate vars to vals if isinstance(val, int): self.var_to_val[sympy_expr] = sympy.Integer(val) + elif isinstance(val, float): + self.var_to_val[sympy_expr] = sympy.Float(val) else: # Only used for jagged layout nested tensors self.var_to_val[sympy_expr] = SingletonInt(val.node.nested_int(), coeff=val.node.nested_int_coeff()) @@ -3238,6 +3283,9 @@ def create_symbol( if val not in vr: raise ConstraintViolationError(f"{val} not in range [{vr.lower}, {vr.upper}]") + range_str = f"[{vr.lower}, {vr.upper}]" + elif isinstance(val, float): + self.var_to_range[sympy_expr] = vr = ValueRanges(-sympy.oo, sympy.oo) range_str = f"[{vr.lower}, {vr.upper}]" else: # Skip var_range logic for SingletonInt @@ -3384,7 +3432,7 @@ def _create_no_constraints_context(t): if context is None: input_contexts[i] = _create_no_constraints_context(t) else: - assert isinstance(t, (SymInt, int)) + assert isinstance(t, (SymInt, int, SymFloat, float)) assert not isinstance(context, list) # It took a lot of sweat to figure out the algorithm here. Let's @@ -3592,6 +3640,22 @@ def hint(s): ) record_constraint_violation(constraint.warn_only, self._debug_name(source), msg) + def track_symfloat(source, val): + log.debug("track_symfloat %s %s", LazyString(source.name), val) + assert not isinstance(val, SymFloat) or is_symbolic(val) + + if isinstance(val, SymFloat) and val.node.maybe_as_float() is not None: + val = val.node.maybe_as_float() + + if isinstance(val, SymFloat): + s = val.node.expr + if isinstance(s, sympy.Symbol): + symbol_to_source[s].append(source) + input_guards.append((source, s)) + else: + s = sympy.Float(val) + input_guards.append((source, s)) + for t, source, context in zip(placeholders, sources, input_contexts): if isinstance(source, str): from torch._dynamo.source import LocalSource @@ -3602,6 +3666,9 @@ def hint(s): if isinstance(t, (SymInt, int)): track_symint(source, t) continue + elif isinstance(t, (SymFloat, float)): + track_symfloat(source, t) + continue assert isinstance(t, Tensorlike) if is_traceable_wrapper_subclass(t): from torch._dynamo.source import AttrSource @@ -3788,7 +3855,6 @@ def issue_guard(guard: ShapeGuard) -> None: r = self.var_to_range[symbol] assert sources - assert symbol.is_integer bounds = [] if r.lower != -sympy.oo: if any(is_dim(source) for source in sources): @@ -3834,6 +3900,12 @@ def issue_guard(guard: ShapeGuard) -> None: self._debug_name(source), msg, ) + # We NaN specialize, which means similar to 0/1 specialization we + # should assume that the float is NOT nan. 
This is load bearing + # if you have something like an equality guard, nan will play + # merry hell with the reasoning. + if symbol_is_type(symbol, SymT.FLOAT): + exprs.append(f"not __math_isnan({source_ref(sources[0])})") if constraint_violations: warn_msgs = [] @@ -4498,6 +4570,7 @@ def _smart_symbol_sort(x): floor_div_atoms = lhs.atoms(FloorDiv).union(rhs.atoms(FloorDiv)) if len(floor_div_atoms) > 0 and any(a.divisor != 1 for a in floor_div_atoms): raise NotImplementedError + # Never replace unbacked symbols with other unbacked symbols. # This is error prone because you can cause references to # unbacked symbols to time travel backwards. E.g., @@ -4512,10 +4585,20 @@ def _smart_symbol_sort(x): # references u2 and u3 prior to them actually being bound at # runtime. It's pretty inconvenient to setup control # dependencies for substitutions, so ban it entirely. - if isinstance(lhs, sympy.Symbol) and free_unbacked_symbols(lhs) and not free_unbacked_symbols(rhs): - # short-circuit when no solving is needed + def trivial_solve(lhs, rhs): + if isinstance(lhs, sympy.Symbol): + if free_unbacked_symbols(lhs) and not free_unbacked_symbols(rhs): + return True + if symbol_is_type(lhs, SymT.FLOAT): + return True + # TODO: Maybe trivial solutions for int should also be + # done? + return False + + # short-circuit when no solving is needed + if trivial_solve(lhs, rhs): self._set_replacement(lhs, self._find(rhs), "trivial_lhs") - elif isinstance(rhs, sympy.Symbol) and free_unbacked_symbols(rhs) and not free_unbacked_symbols(lhs): + elif trivial_solve(rhs, lhs): self._set_replacement(rhs, self._find(lhs), "trivial_rhs") else: r = try_solve(expr, free[0], floordiv_inequality=False) @@ -4791,21 +4874,11 @@ def compute_concrete_val(): # Turn this into a boolean expression, no longer need to consult # concrete_val - suppress_maybe_guard_rel = False if concrete_val is sympy.true: g = expr elif concrete_val is sympy.false: g = sympy.Not(expr) else: - # WARNING: we cannot actually do simplifications on guards - # on floating point values, because Sympy generally does not - # think expressions on integers can ever be equal to floating - # point (e.g., sympy.Eq(s0/6, 0.5) evaluates to False). Without - # very clear algebraic laws that hold for floating point, such - # simplifications are error prone anyway, so be sure not to - # maybe_guard_rel in those cases. 
- if not isinstance(concrete_val, sympy.Integer): - suppress_maybe_guard_rel = True g = sympy.Eq(expr, concrete_val) # type: ignore[arg-type] if isinstance(g, sympy.Rel): diff --git a/torch/fx/passes/runtime_assert.py b/torch/fx/passes/runtime_assert.py index 870348af6f69e..0d45defe8a48c 100644 --- a/torch/fx/passes/runtime_assert.py +++ b/torch/fx/passes/runtime_assert.py @@ -120,12 +120,13 @@ def add_runtime_asserts(ras): ), ) - for node in graph.nodes: + nodes = list(graph.nodes) + for i, node in enumerate(nodes[:-1]): # Placeholders can match symbols, but when we destructure them # with size we have to make sure we insert the nodes after all # the placeholders with graph.inserting_before( - node.next if node not in placeholders else last_placeholder.next + nodes[i + 1] if node not in placeholders else last_placeholder.next ): # Unfortunately, this logic still must remain because manual # make_fx calls may not explicitly bind all symbolic ints as @@ -150,12 +151,24 @@ def match_symbol(symint, cb): match_symbol(example_value, lambda: node) if isinstance(t := example_value, torch.Tensor): for i, s in enumerate(t.size()): - match_symbol(s, lambda: graph.call_method("size", (node, i))) + match_symbol( + s, + lambda: graph.call_function( + torch.ops.aten.sym_size.int, (node, i) + ), + ) for i, s in enumerate(t.stride()): - match_symbol(s, lambda: graph.call_method("stride", (node, i))) + match_symbol( + s, + lambda: graph.call_function( + torch.ops.aten.sym_stride.int, (node, i) + ), + ) match_symbol( t.storage_offset(), - lambda: graph.call_method("storage_offset", (node,)), + lambda: graph.call_function( + torch.ops.aten.sym_storage_offset.default, (node,) + ), ) # Handle asserts that aren't associated with any symbol. This diff --git a/torch/library.h b/torch/library.h index c38179a6eea1d..3c1d0c415106f 100644 --- a/torch/library.h +++ b/torch/library.h @@ -299,9 +299,9 @@ class TORCH_API CppFunction final { } private: - c10::optional dispatch_key_; + std::optional dispatch_key_; c10::KernelFunction func_; - c10::optional cpp_signature_; + std::optional cpp_signature_; std::unique_ptr schema_; std::string debug_; @@ -316,7 +316,7 @@ class TORCH_API CppFunction final { CppFunction( c10::KernelFunction func, - c10::optional cpp_signature, + std::optional cpp_signature, std::unique_ptr schema); }; @@ -555,7 +555,7 @@ class TORCH_API Library final { Library( Kind kind, std::string ns, - c10::optional k, + std::optional k, const char* file, uint32_t line); @@ -847,9 +847,9 @@ class TORCH_API Library final { private: Kind kind_; - c10::optional ns_; - c10::optional dispatch_key_; - c10::optional> python_module_; + std::optional ns_; + std::optional dispatch_key_; + std::optional> python_module_; const char* file_; uint32_t line_; @@ -889,7 +889,7 @@ class TorchLibraryInit final { Library::Kind kind, InitFn* fn, const char* ns, - c10::optional k, + std::optional k, const char* file, uint32_t line) : lib_(kind, ns, k, file, line) { diff --git a/torch/nn/attention/__init__.py b/torch/nn/attention/__init__.py index fc4835f046e6c..039d76a32f4b0 100644 --- a/torch/nn/attention/__init__.py +++ b/torch/nn/attention/__init__.py @@ -6,6 +6,8 @@ from torch.backends.cuda import ( can_use_efficient_attention, can_use_flash_attention, + cudnn_sdp_enabled, + enable_cudnn_sdp, enable_flash_sdp, enable_math_sdp, enable_mem_efficient_sdp, @@ -99,19 +101,23 @@ def sdpa_kernel(backends: Union[List[SDPBackend], SDPBackend]): backends = [backends] backends = set(backends) + previous_cudnn: bool = cudnn_sdp_enabled() 
previous_flash: bool = flash_sdp_enabled() previous_mem_efficient: bool = mem_efficient_sdp_enabled() previous_math: bool = math_sdp_enabled() try: + enable_cudnn = SDPBackend.CUDNN_ATTENTION in backends enable_flash = SDPBackend.FLASH_ATTENTION in backends enable_mem_efficient = SDPBackend.EFFICIENT_ATTENTION in backends enable_math = SDPBackend.MATH in backends + enable_cudnn_sdp(enable_cudnn) enable_flash_sdp(enable_flash) enable_mem_efficient_sdp(enable_mem_efficient) enable_math_sdp(enable_math) yield {} finally: + enable_cudnn_sdp(previous_cudnn) enable_flash_sdp(previous_flash) enable_mem_efficient_sdp(previous_mem_efficient) enable_math_sdp(previous_math) diff --git a/torch/nn/attention/_flex_attention.py b/torch/nn/attention/_flex_attention.py index ee131dfac8524..1acfab57a62ce 100644 --- a/torch/nn/attention/_flex_attention.py +++ b/torch/nn/attention/_flex_attention.py @@ -83,6 +83,10 @@ def score_mod( """ if torch.compiler.is_dynamo_compiling(): + # mark head_dim and dim always to be static + for x in [query, key, value]: + torch._dynamo.mark_static(x, 1) + torch._dynamo.mark_static(x, -1) out, _ = flex_attention_hop(query, key, value, score_mod) return out diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py index a4dc9a17089c5..f5206d425b4d8 100644 --- a/torch/onnx/utils.py +++ b/torch/onnx/utils.py @@ -350,11 +350,6 @@ def export( %3 : Float = onnx::Mul(%2, %0) return (%3) - If PyTorch was built with Caffe2 (i.e. with ``BUILD_CAFFE2=1``), then - Caffe2-specific behavior will be enabled, including special support - for ops are produced by the modules described in - `Quantization `_. - .. warning:: Models exported this way are probably runnable only by Caffe2. @@ -1802,9 +1797,8 @@ def _add_output_to_block(block: _C.Block, value: _C.Value) -> int: def _should_aten_fallback( name: str, opset_version: int, operator_export_type: _C_onnx.OperatorExportTypes ): - # For BUILD_CAFFE2=0 builds, if domain=="aten" and operator_export_type==ONNX_ATEN, + # For all builds, if domain=="aten" and operator_export_type==ONNX_ATEN, # an aten::ATen operator is created regardless of symbolics existence - # For BUILD_CAFFE2=1, the same applies only if there is no symbolic available is_exportable_aten_op = registration.registry.is_registered_op(name, opset_version) is_onnx_aten_export = operator_export_type == _C_onnx.OperatorExportTypes.ONNX_ATEN diff --git a/torch/optim/__init__.py b/torch/optim/__init__.py index 5e836b4047ddf..58d9c948416b8 100644 --- a/torch/optim/__init__.py +++ b/torch/optim/__init__.py @@ -22,17 +22,17 @@ from .sgd import SGD from .sparse_adam import SparseAdam -del adadelta # noqa: F821 -del adagrad # noqa: F821 -del adam # noqa: F821 -del adamw # noqa: F821 -del sparse_adam # noqa: F821 -del adamax # noqa: F821 -del asgd # noqa: F821 -del sgd # noqa: F821 -del radam # noqa: F821 -del rprop # noqa: F821 -del rmsprop # noqa: F821 -del optimizer # noqa: F821 -del nadam # noqa: F821 -del lbfgs # noqa: F821 +del adadelta # type: ignore[name-defined] # noqa: F821 +del adagrad # type: ignore[name-defined] # noqa: F821 +del adam # type: ignore[name-defined] # noqa: F821 +del adamw # type: ignore[name-defined] # noqa: F821 +del sparse_adam # type: ignore[name-defined] # noqa: F821 +del adamax # type: ignore[name-defined] # noqa: F821 +del asgd # type: ignore[name-defined] # noqa: F821 +del sgd # type: ignore[name-defined] # noqa: F821 +del radam # type: ignore[name-defined] # noqa: F821 +del rprop # type: ignore[name-defined] # noqa: F821 +del rmsprop # type: ignore[name-defined] 
# noqa: F821 +del optimizer # type: ignore[name-defined] # noqa: F821 +del nadam # type: ignore[name-defined] # noqa: F821 +del lbfgs # type: ignore[name-defined] # noqa: F821 diff --git a/torch/optim/__init__.pyi b/torch/optim/__init__.pyi deleted file mode 100644 index 8d35bab14c207..0000000000000 --- a/torch/optim/__init__.pyi +++ /dev/null @@ -1,15 +0,0 @@ -from . import lr_scheduler as lr_scheduler, swa_utils as swa_utils -from .adadelta import Adadelta as Adadelta -from .adagrad import Adagrad as Adagrad -from .adam import Adam as Adam -from .adamax import Adamax as Adamax -from .adamw import AdamW as AdamW -from .asgd import ASGD as ASGD -from .lbfgs import LBFGS as LBFGS -from .nadam import NAdam as NAdam -from .optimizer import Optimizer as Optimizer -from .radam import RAdam as RAdam -from .rmsprop import RMSprop as RMSprop -from .rprop import Rprop as Rprop -from .sgd import SGD as SGD -from .sparse_adam import SparseAdam as SparseAdam diff --git a/torch/optim/adadelta.py b/torch/optim/adadelta.py index b9fcafbbcd9aa..097c8040b63e1 100644 --- a/torch/optim/adadelta.py +++ b/torch/optim/adadelta.py @@ -9,6 +9,7 @@ _differentiable_doc, _disable_dynamo_if_unsupported, _foreach_doc, + _get_capturable_supported_devices, _get_scalar_dtype, _maximize_doc, _use_grad_for_differentiable, @@ -24,10 +25,10 @@ class Adadelta(Optimizer): def __init__( self, params: ParamsT, - lr=1.0, - rho=0.9, - eps=1e-6, - weight_decay=0, + lr: float = 1.0, + rho: float = 0.9, + eps: float = 1e-6, + weight_decay: float = 0, foreach: Optional[bool] = None, *, capturable: bool = False, @@ -254,9 +255,14 @@ def _single_tensor_adadelta( ): # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: + capturable_supported_devices = _get_capturable_supported_devices( + supports_xla=False + ) assert all( - p.is_cuda and step.is_cuda for p, step in zip(params, state_steps) - ), "If capturable=True, params and state_steps must be CUDA tensors." + p.device.type == step.device.type + and p.device.type in capturable_supported_devices + for p, step in zip(params, state_steps) + ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}." for param, grad, square_avg, acc_delta, step in zip( params, grads, square_avgs, acc_deltas, state_steps @@ -305,9 +311,14 @@ def _multi_tensor_adadelta( # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: + capturable_supported_devices = _get_capturable_supported_devices( + supports_xla=False + ) assert all( - p.is_cuda and step.is_cuda for p, step in zip(params, state_steps) - ), "If capturable=True, params and state_steps must be CUDA tensors." + p.device.type == step.device.type + and p.device.type in capturable_supported_devices + for p, step in zip(params, state_steps) + ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}." 
if len(params) == 0: return diff --git a/torch/optim/adagrad.py b/torch/optim/adagrad.py index 8dcfd75001120..0ed8acfac1c61 100644 --- a/torch/optim/adagrad.py +++ b/torch/optim/adagrad.py @@ -2,7 +2,7 @@ import torch from torch import Tensor -from torch.utils._foreach_utils import _get_fused_kernels_supported_devices + from .optimizer import ( _default_to_fused_or_foreach, _differentiable_doc, @@ -23,16 +23,15 @@ class Adagrad(Optimizer): def __init__( self, params: ParamsT, - lr=1e-2, - lr_decay=0, - weight_decay=0, - initial_accumulator_value=0, - eps=1e-10, + lr: float = 1e-2, + lr_decay: float = 0, + weight_decay: float = 0, + initial_accumulator_value: float = 0, + eps: float = 1e-10, foreach: Optional[bool] = None, *, maximize: bool = False, differentiable: bool = False, - fused: Optional[bool] = None, ): if not 0.0 <= lr: raise ValueError(f"Invalid learning rate: {lr}") @@ -56,41 +55,13 @@ def __init__( foreach=foreach, maximize=maximize, differentiable=differentiable, - fused=fused, ) super().__init__(params, defaults) - if fused: - if differentiable: - raise RuntimeError("`fused` does not support `differentiable`") - self._step_supports_amp_scaling = True - fused_supported_devices = _get_fused_kernels_supported_devices() - # Not support CUDA yet - fused_supported_devices.remove("cuda") - if not all( - p.device.type in fused_supported_devices and torch.is_floating_point(p) - for pg in self.param_groups - for p in pg["params"] - ): - raise RuntimeError( - "`fused=True` requires all the params to be floating point Tensors of " - f"supported devices: {fused_supported_devices}." - ) - if foreach: - raise RuntimeError("`fused` and `foreach` cannot be `True` together.") - for group in self.param_groups: for p in group["params"]: state = self.state[p] - state["step"] = ( - torch.zeros( - (), - dtype=_get_scalar_dtype(is_fused=group["fused"]), - device=p.device, - ) - if group["fused"] - else torch.tensor(0.0, dtype=_get_scalar_dtype()) - ) + state["step"] = torch.tensor(0.0, dtype=_get_scalar_dtype()) init_value = ( complex(initial_accumulator_value, initial_accumulator_value) if torch.is_complex(p) @@ -102,14 +73,10 @@ def __init__( def __setstate__(self, state): super().__setstate__(state) - # define "fused" for - # MYPY error: Name "fused" may be undefined - fused = None for group in self.param_groups: group.setdefault("foreach", None) group.setdefault("maximize", False) group.setdefault("differentiable", False) - fused = group.setdefault("fused", None) state_values = list(self.state.values()) step_is_tensor = (len(state_values) != 0) and torch.is_tensor( @@ -117,9 +84,7 @@ def __setstate__(self, state): ) if not step_is_tensor: for s in state_values: - s["step"] = torch.tensor( - float(s["step"]), dtype=_get_scalar_dtype(is_fused=fused) - ) + s["step"] = torch.tensor(float(s["step"]), dtype=_get_scalar_dtype()) def share_memory(self): for group in self.param_groups: @@ -179,9 +144,6 @@ def step(self, closure=None): maximize=group["maximize"], differentiable=group["differentiable"], has_complex=has_complex, - fused=group["fused"], - grad_scale=getattr(self, "grad_scale", None), - found_inf=getattr(self, "found_inf", None), ) return loss @@ -228,10 +190,7 @@ def step(self, closure=None): {_foreach_doc} {_maximize_doc} {_differentiable_doc} - fused (bool, optional): whether the fused implementation (CPU only) is used. - Currently, `torch.float64`, `torch.float32`, `torch.float16`, and `torch.bfloat16` - are supported. (default: None). 
Please note that the fused implementations does not - support sparse or complex gradients. + .. _Adaptive Subgradient Methods for Online Learning and Stochastic Optimization: http://jmlr.org/papers/v12/duchi11a.html @@ -244,9 +203,6 @@ def adagrad( grads: List[Tensor], state_sums: List[Tensor], state_steps: List[Tensor], - fused: Optional[bool] = None, - grad_scale: Optional[Tensor] = None, - found_inf: Optional[Tensor] = None, # kwonly args with defaults are not supported by functions compiled with torchscript issue #70627 # setting these as kwargs for now as functional API is compiled by torch/distributed/optim has_sparse_grad: bool = False, @@ -269,28 +225,15 @@ def adagrad( "API has changed, `state_steps` argument must contain a list of singleton tensors" ) - # Respect when the user inputs False/True for foreach or fused. We only want to change - # the default when neither have been user-specified. Note that we default to foreach - # and pass False to use_fused. This is not a mistake--we want to give the fused impl - # bake-in time before making it the default, even if it is typically faster. - if fused is None and foreach is None: + if foreach is None: _, foreach = _default_to_fused_or_foreach( params, differentiable, use_fused=False ) - if fused is None: - fused = False - if foreach is None: - foreach = False - if foreach and torch.jit.is_scripting(): raise RuntimeError("torch.jit.script not supported with foreach optimizers") - if fused and torch.jit.is_scripting(): - raise RuntimeError("torch.jit.script not supported with fused optimizers") - if fused and not torch.jit.is_scripting(): - func = _fused_adagrad - elif foreach and not torch.jit.is_scripting(): + if foreach and not torch.jit.is_scripting(): func = _multi_tensor_adagrad else: func = _single_tensor_adagrad @@ -308,8 +251,6 @@ def adagrad( maximize=maximize, differentiable=differentiable, has_complex=has_complex, - grad_scale=grad_scale, - found_inf=found_inf, ) @@ -325,8 +266,6 @@ def _single_tensor_adagrad( grads: List[Tensor], state_sums: List[Tensor], state_steps: List[Tensor], - grad_scale: Optional[Tensor], - found_inf: Optional[Tensor], *, lr: float, weight_decay: float, @@ -337,7 +276,6 @@ def _single_tensor_adagrad( differentiable: bool, has_complex: bool, ): - assert grad_scale is None and found_inf is None for param, grad, state_sum, step_t in zip(params, grads, state_sums, state_steps): # update step step_t += 1 @@ -386,8 +324,6 @@ def _multi_tensor_adagrad( grads: List[Tensor], state_sums: List[Tensor], state_steps: List[Tensor], - grad_scale: Optional[Tensor], - found_inf: Optional[Tensor], *, lr: float, weight_decay: float, @@ -399,7 +335,6 @@ def _multi_tensor_adagrad( has_complex: bool, ): assert not differentiable, "_foreach ops don't support autograd" - assert grad_scale is None and found_inf is None # Foreach functions will throw errors if given empty lists if len(params) == 0: @@ -432,8 +367,6 @@ def _multi_tensor_adagrad( maximize=maximize, differentiable=differentiable, has_complex=has_complex, - grad_scale=grad_scale, - found_inf=found_inf, ) continue @@ -481,76 +414,3 @@ def _multi_tensor_adagrad( numerator = torch._foreach_mul(device_grads, minus_clr) # type: ignore[assignment] torch._foreach_addcdiv_(device_params, numerator, std) - - -def _fused_adagrad( - params: List[Tensor], - grads: List[Tensor], - state_sums: List[Tensor], - state_steps: List[Tensor], - grad_scale: Optional[Tensor], - found_inf: Optional[Tensor], - *, - lr: float, - weight_decay: float, - lr_decay: float, - eps: float, - 
has_sparse_grad: bool, - maximize: bool, - differentiable: bool, - has_complex: bool, -) -> None: - if not params: - return - if has_sparse_grad or has_complex: - raise RuntimeError("`fused` does not support sparse grad or complex param") - - if differentiable: - raise RuntimeError( - "adagrad with fused=True does not support differentiable=True" - ) - - grad_scale_dict = ( - {grad_scale.device: grad_scale} if grad_scale is not None else None - ) - found_inf_dict = {found_inf.device: found_inf} if found_inf is not None else None - - grouped_tensors = Optimizer._group_tensors_by_device_and_dtype( - [params, grads, state_sums, state_steps] - ) - for (device, _), ( - ( - device_params, - device_grads, - device_state_sums, - device_state_steps, - ), - _, - ) in grouped_tensors.items(): - device_grad_scale, device_found_inf = None, None - if grad_scale is not None and grad_scale_dict is not None: - if device not in grad_scale_dict: - grad_scale_dict[device] = grad_scale.to(device, non_blocking=True) # type: ignore[index] - device_grad_scale = grad_scale_dict[device] # type: ignore[index] - if found_inf is not None and found_inf_dict is not None: - if found_inf not in found_inf_dict: - found_inf_dict[device] = found_inf.to(device, non_blocking=True) # type: ignore[index] - device_found_inf = found_inf_dict[device] # type: ignore[index] - torch._foreach_add_(device_state_steps, 1) - torch._fused_adagrad_( - device_params, - device_grads, - device_state_sums, - device_state_steps, - lr=lr, - lr_decay=lr_decay, - weight_decay=weight_decay, - eps=eps, - maximize=maximize, - grad_scale=device_grad_scale, - found_inf=device_found_inf, - ) - if device_found_inf is not None: - torch._foreach_sub_( - device_state_steps, [device_found_inf] * len(device_state_steps) - ) diff --git a/torch/optim/adam.py b/torch/optim/adam.py index 04c93989576b0..fba4b2027b05d 100644 --- a/torch/optim/adam.py +++ b/torch/optim/adam.py @@ -11,12 +11,14 @@ _dispatch_sqrt, _foreach_doc, _fused_doc, + _get_capturable_supported_devices, _get_scalar_dtype, _get_value, _maximize_doc, _stack_if_compiling, _use_grad_for_differentiable, _view_as_real, + DeviceDict, Optimizer, ParamsT, ) @@ -202,12 +204,12 @@ def step(self, closure=None): loss = closure() for group in self.param_groups: - params_with_grad = [] - grads = [] - exp_avgs = [] - exp_avg_sqs = [] - max_exp_avg_sqs = [] - state_steps = [] + params_with_grad: List[Tensor] = [] + grads: List[Tensor] = [] + exp_avgs: List[Tensor] = [] + exp_avg_sqs: List[Tensor] = [] + max_exp_avg_sqs: List[Tensor] = [] + state_steps: List[Tensor] = [] beta1, beta2 = group["betas"] has_complex = self._init_group( @@ -352,9 +354,11 @@ def _single_tensor_adam( # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: - assert (param.is_cuda and step_t.is_cuda) or ( - param.is_xla and step_t.is_xla - ), "If capturable=True, params and state_steps must be CUDA or XLA tensors." + capturable_supported_devices = _get_capturable_supported_devices() + assert ( + param.device.type == step_t.device.type + and param.device.type in capturable_supported_devices + ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}." 
# update step step_t += 1 @@ -463,9 +467,14 @@ def _multi_tensor_adam( # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: + capturable_supported_devices = _get_capturable_supported_devices( + supports_xla=False + ) assert all( - p.is_cuda and step.is_cuda for p, step in zip(params, state_steps) - ), "If capturable=True, params and state_steps must be CUDA tensors." + p.device.type == step.device.type + and p.device.type in capturable_supported_devices + for p, step in zip(params, state_steps) + ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}." assert grad_scale is None and found_inf is None @@ -498,7 +507,7 @@ def _multi_tensor_adam( ) if maximize: - device_grads = torch._foreach_neg(device_grads) + device_grads = torch._foreach_neg(device_grads) # type: ignore[assignment] # Update steps # If steps are on CPU, foreach will fall back to the slow path, which is a for-loop calling t.add(1) over @@ -516,7 +525,7 @@ def _multi_tensor_adam( if maximize: torch._foreach_add_(device_grads, device_params, alpha=weight_decay) else: - device_grads = torch._foreach_add( + device_grads = torch._foreach_add( # type: ignore[assignment] device_grads, device_params, alpha=weight_decay ) @@ -531,6 +540,9 @@ def _multi_tensor_adam( # Delete the local intermediate since it won't be used anymore to save on peak memory del device_grads + bias_correction1: Union[Tuple[Tensor, ...], List[Tensor]] + bias_correction2: Union[Tuple[Tensor, ...], List[Tensor]] + bias_correction2_sqrt: Union[Tuple[Tensor, ...], List[Tensor]] if capturable: bias_correction1 = torch._foreach_pow(beta1, device_state_steps) bias_correction2 = torch._foreach_pow(beta2, device_state_steps) @@ -577,7 +589,7 @@ def _multi_tensor_adam( step_size = _stack_if_compiling([(lr / bc) * -1 for bc in bias_correction1]) - bias_correction2_sqrt = [_dispatch_sqrt(bc) for bc in bias_correction2] + bias_correction2_sqrt = [_dispatch_sqrt(bc) for bc in bias_correction2] # type: ignore[arg-type] if amsgrad: # Maintains the maximum of all 2nd moment running avg. till now @@ -591,7 +603,7 @@ def _multi_tensor_adam( torch._foreach_div_(exp_avg_sq_sqrt, bias_correction2_sqrt) torch._foreach_add_(exp_avg_sq_sqrt, eps) torch._foreach_addcdiv_( - device_params, device_exp_avgs, exp_avg_sq_sqrt, step_size + device_params, device_exp_avgs, exp_avg_sq_sqrt, step_size # type: ignore[arg-type] ) @@ -621,17 +633,18 @@ def _fused_adam( if differentiable: raise RuntimeError("Adam with fused=True does not support differentiable=True") - grad_scale_dict = ( - {grad_scale.device: grad_scale} if grad_scale is not None else None + grad_scale_dict: DeviceDict = ( + {grad_scale.device: grad_scale} if grad_scale is not None else {} + ) + found_inf_dict: DeviceDict = ( + {found_inf.device: found_inf} if found_inf is not None else {} ) - found_inf_dict = {found_inf.device: found_inf} if found_inf is not None else None # We only shuffle around the lr when it is a Tensor and on CUDA, otherwise, we prefer # treating it as a scalar. 
- lr_dict = ( + lr_dict: Optional[DeviceDict] = ( {lr.device: lr} if isinstance(lr, Tensor) and str(lr.device) != "cpu" else None ) - grouped_tensors = Optimizer._group_tensors_by_device_and_dtype( [params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps] ) @@ -648,15 +661,15 @@ def _fused_adam( ) in grouped_tensors.items(): device_grad_scale, device_found_inf = None, None if grad_scale is not None: - if device not in grad_scale_dict: - grad_scale_dict[device] = grad_scale.to(device, non_blocking=True) - device_grad_scale = grad_scale_dict[device] + device_grad_scale = grad_scale_dict.setdefault( + device, grad_scale.to(device, non_blocking=True) + ) if found_inf is not None: - if found_inf not in found_inf_dict: - found_inf_dict[device] = found_inf.to(device, non_blocking=True) - device_found_inf = found_inf_dict[device] + device_found_inf = found_inf_dict.setdefault( + device, found_inf.to(device, non_blocking=True) + ) if lr_dict is not None and device not in lr_dict: - lr_dict[device] = lr.to(device=device, non_blocking=True) + lr_dict[device] = lr.to(device=device, non_blocking=True) # type: ignore[union-attr] lr = lr_dict[device] torch._foreach_add_(device_state_steps, 1) torch._fused_adam_( diff --git a/torch/optim/adam.pyi b/torch/optim/adam.pyi deleted file mode 100644 index aef8ed69a9c99..0000000000000 --- a/torch/optim/adam.pyi +++ /dev/null @@ -1,22 +0,0 @@ -from typing import Optional, Tuple, Union - -from torch import Tensor - -from .optimizer import Optimizer, ParamsT - -class Adam(Optimizer): - def __init__( - self, - params: ParamsT, - lr: Union[float, Tensor] = 1e-3, - betas: Tuple[float, float] = (0.9, 0.999), - eps: float = 1e-8, - weight_decay: float = 0, - amsgrad: bool = False, - *, - foreach: Optional[bool] = None, - maximize: bool = False, - capturable: bool = False, - differentiable: bool = False, - fused: Optional[bool] = None, - ) -> None: ... 
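The `DeviceDict` caching used in `_fused_adam` above replaces the explicit `if device not in dict` membership checks with `dict.setdefault`, so each scaling tensor is moved to a given device at most once and reused afterwards. A minimal sketch of the same pattern, outside the optimizer internals (the names `to_device` and `cache` are illustrative, not part of the patch):

    from typing import Dict, Optional

    import torch

    # DeviceDict as defined by this patch in torch/optim/optimizer.py.
    DeviceDict = Dict[Optional[torch.device], torch.Tensor]

    def to_device(cache: DeviceDict, t: torch.Tensor, device: torch.device) -> torch.Tensor:
        # First request for `device` stores a copy; later requests reuse the cached entry.
        return cache.setdefault(device, t.to(device, non_blocking=True))

    grad_scale = torch.tensor(2.0)
    cache: DeviceDict = {grad_scale.device: grad_scale}
    cpu = torch.device("cpu")
    # Repeated lookups hand back the same cached tensor object.
    assert to_device(cache, grad_scale, cpu) is to_device(cache, grad_scale, cpu)

Note that, as in the patch, the candidate copy is evaluated eagerly and simply discarded when the device is already cached; the win is the simpler control flow, not lazy conversion.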
diff --git a/torch/optim/adamax.py b/torch/optim/adamax.py index 6fa335de4d8b8..8af468ba83869 100644 --- a/torch/optim/adamax.py +++ b/torch/optim/adamax.py @@ -1,4 +1,4 @@ -from typing import List, Optional +from typing import List, Optional, Tuple, Union import torch from torch import Tensor @@ -9,12 +9,14 @@ _differentiable_doc, _disable_dynamo_if_unsupported, _foreach_doc, + _get_capturable_supported_devices, _get_scalar_dtype, _get_value, _maximize_doc, _use_grad_for_differentiable, _view_as_real, Optimizer, + ParamsT, ) __all__ = ["Adamax", "adamax"] @@ -23,11 +25,11 @@ class Adamax(Optimizer): def __init__( self, - params, - lr=2e-3, - betas=(0.9, 0.999), - eps=1e-8, - weight_decay=0, + params: ParamsT, + lr: float = 2e-3, + betas: Tuple[float, float] = (0.9, 0.999), + eps: float = 1e-8, + weight_decay: float = 0, foreach: Optional[bool] = None, *, maximize: bool = False, @@ -127,11 +129,11 @@ def step(self, closure=None): loss = closure() for group in self.param_groups: - params_with_grad = [] - grads = [] - exp_avgs = [] - exp_infs = [] - state_steps = [] + params_with_grad: List[Tensor] = [] + grads: List[Tensor] = [] + exp_avgs: List[Tensor] = [] + exp_infs: List[Tensor] = [] + state_steps: List[Tensor] = [] beta1, beta2 = group["betas"] eps = group["eps"] @@ -242,9 +244,11 @@ def _single_tensor_adamax( # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: - assert (param.is_cuda and step_t.is_cuda) or ( - param.is_xla and step_t.is_xla - ), "If capturable=True, params and state_steps must be CUDA or XLA tensors." + capturable_supported_devices = _get_capturable_supported_devices() + assert ( + param.device.type == step_t.device.type + and param.device.type in capturable_supported_devices + ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}." # update step step_t += 1 @@ -295,11 +299,11 @@ def _multi_tensor_adamax( exp_infs: List[Tensor], state_steps: List[Tensor], *, + eps: float, beta1: float, beta2: float, lr: float, weight_decay: float, - eps: float, maximize: bool, differentiable: bool, capturable: bool, @@ -311,14 +315,15 @@ def _multi_tensor_adamax( return # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] - if ( - not torch._utils.is_compiling() - and capturable - and not all(p.is_cuda and step.is_cuda for p, step in zip(params, state_steps)) - ): - raise RuntimeError( - "If capturable=True, params and state_steps must be CUDA tensors." + if not torch._utils.is_compiling() and capturable: + capturable_supported_devices = _get_capturable_supported_devices( + supports_xla=False ) + assert all( + p.device.type == step.device.type + and p.device.type in capturable_supported_devices + for p, step in zip(params, state_steps) + ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}." 
grouped_tensors = Optimizer._group_tensors_by_device_and_dtype( [params, grads, exp_avgs, exp_infs, state_steps] @@ -336,7 +341,7 @@ def _multi_tensor_adamax( ) if maximize: - grouped_grads = torch._foreach_neg(grouped_grads) + grouped_grads = torch._foreach_neg(grouped_grads) # type: ignore[assignment] # Update steps # If steps are on CPU, foreach will fall back to the slow path, which is a for-loop calling t.add(1) over @@ -354,7 +359,7 @@ def _multi_tensor_adamax( # Re-use the intermediate memory (grouped_grads) already allocated for maximize torch._foreach_add_(grouped_grads, grouped_params, alpha=weight_decay) else: - grouped_grads = torch._foreach_add( + grouped_grads = torch._foreach_add( # type: ignore[assignment] grouped_grads, grouped_params, alpha=weight_decay ) @@ -367,13 +372,14 @@ def _multi_tensor_adamax( # in this case, we need to introduce a copy of the grads # since one has not been introduced previously if not maximize and weight_decay == 0: - grouped_grads = torch._foreach_abs(grouped_grads) + grouped_grads = torch._foreach_abs(grouped_grads) # type: ignore[assignment] else: torch._foreach_abs_(grouped_grads) torch._foreach_add_(grouped_grads, eps) torch._foreach_maximum_(grouped_exp_infs, grouped_grads) + bias_corrections: Union[Tuple[Tensor, ...], List[Tensor]] if capturable: bias_corrections = torch._foreach_pow(beta1, grouped_state_steps) # foreach_sub doesn't allow a scalar as the first arg diff --git a/torch/optim/adamax.pyi b/torch/optim/adamax.pyi deleted file mode 100644 index d38cfaefe388c..0000000000000 --- a/torch/optim/adamax.pyi +++ /dev/null @@ -1,13 +0,0 @@ -from typing import Tuple - -from .optimizer import Optimizer, ParamsT - -class Adamax(Optimizer): - def __init__( - self, - params: ParamsT, - lr: float = ..., - betas: Tuple[float, float] = ..., - eps: float = ..., - weight_decay: float = ..., - ) -> None: ... diff --git a/torch/optim/adamw.py b/torch/optim/adamw.py index aa46c7a537e77..e58b28244083a 100644 --- a/torch/optim/adamw.py +++ b/torch/optim/adamw.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Tuple, Union +from typing import cast, List, Optional, Tuple, Union import torch from torch import Tensor @@ -11,12 +11,14 @@ _dispatch_sqrt, _foreach_doc, _fused_doc, + _get_capturable_supported_devices, _get_scalar_dtype, _get_value, _maximize_doc, _stack_if_compiling, _use_grad_for_differentiable, _view_as_real, + DeviceDict, Optimizer, ParamsT, ) @@ -201,14 +203,14 @@ def step(self, closure=None): loss = closure() for group in self.param_groups: - params_with_grad = [] - grads = [] - exp_avgs = [] - exp_avg_sqs = [] - max_exp_avg_sqs = [] - state_steps = [] - amsgrad = group["amsgrad"] - beta1, beta2 = group["betas"] + params_with_grad: List[Tensor] = [] + grads: List[Tensor] = [] + exp_avgs: List[Tensor] = [] + exp_avg_sqs: List[Tensor] = [] + max_exp_avg_sqs: List[Tensor] = [] + state_steps: List[Tensor] = [] + amsgrad: bool = group["amsgrad"] + beta1, beta2 = cast(Tuple[float, float], group["betas"]) has_complex = self._init_group( group, @@ -353,9 +355,11 @@ def _single_tensor_adamw( # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: - assert (param.is_cuda and step_t.is_cuda) or ( - param.is_xla and step_t.is_xla - ), "If capturable=True, params and state_steps must be CUDA or XLA tensors." 
+ capturable_supported_devices = _get_capturable_supported_devices() + assert ( + param.device.type == step_t.device.type + and param.device.type in capturable_supported_devices + ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}." if torch.is_complex(param): grad = torch.view_as_real(grad) @@ -464,9 +468,14 @@ def _multi_tensor_adamw( # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: + capturable_supported_devices = _get_capturable_supported_devices( + supports_xla=False + ) assert all( - p.is_cuda and step.is_cuda for p, step in zip(params, state_steps) - ), "If capturable=True, params and state_steps must be CUDA tensors." + p.device.type == step.device.type + and p.device.type in capturable_supported_devices + for p, step in zip(params, state_steps) + ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}." assert not differentiable, "_foreach ops don't support autograd" @@ -498,7 +507,7 @@ def _multi_tensor_adamw( ) if maximize: - device_grads = torch._foreach_neg(device_grads) + device_grads = torch._foreach_neg(device_grads) # type: ignore[assignment] # Update steps # If steps are on CPU, foreach will fall back to the slow path, which is a for-loop calling t.add(1) over @@ -526,6 +535,10 @@ def _multi_tensor_adamw( # Delete the local intermediate since it won't be used anymore to save on peak memory del device_grads + bias_correction1: Union[Tuple[Tensor, ...], List[Tensor]] + bias_correction2: Union[Tuple[Tensor, ...], List[Tensor]] + bias_correction2_sqrt: Union[Tuple[Tensor, ...], List[Tensor]] + if capturable: bias_correction1 = torch._foreach_pow(beta1, device_state_steps) bias_correction2 = torch._foreach_pow(beta2, device_state_steps) @@ -572,7 +585,9 @@ def _multi_tensor_adamw( step_size = _stack_if_compiling([(lr / bc) * -1 for bc in bias_correction1]) - bias_correction2_sqrt = [_dispatch_sqrt(bc) for bc in bias_correction2] + bias_correction2_sqrt = [ + _dispatch_sqrt(bc) for bc in bias_correction2 # type: ignore[arg-type] + ] if amsgrad: # Maintains the maximum of all 2nd moment running avg. till now @@ -586,7 +601,10 @@ def _multi_tensor_adamw( torch._foreach_div_(exp_avg_sq_sqrt, bias_correction2_sqrt) torch._foreach_add_(exp_avg_sq_sqrt, eps) torch._foreach_addcdiv_( - device_params, device_exp_avgs, exp_avg_sq_sqrt, step_size + device_params, + device_exp_avgs, + exp_avg_sq_sqrt, + step_size, # type: ignore[arg-type] ) @@ -603,27 +621,29 @@ def _fused_adamw( amsgrad: bool, beta1: float, beta2: float, - lr: Union[float, Tensor], + lr: Union[Tensor, float], weight_decay: float, eps: float, maximize: bool, capturable: bool, # Needed for consistency. differentiable: bool, - has_complex: bool, + has_complex: bool, # Needed for consistency. ) -> None: if not params: return if differentiable: raise RuntimeError("Adam with fused=True does not support differentiable=True") - grad_scale_dict = ( - {grad_scale.device: grad_scale} if grad_scale is not None else None + grad_scale_dict: DeviceDict = ( + {grad_scale.device: grad_scale} if grad_scale is not None else {} + ) + found_inf_dict: DeviceDict = ( + {found_inf.device: found_inf} if found_inf is not None else {} ) - found_inf_dict = {found_inf.device: found_inf} if found_inf is not None else None # We only shuffle around the lr when it is a Tensor and on CUDA, otherwise, we prefer # treating it as a scalar. 
- lr_dict = ( + lr_dict: Optional[DeviceDict] = ( {lr.device: lr} if isinstance(lr, Tensor) and str(lr.device) != "cpu" else None ) @@ -643,16 +663,17 @@ def _fused_adamw( ) in grouped_tensors.items(): device_grad_scale, device_found_inf = None, None if grad_scale is not None: - if device not in grad_scale_dict: - grad_scale_dict[device] = grad_scale.to(device, non_blocking=True) - device_grad_scale = grad_scale_dict[device] + device_grad_scale = grad_scale_dict.setdefault( + device, grad_scale.to(device, non_blocking=True) + ) if found_inf is not None: - if found_inf not in found_inf_dict: - found_inf_dict[device] = found_inf.to(device, non_blocking=True) - device_found_inf = found_inf_dict[device] + device_found_inf = found_inf_dict.setdefault( + device, found_inf.to(device, non_blocking=True) + ) if lr_dict is not None and device not in lr_dict: - lr_dict[device] = lr.to(device=device, non_blocking=True) - lr = lr_dict[device] + lr = lr_dict.setdefault( + device, lr.to(device=device, non_blocking=True) # type: ignore[union-attr] + ) torch._foreach_add_(device_state_steps, 1) torch._fused_adamw_( device_params, diff --git a/torch/optim/adamw.pyi b/torch/optim/adamw.pyi deleted file mode 100644 index 17c35ebec8a6a..0000000000000 --- a/torch/optim/adamw.pyi +++ /dev/null @@ -1,22 +0,0 @@ -from typing import Optional, Tuple, Union - -from torch import Tensor - -from .optimizer import Optimizer, ParamsT - -class AdamW(Optimizer): - def __init__( - self, - params: ParamsT, - lr: Union[float, Tensor] = 1e-3, - betas: Tuple[float, float] = (0.9, 0.999), - eps: float = 1e-8, - weight_decay: float = 1e-2, - amsgrad: bool = False, - *, - maximize: bool = False, - foreach: Optional[bool] = None, - capturable: bool = False, - differentiable: bool = False, - fused: Optional[bool] = None, - ) -> None: ... 
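The capturable assertions rewritten throughout these optimizer files all take the same shape: each parameter and its step counter must share a device type, and that device type must come from `_get_capturable_supported_devices()`, the helper added to `torch/optim/optimizer.py` later in this patch. A standalone sketch of the check, assuming that private helper is importable (it is internal API, so the import is illustrative only):

    from typing import List

    import torch
    from torch import Tensor
    from torch.optim.optimizer import _get_capturable_supported_devices  # private helper

    def assert_capturable_devices(params: List[Tensor], state_steps: List[Tensor]) -> None:
        # Mirrors the foreach-path check: every param/step pair must live on the same,
        # capturable-supported device type (e.g. "cuda"; "xla" only when allowed).
        supported = _get_capturable_supported_devices(supports_xla=False)
        assert all(
            p.device.type == step.device.type and p.device.type in supported
            for p, step in zip(params, state_steps)
        ), f"If capturable=True, params and state_steps must be on supported devices: {supported}"

Compared with the old hard-coded `is_cuda`/`is_xla` checks, this keeps the error message and the device list in one place and lets privateuse1 backends qualify automatically.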
diff --git a/torch/optim/asgd.py b/torch/optim/asgd.py index 5714a82d5f19b..a87aadc81803c 100644 --- a/torch/optim/asgd.py +++ b/torch/optim/asgd.py @@ -1,4 +1,4 @@ -from typing import List, Optional +from typing import List, Optional, Tuple, Union import torch from torch import Tensor @@ -9,12 +9,14 @@ _differentiable_doc, _disable_dynamo_if_unsupported, _foreach_doc, + _get_capturable_supported_devices, _get_scalar_dtype, _get_value, _maximize_doc, _use_grad_for_differentiable, _view_as_real, Optimizer, + ParamsT, ) __all__ = ["ASGD", "asgd"] @@ -30,12 +32,12 @@ def _to_tensor(x, device=None): class ASGD(Optimizer): def __init__( self, - params, - lr=1e-2, - lambd=1e-4, - alpha=0.75, - t0=1e6, - weight_decay=0, + params: ParamsT, + lr: float = 1e-2, + lambd: float = 1e-4, + alpha: float = 0.75, + t0: float = 1e6, + weight_decay: float = 0, foreach: Optional[bool] = None, maximize: bool = False, differentiable: bool = False, @@ -135,12 +137,12 @@ def step(self, closure=None): loss = closure() for group in self.param_groups: - params_with_grad = [] - grads = [] - mus = [] - axs = [] - etas = [] - state_steps = [] + params_with_grad: List[Tensor] = [] + grads: List[Tensor] = [] + mus: List[Tensor] = [] + axs: List[Tensor] = [] + etas: List[Tensor] = [] + state_steps: List[Tensor] = [] has_complex = self._init_group( group, params_with_grad, grads, mus, axs, etas, state_steps @@ -220,11 +222,17 @@ def _single_tensor_asgd( # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: + capturable_supported_devices = _get_capturable_supported_devices() assert ( - param.is_cuda and mu.is_cuda and eta.is_cuda and step_t.is_cuda - ) or ( - param.is_xla and mu.is_xla and eta.is_xla and step_t.is_xla - ), "If capturable=True, params, mus, etas, and state_steps must be CUDA or XLA tensors." + param.device.type + == mu.device.type + == eta.device.type + == step_t.device.type + and param.device.type in capturable_supported_devices + ), ( + f"If capturable=True, params, mus, etas, and state_steps must be " + f"on supported devices: {capturable_supported_devices}." + ) if torch.is_complex(param): grad = torch.view_as_real(grad) @@ -287,10 +295,14 @@ def _multi_tensor_asgd( # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: + capturable_supported_devices = _get_capturable_supported_devices( + supports_xla=False + ) assert all( - p.is_cuda and mu.is_cuda and eta.is_cuda and step.is_cuda + p.device.type == mu.device.type == eta.device.type == step.device.type + and p.device.type in capturable_supported_devices for p, mu, eta, step in zip(params, mus, etas, state_steps) - ), "If capturable=True, params, mus, etas, and state_steps must be CUDA tensors." + ), f"If capturable=True, params, mus, etas, and state_steps must be on supported devices: {capturable_supported_devices}." 
grouped_tensors = Optimizer._group_tensors_by_device_and_dtype( [params, grads, axs, mus, etas, state_steps] @@ -310,7 +322,7 @@ def _multi_tensor_asgd( _view_as_real(grouped_params, grouped_grads, grouped_axs) if maximize: - grouped_grads = torch._foreach_neg(grouped_grads) + grouped_grads = torch._foreach_neg(grouped_grads) # type: ignore[assignment] # Update steps # If steps are on CPU, foreach will fall back to the slow path, which is a for-loop calling t.add(1) over @@ -324,6 +336,7 @@ def _multi_tensor_asgd( torch._foreach_add_(grouped_state_steps, 1) # intermediate = grad + param * lambd + intermediate: Union[Tuple[Tensor, ...], List[Tensor]] if weight_decay != 0: if maximize: torch._foreach_add_(grouped_grads, grouped_params, alpha=weight_decay) @@ -358,6 +371,8 @@ def _multi_tensor_asgd( torch._foreach_addcmul_(grouped_axs, intermediate, grouped_mus) del intermediate + new_etas: Union[Tuple[Tensor, ...], List[Tensor]] + new_mus: Union[Tuple[Tensor, ...], List[Tensor]] if capturable: # update grouped_mus new_mus = torch._foreach_sub(grouped_state_steps, t0) diff --git a/torch/optim/asgd.pyi b/torch/optim/asgd.pyi deleted file mode 100644 index 634b0d162cebd..0000000000000 --- a/torch/optim/asgd.pyi +++ /dev/null @@ -1,12 +0,0 @@ -from .optimizer import Optimizer, ParamsT - -class ASGD(Optimizer): - def __init__( - self, - params: ParamsT, - lr: float = ..., - lambd: float = ..., - alpha: float = ..., - t0: float = ..., - weight_decay: float = ..., - ) -> None: ... diff --git a/torch/optim/lbfgs.py b/torch/optim/lbfgs.py index 1e0f5738ad637..e8818cca538c9 100644 --- a/torch/optim/lbfgs.py +++ b/torch/optim/lbfgs.py @@ -1,5 +1,7 @@ +from typing import Optional + import torch -from .optimizer import Optimizer +from .optimizer import Optimizer, ParamsT __all__ = ["LBFGS"] @@ -99,17 +101,17 @@ def _strong_wolfe( # exact point satisfying the criteria insuf_progress = False # find high and low points in bracket - low_pos, high_pos = (0, 1) if bracket_f[0] <= bracket_f[-1] else (1, 0) + low_pos, high_pos = (0, 1) if bracket_f[0] <= bracket_f[-1] else (1, 0) # type: ignore[possibly-undefined] while not done and ls_iter < max_ls: # line-search bracket is so small - if abs(bracket[1] - bracket[0]) * d_norm < tolerance_change: + if abs(bracket[1] - bracket[0]) * d_norm < tolerance_change: # type: ignore[possibly-undefined] break # compute new trial value t = _cubic_interpolate( bracket[0], bracket_f[0], - bracket_gtd[0], + bracket_gtd[0], # type: ignore[possibly-undefined] bracket[1], bracket_f[1], bracket_gtd[1], @@ -147,7 +149,7 @@ def _strong_wolfe( # Armijo condition not satisfied or not lower than lowest point bracket[high_pos] = t bracket_f[high_pos] = f_new - bracket_g[high_pos] = g_new.clone(memory_format=torch.contiguous_format) + bracket_g[high_pos] = g_new.clone(memory_format=torch.contiguous_format) # type: ignore[possibly-undefined] bracket_gtd[high_pos] = gtd_new low_pos, high_pos = (0, 1) if bracket_f[0] <= bracket_f[1] else (1, 0) else: @@ -158,19 +160,19 @@ def _strong_wolfe( # old high becomes new low bracket[high_pos] = bracket[low_pos] bracket_f[high_pos] = bracket_f[low_pos] - bracket_g[high_pos] = bracket_g[low_pos] + bracket_g[high_pos] = bracket_g[low_pos] # type: ignore[possibly-undefined] bracket_gtd[high_pos] = bracket_gtd[low_pos] # new point becomes new low bracket[low_pos] = t bracket_f[low_pos] = f_new - bracket_g[low_pos] = g_new.clone(memory_format=torch.contiguous_format) + bracket_g[low_pos] = g_new.clone(memory_format=torch.contiguous_format) # type: 
ignore[possibly-undefined] bracket_gtd[low_pos] = gtd_new # return stuff - t = bracket[low_pos] + t = bracket[low_pos] # type: ignore[possibly-undefined] f_new = bracket_f[low_pos] - g_new = bracket_g[low_pos] + g_new = bracket_g[low_pos] # type: ignore[possibly-undefined] return f_new, g_new, t, ls_func_evals @@ -210,14 +212,14 @@ class LBFGS(Optimizer): def __init__( self, - params, - lr=1, - max_iter=20, - max_eval=None, - tolerance_grad=1e-7, - tolerance_change=1e-9, - history_size=100, - line_search_fn=None, + params: ParamsT, + lr: float = 1, + max_iter: int = 20, + max_eval: Optional[int] = None, + tolerance_grad: float = 1e-7, + tolerance_change: float = 1e-9, + history_size: int = 100, + line_search_fn: Optional[str] = None, ): if max_eval is None: max_eval = max_iter * 5 // 4 diff --git a/torch/optim/lbfgs.pyi b/torch/optim/lbfgs.pyi deleted file mode 100644 index c7c0ac060881a..0000000000000 --- a/torch/optim/lbfgs.pyi +++ /dev/null @@ -1,16 +0,0 @@ -from typing import Optional - -from .optimizer import Optimizer, ParamsT - -class LBFGS(Optimizer): - def __init__( - self, - params: ParamsT, - lr: float = ..., - max_iter: int = ..., - max_eval: Optional[int] = ..., - tolerance_grad: float = ..., - tolerance_change: float = ..., - history_size: int = ..., - line_search_fn: Optional[str] = ..., - ) -> None: ... diff --git a/torch/optim/lr_scheduler.py b/torch/optim/lr_scheduler.py index 6d0daaf9d7184..77bdb6b46aac0 100644 --- a/torch/optim/lr_scheduler.py +++ b/torch/optim/lr_scheduler.py @@ -155,7 +155,18 @@ def print_lr( lr: float, epoch: Optional[int] = None, ): - """Display the current learning rate.""" + """Display the current learning rate. + + .. deprecated:: 2.4 + ``print_lr()`` is deprecated. Please use ``get_last_lr()`` to access the + learning rate. + """ + warnings.warn( + "`LRScheduler.print_lr()` is being deprecated. To fetch the learning rate, " + "please use `get_last_lr()` instead. 
For more details, " + "see https://github.com/pytorch/pytorch/issues/99270.", + UserWarning, + ) if is_verbose: if epoch is None: print(f"Adjusting learning rate of group {group} to {lr:.4e}.") diff --git a/torch/optim/nadam.py b/torch/optim/nadam.py index 901036897f564..cca41f5bc8427 100644 --- a/torch/optim/nadam.py +++ b/torch/optim/nadam.py @@ -1,4 +1,4 @@ -from typing import List, Optional +from typing import cast, List, Optional, Tuple, Union import torch from torch import Tensor @@ -9,12 +9,14 @@ _disable_dynamo_if_unsupported, _dispatch_sqrt, _foreach_doc, + _get_capturable_supported_devices, _get_scalar_dtype, _get_value, _stack_if_compiling, _use_grad_for_differentiable, _view_as_real, Optimizer, + ParamsT, ) __all__ = ["NAdam", "nadam"] @@ -23,12 +25,12 @@ class NAdam(Optimizer): def __init__( self, - params, - lr=2e-3, - betas=(0.9, 0.999), - eps=1e-8, - weight_decay=0, - momentum_decay=4e-3, + params: ParamsT, + lr: float = 2e-3, + betas: Tuple[float, float] = (0.9, 0.999), + eps: float = 1e-8, + weight_decay: float = 0, + momentum_decay: float = 4e-3, decoupled_weight_decay: bool = False, *, foreach: Optional[bool] = None, @@ -155,13 +157,13 @@ def step(self, closure=None): loss = closure() for group in self.param_groups: - params_with_grad = [] - grads = [] - exp_avgs = [] - exp_avg_sqs = [] - mu_products = [] - state_steps = [] - beta1, beta2 = group["betas"] + params_with_grad: List[Tensor] = [] + grads: List[Tensor] = [] + exp_avgs: List[Tensor] = [] + exp_avg_sqs: List[Tensor] = [] + mu_products: List[Tensor] = [] + state_steps: List[Tensor] = [] + beta1, beta2 = cast(Tuple[float, float], group["betas"]) has_complex = self._init_group( group, @@ -293,9 +295,14 @@ def _single_tensor_nadam( # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: - assert (param.is_cuda and mu_product.is_cuda and step_t.is_cuda) or ( - param.is_xla and mu_product.is_xla and step_t.is_xla - ), "If capturable=True, params, mu_products, and state_steps must be CUDA or XLA tensors." + capturable_supported_devices = _get_capturable_supported_devices() + assert ( + param.device.type == mu_product.device.type == step_t.device.type + and param.device.type in capturable_supported_devices + ), ( + f"If capturable=True, params, mu_products and state_steps must be " + f"on supported devices: {capturable_supported_devices}." + ) # update step step_t += 1 @@ -373,10 +380,14 @@ def _multi_tensor_nadam( # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: + capturable_supported_devices = _get_capturable_supported_devices( + supports_xla=False + ) assert all( - p.is_cuda and mp.is_cuda and step.is_cuda + p.device.type == mp.device.type == step.device.type + and p.device.type in capturable_supported_devices for p, mp, step in zip(params, mu_products, state_steps) - ), "If capturable=True, params, mu_products, and state_steps must be CUDA tensors." + ), f"If capturable=True, params, mu_products, and state_steps must be on supported devices: {capturable_supported_devices}." 
grouped_tensors = Optimizer._group_tensors_by_device_and_dtype( [params, grads, exp_avgs, exp_avg_sqs, mu_products, state_steps] @@ -411,7 +422,7 @@ def _multi_tensor_nadam( # Perform stepweight decay torch._foreach_mul_(grouped_params, 1 - lr * weight_decay) else: - grouped_grads = torch._foreach_add( + grouped_grads = torch._foreach_add( # type: ignore[assignment] grouped_grads, grouped_params, alpha=weight_decay ) @@ -425,6 +436,9 @@ def _multi_tensor_nadam( exp_avg_sq_sqrt = torch._foreach_sqrt(grouped_exp_avg_sqs) + bias_correction_sqrt: Union[Tuple[Tensor, ...], List[Tensor]] + mus: Union[Tuple[Tensor, ...], List[Tensor]] + mu_nexts: Union[Tuple[Tensor, ...], List[Tensor]] if capturable: # mus will be beta1 * (1 - 0.5 * 0.96 ** (step * momentum_decay)) exponent = torch._foreach_mul(grouped_state_steps, momentum_decay) @@ -524,10 +538,10 @@ def _multi_tensor_nadam( ) torch._foreach_addcdiv_( - grouped_params, grouped_grads, exp_avg_sq_sqrt, step_size_grads + grouped_params, grouped_grads, exp_avg_sq_sqrt, step_size_grads # type: ignore[arg-type] ) torch._foreach_addcdiv_( - grouped_params, grouped_exp_avgs, exp_avg_sq_sqrt, step_size_expavg + grouped_params, grouped_exp_avgs, exp_avg_sq_sqrt, step_size_expavg # type: ignore[arg-type] ) diff --git a/torch/optim/nadam.pyi b/torch/optim/nadam.pyi deleted file mode 100644 index f62e188b3d72b..0000000000000 --- a/torch/optim/nadam.pyi +++ /dev/null @@ -1,15 +0,0 @@ -from typing import Tuple - -from .optimizer import Optimizer, ParamsT - -class NAdam(Optimizer): - def __init__( - self, - params: ParamsT, - lr: float = ..., - betas: Tuple[float, float] = ..., - eps: float = ..., - weight_decay: float = ..., - momentum_decay: float = ..., - decoupled_weight_decay: bool = ..., - ) -> None: ... diff --git a/torch/optim/optimizer.py b/torch/optim/optimizer.py index fbcf95744bad8..1b76f6287af36 100644 --- a/torch/optim/optimizer.py +++ b/torch/optim/optimizer.py @@ -38,6 +38,8 @@ Kwargs: TypeAlias = Dict[str, Any] StateDict: TypeAlias = Dict[str, Any] TensorListList: TypeAlias = List[List[torch.Tensor]] +DeviceDict = Dict[Optional[torch.device], torch.Tensor] + GlobalOptimizerPreHook: TypeAlias = Callable[ ["Optimizer", Args, Kwargs], Optional[Tuple[Args, Kwargs]] @@ -213,6 +215,16 @@ def _get_scalar_dtype(is_fused=None): ) +def _get_capturable_supported_devices(supports_xla: bool = True) -> List[str]: + r"""Return the device type list that supports capturable optimizer.""" + capturable_supported_devices = ["cuda"] + if not torch.jit.is_scripting(): + capturable_supported_devices.append(torch._C._get_privateuse1_backend_name()) + if supports_xla: + capturable_supported_devices.append("xla") + return capturable_supported_devices + + # Common doc strings among optimizers _foreach_doc = r"""foreach (bool, optional): whether foreach implementation of optimizer is used. If unspecified by the user (so foreach is None), we will try to use @@ -222,7 +234,7 @@ def _get_scalar_dtype(is_fused=None): being a tensorlist vs just one tensor. If memory is prohibitive, batch fewer parameters through the optimizer at a time or switch this flag to False (default: None)""" -_fused_doc = r"""fused (bool, optional): whether the fused implementation is used. +_fused_doc = r"""fused (bool, optional): whether the fused implementation (CUDA only) is used. Currently, `torch.float64`, `torch.float32`, `torch.float16`, and `torch.bfloat16` are supported. 
(default: None) diff --git a/torch/optim/radam.py b/torch/optim/radam.py index 10c38a14a6aa3..18330f98ec7ae 100644 --- a/torch/optim/radam.py +++ b/torch/optim/radam.py @@ -1,4 +1,4 @@ -from typing import List, Optional +from typing import cast, List, Optional, Tuple, Union import torch from torch import Tensor @@ -10,11 +10,13 @@ _disable_dynamo_if_unsupported, _dispatch_sqrt, _foreach_doc, + _get_capturable_supported_devices, _get_scalar_dtype, _get_value, _use_grad_for_differentiable, _view_as_real, Optimizer, + ParamsT, ) __all__ = ["RAdam", "radam"] @@ -23,11 +25,11 @@ class RAdam(Optimizer): def __init__( self, - params, - lr=1e-3, - betas=(0.9, 0.999), - eps=1e-8, - weight_decay=0, + params: ParamsT, + lr: float = 1e-3, + betas: Tuple[float, float] = (0.9, 0.999), + eps: float = 1e-8, + weight_decay: float = 0, decoupled_weight_decay: bool = False, *, foreach: Optional[bool] = None, @@ -127,12 +129,12 @@ def step(self, closure=None): loss = closure() for group in self.param_groups: - params_with_grad = [] - grads = [] - exp_avgs = [] - exp_avg_sqs = [] - state_steps = [] - beta1, beta2 = group["betas"] + params_with_grad: List[Tensor] = [] + grads: List[Tensor] = [] + exp_avgs: List[Tensor] = [] + exp_avg_sqs: List[Tensor] = [] + state_steps: List[Tensor] = [] + beta1, beta2 = cast(Tuple[float, float], group["betas"]) has_complex = self._init_group( group, params_with_grad, grads, exp_avgs, exp_avg_sqs, state_steps @@ -247,8 +249,8 @@ def _single_tensor_radam( lr: float, weight_decay: float, eps: float, - differentiable: bool, decoupled_weight_decay: bool, + differentiable: bool, capturable: bool, has_complex: bool, ): @@ -260,9 +262,11 @@ def _single_tensor_radam( # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: - assert (param.is_cuda and step_t.is_cuda) or ( - param.is_xla and step_t.is_xla - ), "If capturable=True, params and state_steps must be CUDA or XLA tensors." + capturable_supported_devices = _get_capturable_supported_devices() + assert ( + param.device.type == step_t.device.type + and param.device.type in capturable_supported_devices + ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}." if torch.is_complex(param): param = torch.view_as_real(param) @@ -355,9 +359,14 @@ def _multi_tensor_radam( # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: + capturable_supported_devices = _get_capturable_supported_devices( + supports_xla=False + ) assert all( - p.is_cuda and step.is_cuda for p, step in zip(params, state_steps) - ), "If capturable=True, params and state_steps must be CUDA tensors." + p.device.type == step.device.type + and p.device.type in capturable_supported_devices + for p, step in zip(params, state_steps) + ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}." 
grouped_tensors = Optimizer._group_tensors_by_device_and_dtype( [params, grads, exp_avgs, exp_avg_sqs, state_steps] @@ -388,6 +397,9 @@ def _multi_tensor_radam( # maximum length of the approximated SMA rho_inf = 2 / (1 - beta2) - 1 # compute the length of the approximated SMA + bias_correction1: Union[Tuple[Tensor, ...], List[Tensor]] + bias_correction2: Union[Tuple[Tensor, ...], List[Tensor]] + rho_t_list: Union[Tuple[Tensor, ...], List[Tensor]] if capturable: bias_correction1 = torch._foreach_pow(beta2, grouped_state_steps) torch._foreach_neg_(bias_correction1) @@ -413,7 +425,7 @@ def _multi_tensor_radam( if decoupled_weight_decay: torch._foreach_mul_(grouped_params, 1 - lr * weight_decay) else: - grouped_grads = torch._foreach_add( + grouped_grads = torch._foreach_add( # type: ignore[assignment] grouped_grads, grouped_params, alpha=weight_decay ) @@ -469,7 +481,7 @@ def _multi_tensor_radam( else: rect = [ _dispatch_sqrt( - (rho_t - 4) + (rho_t - 4) # type: ignore[arg-type] * (rho_t - 2) * rho_inf / ((rho_inf - 4) * (rho_inf - 2) * rho_t) diff --git a/torch/optim/radam.pyi b/torch/optim/radam.pyi deleted file mode 100644 index b001376b05ef4..0000000000000 --- a/torch/optim/radam.pyi +++ /dev/null @@ -1,14 +0,0 @@ -from typing import Tuple - -from .optimizer import Optimizer, ParamsT - -class RAdam(Optimizer): - def __init__( - self, - params: ParamsT, - lr: float = ..., - betas: Tuple[float, float] = ..., - eps: float = ..., - weight_decay: float = ..., - decoupled_weight_decay: bool = ..., - ) -> None: ... diff --git a/torch/optim/rmsprop.py b/torch/optim/rmsprop.py index dc4491b553b24..b3375c338b40f 100644 --- a/torch/optim/rmsprop.py +++ b/torch/optim/rmsprop.py @@ -8,11 +8,13 @@ _differentiable_doc, _disable_dynamo_if_unsupported, _foreach_doc, + _get_capturable_supported_devices, _get_scalar_dtype, _maximize_doc, _use_grad_for_differentiable, _view_as_real, Optimizer, + ParamsT, ) __all__ = ["RMSprop", "rmsprop"] @@ -21,12 +23,12 @@ class RMSprop(Optimizer): def __init__( self, - params, - lr=1e-2, - alpha=0.99, - eps=1e-8, - weight_decay=0, - momentum=0, + params: ParamsT, + lr: float = 1e-2, + alpha: float = 0.99, + eps: float = 1e-8, + weight_decay: float = 0, + momentum: float = 0, centered=False, capturable=False, foreach: Optional[bool] = None, @@ -146,12 +148,12 @@ def step(self, closure=None): loss = closure() for group in self.param_groups: - params_with_grad = [] - grads = [] - square_avgs = [] - grad_avgs = [] - momentum_buffer_list = [] - state_steps = [] + params_with_grad: List[Tensor] = [] + grads: List[Tensor] = [] + square_avgs: List[Tensor] = [] + grad_avgs: List[Tensor] = [] + momentum_buffer_list: List[Tensor] = [] + state_steps: List[Tensor] = [] has_complex = self._init_group( group, @@ -275,9 +277,11 @@ def _single_tensor_rmsprop( # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: - assert (param.is_cuda and step.is_cuda) or ( - param.is_xla and step.is_xla - ), "If capturable=True, params and state_steps must be CUDA or XLA tensors." + capturable_supported_devices = _get_capturable_supported_devices() + assert ( + param.device.type == step.device.type + and param.device.type in capturable_supported_devices + ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}." 
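# Sketch (illustrative only) of why the Union[Tuple[Tensor, ...], List[Tensor]]
# annotations and type: ignore[assignment] comments above are needed: the grouped
# inputs may arrive as tuples, but out-of-place torch._foreach_* ops always hand
# back a fresh list of tensors.
import torch

grads = (torch.ones(2), torch.ones(3))                  # tuple in
params = (torch.full((2,), 2.0), torch.full((3,), 2.0))
out = torch._foreach_add(grads, params, alpha=0.1)      # list out
assert isinstance(out, list) and torch.allclose(out[0], torch.full((2,), 1.2))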
grad = grads[i] grad = grad if not maximize else -grad @@ -346,10 +350,12 @@ def _multi_tensor_rmsprop( # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: + capturable_supported_devices = _get_capturable_supported_devices() assert all( - (p.is_cuda and step.is_cuda) or (p.is_xla and step.is_xla) + p.device.type == step.device.type + and p.device.type in capturable_supported_devices for p, step in zip(params, state_steps) - ), "If capturable=True, params and state_steps must be CUDA tensors." + ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}." grouped_tensors = Optimizer._group_tensors_by_device_and_dtype( [params, grads, square_avgs, grad_avgs, momentum_buffer_list, state_steps] @@ -373,7 +379,7 @@ def _multi_tensor_rmsprop( _view_as_real(grouped_params, *state_and_grads) if maximize: - grouped_grads = torch._foreach_neg(grouped_grads) + grouped_grads = torch._foreach_neg(grouped_grads) # type: ignore[assignment] # Update steps # If steps are on CPU, foreach will fall back to the slow path, which is a for-loop calling t.add(1) over @@ -391,7 +397,7 @@ def _multi_tensor_rmsprop( if maximize: torch._foreach_add_(grouped_grads, grouped_params, alpha=weight_decay) else: - grouped_grads = torch._foreach_add( + grouped_grads = torch._foreach_add( # type: ignore[assignment] grouped_grads, grouped_params, alpha=weight_decay ) diff --git a/torch/optim/rmsprop.pyi b/torch/optim/rmsprop.pyi deleted file mode 100644 index f206d542dcecb..0000000000000 --- a/torch/optim/rmsprop.pyi +++ /dev/null @@ -1,13 +0,0 @@ -from .optimizer import Optimizer, ParamsT - -class RMSprop(Optimizer): - def __init__( - self, - params: ParamsT, - lr: float = ..., - alpha: float = ..., - eps: float = ..., - weight_decay: float = ..., - momentum: float = ..., - centered: bool = ..., - ) -> None: ... 
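# Sketch (not from the patch): with the constructor annotations now inline, the
# deleted rmsprop.pyi stub is redundant -- tools can read the hints directly off
# the .py definition.
import inspect
from torch.optim import RMSprop

sig = inspect.signature(RMSprop.__init__)
print(sig.parameters["lr"].annotation, sig.parameters["lr"].default)  # <class 'float'> 0.01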
diff --git a/torch/optim/rprop.py b/torch/optim/rprop.py index b252f5214cb8a..ec40aae5c90a9 100644 --- a/torch/optim/rprop.py +++ b/torch/optim/rprop.py @@ -1,4 +1,4 @@ -from typing import List, Optional +from typing import List, Optional, Tuple import torch from torch import Tensor @@ -8,11 +8,13 @@ _differentiable_doc, _disable_dynamo_if_unsupported, _foreach_doc, + _get_capturable_supported_devices, _get_scalar_dtype, _maximize_doc, _use_grad_for_differentiable, _view_as_real, Optimizer, + ParamsT, ) __all__ = ["Rprop", "rprop"] @@ -21,10 +23,10 @@ class Rprop(Optimizer): def __init__( self, - params, - lr=1e-2, - etas=(0.5, 1.2), - step_sizes=(1e-6, 50), + params: ParamsT, + lr: float = 1e-2, + etas: Tuple[float, float] = (0.5, 1.2), + step_sizes: Tuple[float, float] = (1e-6, 50), *, capturable: bool = False, foreach: Optional[bool] = None, @@ -120,11 +122,11 @@ def step(self, closure=None): loss = closure() for group in self.param_groups: - params = [] - grads = [] - prevs = [] - step_sizes = [] - state_steps = [] + params: List[Tensor] = [] + grads: List[Tensor] = [] + prevs: List[Tensor] = [] + step_sizes: List[Tensor] = [] + state_steps: List[Tensor] = [] etaminus, etaplus = group["etas"] step_size_min, step_size_max = group["step_sizes"] @@ -235,9 +237,11 @@ def _single_tensor_rprop( # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: - assert (param.is_cuda and step.is_cuda) or ( - param.is_xla and step.is_xla - ), "If capturable=True, params and state_steps must be CUDA or XLA tensors." + capturable_supported_devices = _get_capturable_supported_devices() + assert ( + param.device.type == step.device.type + and param.device.type in capturable_supported_devices + ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}." step += 1 @@ -299,10 +303,12 @@ def _multi_tensor_rprop( # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: + capturable_supported_devices = _get_capturable_supported_devices() assert all( - (p.is_cuda and step.is_cuda) or (p.is_xla and step.is_xla) + p.device.type == step.device.type + and p.device.type in capturable_supported_devices for p, step in zip(params, state_steps) - ), "If capturable=True, params and state_steps must be CUDA or XLA tensors." + ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}." grouped_tensors = Optimizer._group_tensors_by_device_and_dtype( [params, grads, prevs, step_sizes, state_steps] diff --git a/torch/optim/rprop.pyi b/torch/optim/rprop.pyi deleted file mode 100644 index fd0c6ba209161..0000000000000 --- a/torch/optim/rprop.pyi +++ /dev/null @@ -1,12 +0,0 @@ -from typing import Tuple - -from .optimizer import Optimizer, ParamsT - -class Rprop(Optimizer): - def __init__( - self, - params: ParamsT, - lr: float = ..., - etas: Tuple[float, float] = ..., - step_sizes: Tuple[float, float] = ..., - ) -> None: ... 
diff --git a/torch/optim/sgd.py b/torch/optim/sgd.py index b346958204710..c0efc24430787 100644 --- a/torch/optim/sgd.py +++ b/torch/optim/sgd.py @@ -10,6 +10,7 @@ _fused_doc, _maximize_doc, _use_grad_for_differentiable, + DeviceDict, Optimizer, ) @@ -20,10 +21,10 @@ class SGD(Optimizer): def __init__( self, params, - lr=1e-3, - momentum=0, - dampening=0, - weight_decay=0, + lr: float = 1e-3, + momentum: float = 0, + dampening: float = 0, + weight_decay: float = 0, nesterov=False, *, maximize: bool = False, @@ -80,13 +81,13 @@ def __setstate__(self, state): group.setdefault("differentiable", False) group.setdefault("fused", False) - def _init_group(self, group, params_with_grad, d_p_list, momentum_buffer_list): + def _init_group(self, group, params, grads, momentum_buffer_list): has_sparse_grad = False for p in group["params"]: if p.grad is not None: - params_with_grad.append(p) - d_p_list.append(p.grad) + params.append(p) + grads.append(p.grad) if p.grad.is_sparse: has_sparse_grad = True @@ -110,17 +111,17 @@ def step(self, closure=None): loss = closure() for group in self.param_groups: - params_with_grad = [] - d_p_list = [] - momentum_buffer_list = [] + params: List[Tensor] = [] + grads: List[Tensor] = [] + momentum_buffer_list: List[Optional[Tensor]] = [] has_sparse_grad = self._init_group( - group, params_with_grad, d_p_list, momentum_buffer_list + group, params, grads, momentum_buffer_list ) sgd( - params_with_grad, - d_p_list, + params, + grads, momentum_buffer_list, weight_decay=group["weight_decay"], momentum=group["momentum"], @@ -137,7 +138,7 @@ def step(self, closure=None): if group["momentum"] != 0: # update momentum_buffers in state - for p, momentum_buffer in zip(params_with_grad, momentum_buffer_list): + for p, momentum_buffer in zip(params, momentum_buffer_list): state = self.state[p] state["momentum_buffer"] = momentum_buffer @@ -245,7 +246,7 @@ def sgd( momentum_buffer_list: List[Optional[Tensor]], # kwonly args with defaults are not supported by functions compiled with torchscript issue #70627 # setting this as kwarg for now as functional API is compiled by torch/distributed/optim - has_sparse_grad: bool = None, + has_sparse_grad: bool = False, foreach: Optional[bool] = None, fused: Optional[bool] = None, grad_scale: Optional[Tensor] = None, @@ -312,7 +313,7 @@ def sgd( def _single_tensor_sgd( params: List[Tensor], - d_p_list: List[Tensor], + grads: List[Tensor], momentum_buffer_list: List[Optional[Tensor]], grad_scale: Optional[Tensor], found_inf: Optional[Tensor], @@ -328,26 +329,26 @@ def _single_tensor_sgd( assert grad_scale is None and found_inf is None for i, param in enumerate(params): - d_p = d_p_list[i] if not maximize else -d_p_list[i] + grad = grads[i] if not maximize else -grads[i] if weight_decay != 0: - d_p = d_p.add(param, alpha=weight_decay) + grad = grad.add(param, alpha=weight_decay) if momentum != 0: buf = momentum_buffer_list[i] if buf is None: - buf = torch.clone(d_p).detach() + buf = torch.clone(grad).detach() momentum_buffer_list[i] = buf else: - buf.mul_(momentum).add_(d_p, alpha=1 - dampening) + buf.mul_(momentum).add_(grad, alpha=1 - dampening) if nesterov: - d_p = d_p.add(buf, alpha=momentum) + grad = grad.add(buf, alpha=momentum) else: - d_p = buf + grad = buf - param.add_(d_p, alpha=-lr) + param.add_(grad, alpha=-lr) def _multi_tensor_sgd( @@ -371,7 +372,7 @@ def _multi_tensor_sgd( return grouped_tensors = Optimizer._group_tensors_by_device_and_dtype( - [params, grads, momentum_buffer_list], with_indices=True + [params, grads, 
momentum_buffer_list], with_indices=True # type: ignore[list-item] ) for ( device_params, @@ -383,14 +384,14 @@ def _multi_tensor_sgd( ) if maximize: - device_grads = torch._foreach_neg(device_grads) + device_grads = torch._foreach_neg(device_grads) # type: ignore[assignment] if weight_decay != 0: # Re-use the intermediate memory (device_grads) already allocated for maximize if maximize: torch._foreach_add_(device_grads, device_params, alpha=weight_decay) else: - device_grads = torch._foreach_add( + device_grads = torch._foreach_add( # type: ignore[assignment] device_grads, device_params, alpha=weight_decay ) @@ -458,10 +459,12 @@ def _fused_sgd( return if has_sparse_grad: raise RuntimeError("`_fused_sgd` does not support sparse gradients") - grad_scale_dict = ( - {grad_scale.device: grad_scale} if grad_scale is not None else None + grad_scale_dict: DeviceDict = ( + {grad_scale.device: grad_scale} if grad_scale is not None else {} + ) + found_inf_dict: DeviceDict = ( + {found_inf.device: found_inf} if found_inf is not None else {} ) - found_inf_dict = {found_inf.device: found_inf} if found_inf is not None else None no_momentum_buffer = momentum == 0 is_first_step = ( @@ -471,21 +474,19 @@ def _fused_sgd( for i, g in enumerate(grads): momentum_buffer_list[i] = torch.empty_like(g) grouped_tensors = Optimizer._group_tensors_by_device_and_dtype( - [params, grads, momentum_buffer_list], with_indices=False + [params, grads, momentum_buffer_list], with_indices=False # type: ignore[list-item] ) - for (device, dtype), ( + for (device, _), ( (device_params, device_grads, device_momentum_buffer_list), _, ) in grouped_tensors.items(): device_grad_scale, device_found_inf = None, None if grad_scale is not None: - if device not in grad_scale_dict: - grad_scale_dict[device] = grad_scale.to(device) - device_grad_scale = grad_scale_dict[device] - if found_inf is not None: - if device not in found_inf_dict: - found_inf_dict[device] = found_inf.to(device) - device_found_inf = found_inf_dict[device] + device_grad_scale = grad_scale_dict.setdefault( + device, grad_scale.to(device) + ) + if found_inf_dict is not None and found_inf is not None: + device_found_inf = found_inf_dict.setdefault(device, found_inf.to(device)) torch._fused_sgd_( device_params, device_grads, diff --git a/torch/optim/sgd.pyi b/torch/optim/sgd.pyi deleted file mode 100644 index ba1bcd60a1b89..0000000000000 --- a/torch/optim/sgd.pyi +++ /dev/null @@ -1,12 +0,0 @@ -from .optimizer import Optimizer, ParamsT - -class SGD(Optimizer): - def __init__( - self, - params: ParamsT, - lr: float = ..., - momentum: float = ..., - dampening: float = ..., - weight_decay: float = ..., - nesterov: bool = ..., - ) -> None: ... diff --git a/torch/optim/sparse_adam.py b/torch/optim/sparse_adam.py index e3ee2db8204b0..88643d1a56461 100644 --- a/torch/optim/sparse_adam.py +++ b/torch/optim/sparse_adam.py @@ -1,13 +1,21 @@ +from typing import List, Tuple + import torch +from torch import Tensor from . 
import _functional as F -from .optimizer import _maximize_doc, Optimizer +from .optimizer import _maximize_doc, Optimizer, ParamsT __all__ = ["SparseAdam"] class SparseAdam(Optimizer): def __init__( - self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, maximize: bool = False + self, + params: ParamsT, + lr: float = 1e-3, + betas: Tuple[float, float] = (0.9, 0.999), + eps: float = 1e-8, + maximize: bool = False, ): if not 0.0 < lr: raise ValueError(f"Invalid learning rate: {lr}") @@ -56,13 +64,11 @@ def step(self, closure=None): loss = closure() for group in self.param_groups: - params_with_grad = [] - grads = [] - exp_avgs = [] - exp_avg_sqs = [] - state_steps = [] - eps = group["eps"] - lr = group["lr"] + params_with_grad: List[Tensor] = [] + grads: List[Tensor] = [] + exp_avgs: List[Tensor] = [] + exp_avg_sqs: List[Tensor] = [] + state_steps: List[int] = [] beta1, beta2 = group["betas"] maximize = group.get("maximize", False) @@ -103,10 +109,10 @@ def step(self, closure=None): exp_avgs, exp_avg_sqs, state_steps, + eps=group["eps"], beta1=beta1, beta2=beta2, lr=group["lr"], - eps=group["eps"], maximize=maximize, ) diff --git a/torch/optim/sparse_adam.pyi b/torch/optim/sparse_adam.pyi deleted file mode 100644 index a84001d590b8c..0000000000000 --- a/torch/optim/sparse_adam.pyi +++ /dev/null @@ -1,12 +0,0 @@ -from typing import Tuple - -from .optimizer import Optimizer, ParamsT - -class SparseAdam(Optimizer): - def __init__( - self, - params: ParamsT, - lr: float = ..., - betas: Tuple[float, float] = ..., - eps: float = ..., - ) -> None: ... diff --git a/torch/optim/swa_utils.py b/torch/optim/swa_utils.py index 62bb93c906358..7c2c9cdaf6f92 100644 --- a/torch/optim/swa_utils.py +++ b/torch/optim/swa_utils.py @@ -2,7 +2,7 @@ import math import warnings from copy import deepcopy -from typing import Any, Callable, cast, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Callable, Iterable, List, Literal, Optional, Tuple, Union import torch from torch import Tensor @@ -21,11 +21,7 @@ "get_swa_avg_fn", ] -from torch.utils._foreach_utils import ( - _group_tensors_by_device_and_dtype, - Indices, - TensorListList, -) +from torch.utils._foreach_utils import _group_tensors_by_device_and_dtype PARAM_LIST = Union[Tuple[Tensor, ...], List[Tensor]] @@ -192,6 +188,7 @@ class AveragedModel(Module): .. 
_Polyak averaging: https://paperswithcode.com/method/polyak-averaging """ + n_averaged: Tensor def __init__( self, @@ -231,8 +228,8 @@ def update_parameters(self, model: Module): if self.use_buffers else model.parameters() ) - self_param_detached = [] - model_param_detached = [] + self_param_detached: List[Optional[Tensor]] = [] + model_param_detached: List[Optional[Tensor]] = [] for p_averaged, p_model in zip(self_param, model_param): p_model_ = p_model.detach().to(p_averaged.device) self_param_detached.append(p_averaged.detach()) @@ -243,14 +240,7 @@ def update_parameters(self, model: Module): if self.n_averaged > 0: if self.multi_avg_fn is not None or self.avg_fn is None: grouped_tensors = _group_tensors_by_device_and_dtype( - cast(TensorListList, [self_param_detached, model_param_detached]) - ) - grouped_tensors = cast( - Dict[ - Tuple[torch.device, torch.dtype], - Tuple[List[List[Tensor]], Indices], - ], - grouped_tensors, + [self_param_detached, model_param_detached] ) for (device, _), ( [self_params, model_params], @@ -258,9 +248,12 @@ def update_parameters(self, model: Module): ) in grouped_tensors.items(): if self.multi_avg_fn: self.multi_avg_fn( - self_params, model_params, self.n_averaged.to(device) + self_params, model_params, self.n_averaged.to(device) # type: ignore[arg-type] ) - elif device.type in _get_foreach_kernels_supported_devices(): + elif ( + device is not None + and device.type in _get_foreach_kernels_supported_devices() + ): multi_avg_fn = get_swa_multi_avg_fn() multi_avg_fn( self_params, model_params, self.n_averaged.to(device) @@ -268,10 +261,10 @@ def update_parameters(self, model: Module): else: avg_fn = get_swa_avg_fn() n_averaged = self.n_averaged.to(device) - for p_averaged, p_model in zip(self_params, model_params): + for p_averaged, p_model in zip(self_params, model_params): # type: ignore[assignment] p_averaged.copy_(avg_fn(p_averaged, p_model, n_averaged)) else: - for p_averaged, p_model in zip( + for p_averaged, p_model in zip( # type: ignore[assignment] self_param_detached, model_param_detached ): n_averaged = self.n_averaged.to(p_averaged.device) @@ -394,7 +387,7 @@ def __init__( optimizer: Optimizer, swa_lr: float, anneal_epochs=10, - anneal_strategy="cos", + anneal_strategy: Literal["cos", "linear"] = "cos", last_epoch=-1, ): swa_lrs = self._format_param(optimizer, swa_lr) @@ -417,7 +410,10 @@ def __init__( super().__init__(optimizer, last_epoch) @staticmethod - def _format_param(optimizer, swa_lrs): + def _format_param( + optimizer: Optimizer, + swa_lrs: Union[float, List[float], Tuple[float, ...]], + ) -> Union[List[float], Tuple[float, ...]]: if isinstance(swa_lrs, (list, tuple)): if len(swa_lrs) != len(optimizer.param_groups): raise ValueError( diff --git a/torch/profiler/profiler.py b/torch/profiler/profiler.py index 6e40d0f68bd23..f9efd00f1bb06 100644 --- a/torch/profiler/profiler.py +++ b/torch/profiler/profiler.py @@ -148,7 +148,6 @@ def stop(self): def prepare_trace(self): if self.profiler is None: self.profiler = prof.profile( - use_cuda=(ProfilerActivity.CUDA in self.activities), use_cpu=(ProfilerActivity.CPU in self.activities), use_mtia=(ProfilerActivity.MTIA in self.activities), use_device=self.use_device, diff --git a/torch/testing/_internal/autocast_test_lists.py b/torch/testing/_internal/autocast_test_lists.py index 4e2a654562444..8527084f4afa8 100644 --- a/torch/testing/_internal/autocast_test_lists.py +++ b/torch/testing/_internal/autocast_test_lists.py @@ -184,6 +184,7 @@ def __init__(self, dev): ("cross", (torch.randn(3, 
dtype=torch.float32, device=dev), torch.randn(3, dtype=torch.float16, device=dev))), ("dot", pointwise0_fp16 + pointwise1_fp32), + ("vdot", pointwise0_fp16 + pointwise1_fp32), ("grid_sampler", (torch.randn((2, 3, 33, 22), dtype=torch.float16, device=dev), torch.randn((2, 22, 11, 2), dtype=torch.float32, device=dev), 0, 0, False)), diff --git a/torch/testing/_internal/common_fsdp.py b/torch/testing/_internal/common_fsdp.py index 31bf3fafd2379..283982b2ba445 100644 --- a/torch/testing/_internal/common_fsdp.py +++ b/torch/testing/_internal/common_fsdp.py @@ -17,11 +17,14 @@ import torch.distributed as dist import torch.nn as nn import torch.nn.functional as F +from torch.distributed._composable import checkpoint +from torch.distributed._composable.fsdp import fully_shard from torch.distributed._composable.fsdp._fsdp_param_group import ( FSDPParamGroup, RegisterPostBackwardFunction, ) from torch.distributed._tensor import distribute_tensor, DTensor, Shard +from torch.distributed.device_mesh import DeviceMesh from torch.distributed.fsdp import CPUOffload, FullyShardedDataParallel as FSDP from torch.distributed.fsdp._common_utils import TrainingState from torch.distributed.fsdp._init_utils import NO_RESHARD_AFTER_FORWARD_STRATEGIES @@ -32,6 +35,11 @@ ) from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler from torch.distributed.fsdp.wrap import always_wrap_policy, ModuleWrapPolicy, wrap +from torch.distributed.tensor.parallel import ( + ColwiseParallel, + parallelize_module, + RowwiseParallel, +) from torch.nn import TransformerDecoderLayer, TransformerEncoderLayer from torch.nn.parallel.distributed import DistributedDataParallel as DDP from torch.testing._internal.common_distributed import ( @@ -856,6 +864,47 @@ def reset_parameters(self): torch.nn.init.normal_(self.buffer) +class MLPStack(nn.Sequential): + def __init__(self, mlp_dim: int): + modules = [ + nn.LayerNorm(mlp_dim, bias=False), + # Use multiplier of 3 to exercise uneven case + MLP(mlp_dim, dim_multiplier=3), + MLP(mlp_dim), + MLP(mlp_dim, dim_multiplier=3), + ] + super().__init__(*modules) + + def parallelize( + self, + tp_mesh: DeviceMesh, + dp_mesh: DeviceMesh, + use_activation_checkpointing: bool, + reshard_after_forward: bool, + ) -> "MLPStack": + parallelize_module( + self, + device_mesh=tp_mesh, + # Leave the layer norm as implicitly replicated + parallelize_plan={ + # Pass `use_local_output=False` to keep as DTensor to preserve + # uneven activation dims + "1.in_proj": ColwiseParallel(use_local_output=False), + "1.out_proj": RowwiseParallel(use_local_output=False), + "2.in_proj": ColwiseParallel(use_local_output=False), + "2.out_proj": RowwiseParallel(use_local_output=False), + "3.in_proj": ColwiseParallel(use_local_output=False), + "3.out_proj": RowwiseParallel(), + }, + ) + for mlp in self: + if use_activation_checkpointing: + checkpoint(mlp) + fully_shard(mlp, mesh=dp_mesh, reshard_after_forward=reshard_after_forward) + fully_shard(self, mesh=dp_mesh, reshard_after_forward=reshard_after_forward) + return self + + class DoubleLinear(nn.Module): """ This can be used for returning multiple outputs from a module diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 678350789b2b4..d456bb520db06 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -14951,8 +14951,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): # RuntimeError: Cannot insert a Tensor 
that requires grad as a constant. # Consider making it a parameter or input, or detaching the gradient DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit', dtypes=(torch.float32,)), - DecorateInfo(unittest.expectedFailure, 'TestCompositeCompliance', 'test_forward_ad', - active_if=TEST_WITH_ROCM) ], sample_inputs_func=sample_inputs_instance_norm, supports_expanded_weight=True,), @@ -16273,10 +16271,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): DecorateInfo(unittest.expectedFailure, 'TestNNCOpInfo', 'test_nnc_correctness'), DecorateInfo(unittest.skip('Skipped!'), 'TestNNCOpInfo', 'test_nnc_correctness', device_type='cpu', dtypes=(torch.bfloat16, torch.float16)), - # Trying to use forward AD with miopen_batch_norm that does not support it - # because it has not been implemented yet. - DecorateInfo(unittest.expectedFailure, 'TestCompositeCompliance', 'test_forward_ad', - device_type="cuda", active_if=TEST_WITH_ROCM), DecorateInfo(toleranceOverride({torch.float32: tol(atol=5e-05, rtol=1e-05)}), 'TestCompositeCompliance', 'test_forward_ad', device_type="cpu"), )), diff --git a/torch/testing/_internal/common_optimizers.py b/torch/testing/_internal/common_optimizers.py index 5a66923373f74..61396b6226301 100644 --- a/torch/testing/_internal/common_optimizers.py +++ b/torch/testing/_internal/common_optimizers.py @@ -1146,8 +1146,7 @@ def _get_optim_inputs_including_global_cliquey_kwargs( Adagrad, optim_inputs_func=optim_inputs_func_adagrad, optim_error_inputs_func=optim_error_inputs_func_adagrad, - supported_impls=("foreach", "differentiable", "fused"), - supports_fused_on=("cpu",), + supported_impls=("foreach", "differentiable"), supports_sparse=True, metadata_for_sparse=( {"lr": 0.1, "weight_decay": 0, "lr_decay": 0}, @@ -1156,23 +1155,6 @@ def _get_optim_inputs_including_global_cliquey_kwargs( lambda opt: ReduceLROnPlateau(opt, threshold=1e-4), ], ), - decorators=( - DecorateInfo( - # Note on tolerances: - # difference comes from the fact that the non fused kernel have - # more dtype cast operations. 
We have another test test_fused_cpu_matches_cuda - # to make sure there is no discrepancies between cuda fused kernel - # and cpu fused kernel - toleranceOverride( - { - torch.bfloat16: tol(atol=5e-3, rtol=5e-3), - torch.float16: tol(atol=5e-3, rtol=5e-3), - } - ), - "TestOptimRenewed", - "test_fused_matches_forloop", - ), - ), skips=( DecorateInfo( skipIfMps, # addcdiv doesn't work for non-contiguous, see #118115 diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index 435caa69041be..1805134130936 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -1,6 +1,7 @@ # mypy: ignore-errors import copy +import json import itertools import math import os @@ -8,7 +9,7 @@ import sys import tempfile import time -from collections import namedtuple, OrderedDict +from collections import namedtuple, OrderedDict, defaultdict from contextlib import contextmanager, nullcontext from dataclasses import dataclass from datetime import timedelta @@ -204,6 +205,24 @@ def get_profiling_event(event_name, profiler, dedup_gpu_user_annotation=False): ) ] +def get_profiler_nccl_meta(prof): + """Torch profiler includes nccl metadata in an inserted operator called "record_param_comms" + We will need to test metadata obtained from profiler here""" + tf = tempfile.NamedTemporaryFile( + mode="w+t", suffix=".json", delete=False + ) + tf.close() + trace_file = tf.name + + prof.export_chrome_trace(trace_file) + with open(trace_file) as f: + events = json.load(f)["traceEvents"] + print(f"Trace saved to {trace_file}") + + # Comment to debug + os.remove(trace_file) + + return [e for e in events if e.get("name") == "record_param_comms"] # Base error message substring on unfinished reductions. 
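# Standalone sketch of the trace-inspection pattern that get_profiler_nccl_meta()
# above relies on; this single-process CPU run has no NCCL collectives, so the
# filtered list is simply empty here.
import json
import os
import tempfile

import torch
from torch.profiler import profile, ProfilerActivity

with profile(activities=[ProfilerActivity.CPU]) as prof:
    torch.ones(8) + torch.ones(8)

tf = tempfile.NamedTemporaryFile(mode="w+t", suffix=".json", delete=False)
tf.close()
prof.export_chrome_trace(tf.name)
with open(tf.name) as f:
    events = json.load(f)["traceEvents"]
os.remove(tf.name)
print(len([e for e in events if e.get("name") == "record_param_comms"]))  # 0 on CPU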
ddp_prev_reduction_unfinished_str = ( @@ -659,6 +678,33 @@ def _verify_buffers_equal(self, m1, m2): for b in gathered_bufs_m2: self.assertEqual(b, buf2) + def _sanity_check_profiler_nccl_meta(self, nccl_meta_events): + """Torch profiler includes nccl metadata in an inserted operator called "record_param_comms" + We test for basic fields in this profiler event that correspond to the nccl communication + collectives""" + per_coll_meta = defaultdict(list) + for e in nccl_meta_events: + args = e.get("args", {}) + collname = args.get("Collective name", "") + self.assertNotEqual(collname, "") + self.assertNotEqual(args.get("dtype", ""), "") + + per_coll_meta[collname].append(args) + if collname in {"wait"}: + continue + + self.assertEqual(args["Process Group Description"], "default_pg") + self.assertNotEqual(args["Process Group Ranks"], "") + + self.assertGreaterEqual(args.get("In msg nelems", -1), 0) + self.assertGreaterEqual(args.get("Out msg nelems", -1), 0) + self.assertGreaterEqual(args.get("Group size", -1), 0) + self.assertGreaterEqual(args.get("Global rank start", -1), 0) + self.assertGreaterEqual(args.get("Global rank stride", -1), 0) + + # print(per_coll_meta) + return per_coll_meta + def test_dump_DDP_relevant_env_vars(self): with captured_output() as (out, _): _dump_DDP_relevant_env_vars() @@ -1588,6 +1634,7 @@ def _test_send_recv_nccl(self, profiler_ctx=None): for event in events: self.assertTrue(event.input_shapes in expected_shapes) + @skip_if_no_gpu @skip_but_pass_in_sandcastle_if(BACKEND != "nccl", "NCCL Send Recv Only") @requires_nccl_version((2, 7, 0), "Need NCCL 2.7+ for send/recv") @@ -6880,6 +6927,8 @@ def _test_ddp_profiling(self, profiler_ctx): events = get_profiling_event("search_unused_parameters", prof) self.assertEqual(len(events), 1) + return prof + @require_backend_is_available(DistTestCases.backend_feature["gpu"]) @skip_if_lt_x_gpu(2) @skip_but_pass_in_sandcastle("Currently failing in NVIDIA internal CI") @@ -6898,7 +6947,29 @@ def test_ddp_profiling_torch_profiler(self): cpu_act = torch.profiler.ProfilerActivity.CPU cuda_act = torch.profiler.ProfilerActivity.CUDA torch_profiler_ctx = torch.profiler.profile(activities=[cpu_act, cuda_act]) - self._test_ddp_profiling(profiler_ctx=torch_profiler_ctx) + prof = self._test_ddp_profiling(profiler_ctx=torch_profiler_ctx) + + if dist.get_backend() != "nccl": + return + + # Note comment out the "os.remove(trace_file)" in `get_profiler_nccl_meta()` + # to debug any mismatches. 
+ nccl_meta_events = get_profiler_nccl_meta(prof) + self.assertGreater(len(nccl_meta_events), 0) + + nccl_meta = self._sanity_check_profiler_nccl_meta(nccl_meta_events) + + # additionally check the specific collectives in this test case + self.assertEqual(len(nccl_meta["allreduce"]), 2) + self.assertEqual(len(nccl_meta["wait"]), 1) + + # check allreduce message sizes + a0 = nccl_meta["allreduce"][0] + self.assertEqual(a0["Out msg nelems"], 100, msg=f"{a0}") + self.assertEqual(a0["dtype"], "Float", msg=f"{a0}") + a1 = nccl_meta["allreduce"][1] + self.assertEqual(a1["Out msg nelems"], 1, msg=f"{a1}") + self.assertEqual(a1["dtype"], "Int", msg=f"{a1}") @skip_if_lt_x_gpu(2) @skip_but_pass_in_sandcastle_if( diff --git a/torch/utils/_python_dispatch.py b/torch/utils/_python_dispatch.py index d22b550c6d1a5..c417f1d9d72a2 100644 --- a/torch/utils/_python_dispatch.py +++ b/torch/utils/_python_dispatch.py @@ -205,6 +205,7 @@ def _disable_current_modes(): ) from torch._subclasses.functional_tensor import FunctionalTensorMode from torch.fx.experimental.proxy_tensor import ProxyTorchDispatchMode + from torch._subclasses.schema_check_mode import SchemaCheckMode mode_len_pre_dispatch = _len_torch_dispatch_stack_pre_dispatch() old_pre_dispatch_modes = [ @@ -213,12 +214,15 @@ def _disable_current_modes(): has_proxy_mode_in_pre_dispatch = False has_functional_mode_in_pre_dispatch = False + has_schema_check_mode_in_pre_dispatch = False for i in old_pre_dispatch_modes: if isinstance(i, ProxyTorchDispatchMode): has_proxy_mode_in_pre_dispatch = True if isinstance(i, FunctionalTensorMode): has_functional_mode_in_pre_dispatch = True + if isinstance(i, SchemaCheckMode): + has_schema_check_mode_in_pre_dispatch = True mode_len = _len_torch_dispatch_stack() old_modes = [_pop_mode() for _ in range(mode_len)] @@ -235,6 +239,13 @@ def _disable_current_modes(): raise AssertionError( "Can't have ProxyTorchDispatchMode available both in PreDispatch and Python Key" ) + if ( + isinstance(old, SchemaCheckMode) + and has_schema_check_mode_in_pre_dispatch + ): + raise AssertionError( + "Can't have SchemaCheckMode available both in PreDispatch and Python Key" + ) # Manually disable proxy and fake modes, if any are active try: diff --git a/torch/utils/_sympy/symbol.py b/torch/utils/_sympy/symbol.py index ea2d2b7293f36..89908a09e1971 100644 --- a/torch/utils/_sympy/symbol.py +++ b/torch/utils/_sympy/symbol.py @@ -19,6 +19,7 @@ class SymT(Enum): SIZE = auto() + FLOAT = auto() UNBACKED_INT = auto() UNBACKED_FLOAT = auto() # Inductor: The intermediates in inner_fn tmp0, one generated per ops call. @@ -54,7 +55,11 @@ class SymT(Enum): prefix_str = { SymT.SIZE: "s", # integer SymT.UNBACKED_INT: "u", # integer - SymT.UNBACKED_FLOAT: "f", + # Prefix z here is chosen to avoid false aliasing in symbol_is_type test + # DO NOT add a "z" type. 
You also need to avoid conflicts on these + # prefixes but this is somewhat easier to manage + SymT.FLOAT: "zf", + SymT.UNBACKED_FLOAT: "zuf", SymT.TMP: "tmp", SymT.PRECOMPUTED_SIZE: "ps", SymT.INDEX: "i", diff --git a/torch/utils/_sympy/value_ranges.py b/torch/utils/_sympy/value_ranges.py index f2319e930d769..eae126b1b4dcd 100644 --- a/torch/utils/_sympy/value_ranges.py +++ b/torch/utils/_sympy/value_ranges.py @@ -872,12 +872,15 @@ def bound_sympy( # size variables can come with a lower bound of 2, as we specialise on 0 and 1 unbounded_ranges: Dict[sympy.Symbol, ValueRanges] = {} for s in unbounded_vars: - assert s.is_integer # type: ignore[attr-defined] - if s.is_positive: # type: ignore[attr-defined] - lower = 1 - elif s.is_nonnegative: # type: ignore[attr-defined] - lower = 0 + if s.is_integer: # type: ignore[attr-defined] + if s.is_positive: # type: ignore[attr-defined] + lower = 1 + elif s.is_nonnegative: # type: ignore[attr-defined] + lower = 0 + else: + lower = -math.inf # type: ignore[assignment] else: + # Don't bother trying very hard here lower = -math.inf # type: ignore[assignment] unbounded_ranges[s] = ValueRanges(lower, math.inf) # type: ignore[index] ranges = {**ranges, **unbounded_ranges} diff --git a/torch/xpu/streams.py b/torch/xpu/streams.py index 2c3c3a63d58bd..f4e35a376e7c2 100644 --- a/torch/xpu/streams.py +++ b/torch/xpu/streams.py @@ -2,6 +2,7 @@ import torch from torch._streambase import _EventBase, _StreamBase + from .._utils import _dummy_type @@ -34,7 +35,7 @@ def __new__(cls, device=None, priority=0, **kwargs): with torch.xpu.device(device): return super().__new__(cls, priority=priority, **kwargs) - def wait_event(self, event): + def wait_event(self, event) -> None: r"""Make all future work submitted to the stream wait for an event. Args: @@ -42,7 +43,7 @@ def wait_event(self, event): """ event.wait(self) - def wait_stream(self, stream): + def wait_stream(self, stream) -> None: r"""Synchronize with another stream. All future work submitted to this stream will wait until all kernels @@ -68,7 +69,7 @@ def record_event(self, event=None): event.record(self) return event - def query(self): + def query(self) -> bool: r"""Check if all the work submitted has been completed. Returns: @@ -76,7 +77,7 @@ def query(self): """ return super().query() - def synchronize(self): + def synchronize(self) -> None: r"""Wait for all the kernels in this stream to complete.""" super().synchronize() @@ -114,7 +115,7 @@ class Event(torch._C._XpuEventBase, _EventBase): def __new__(cls, enable_timing=False): return super().__new__(cls, enable_timing=enable_timing) - def record(self, stream=None): + def record(self, stream=None) -> None: r"""Record the event in a given stream. Uses ``torch.xpu.current_stream()`` if no stream is specified. The @@ -124,7 +125,7 @@ def record(self, stream=None): stream = torch.xpu.current_stream() super().record(stream) - def wait(self, stream=None): + def wait(self, stream=None) -> None: r"""Make all future work submitted to the given stream wait for this event. Use ``torch.xpu.current_stream()`` if no stream is specified. @@ -133,7 +134,7 @@ def wait(self, stream=None): stream = torch.xpu.current_stream() super().wait(stream) - def query(self): + def query(self) -> bool: r"""Check if all work currently captured by event has completed. Returns: @@ -150,7 +151,7 @@ def elapsed_time(self, end_event): """ return super().elapsed_time(end_event) - def synchronize(self): + def synchronize(self) -> None: r"""Wait for the event to complete. 
Waits until the completion of all work currently captured in this event. diff --git a/torchgen/aoti/fallback_ops.py b/torchgen/aoti/fallback_ops.py index aba595e141925..f77527a156beb 100644 --- a/torchgen/aoti/fallback_ops.py +++ b/torchgen/aoti/fallback_ops.py @@ -65,6 +65,7 @@ "aten.histogram.bin_ct", "aten._histogramdd_bin_edges.default", "aten._histogramdd_from_bin_cts.default", + "aten.index_put.default", "aten.index_reduce.default", "aten.index.Tensor", "aten.kthvalue.default", @@ -82,7 +83,9 @@ "aten.mm.out", "aten.mode.default", "aten.mul.Scalar", + "aten.mul.Tensor", "aten.nanmedian.default", + "aten.native_dropout.default", "aten.nonzero.default", "aten.ormqr.default", "aten._pdist_backward.default", @@ -93,6 +96,8 @@ "aten.rand.default", "aten.rand.generator", "aten.randint.default", + "aten.randint.generator", + "aten.randint.low_out", "aten.randn.default", "aten.randn.generator", "aten.randperm.default", @@ -110,9 +115,11 @@ "aten._scaled_mm.default", "aten.scatter_reduce.two_out", "aten.scatter.src_out", + "aten.scatter.value_out", "aten.searchsorted.default", "aten._segment_reduce_backward.default", "aten.segment_reduce.default", + "aten.slice.Tensor", "aten.soft_margin_loss_backward.default", "aten.sort.default", "aten.sort.stable", diff --git a/torchgen/gen.py b/torchgen/gen.py index 28e46c3536e6d..d715361146ea0 100644 --- a/torchgen/gen.py +++ b/torchgen/gen.py @@ -49,8 +49,8 @@ from torchgen.gen_aoti_c_shim import ( gen_aoti_c_shim, gen_static_dispatch_backend_call_signature, - get_backend_index_for_aoti, get_fallback_op_name, + get_header_for_aoti, ) from torchgen.gen_functionalization_type import ( gen_functionalization_definition, @@ -2353,54 +2353,28 @@ def operator_headers() -> List[str]: else: raise AssertionError(f"unrecognized {dispatch_key} for ufunc") - structured_func_group_dict = { - f"{func_group.functional.namespace}.{func_group.functional.func.name}": func_group - for func_group in structured_native_functions - } + structured_func_group_dict = dict() + for func_group in structured_native_functions: + for func in func_group.functions(): + if func.structured_delegate is not None: + structured_func_group_dict[func.structured_delegate] = func_group + break + if dispatch_key in (DispatchKey.CPU, DispatchKey.CUDA): fallbacks = dict() for func in native_functions: op_name = get_fallback_op_name(func) if op_name in inductor_fallback_ops: - fallbacks[op_name] = ( - func, - structured_func_group_dict.get( - f"{func.namespace}.{func.func.name.name}", None - ), - ) + fallbacks[op_name] = func fallback_native_functions = tuple( value for _, value in sorted(fallbacks.items()) ) - def get_header( - func: NativeFunction, - func_group: Optional[NativeFunctionsGroup], - ) -> Optional[str]: - backend_index = get_backend_index_for_aoti( - func, func_group, dispatch_key, backend_indices - ) - return ( - None - if backend_index is None - else f"#include " - ) - - def headers_for_aoti() -> str: - headers = [] - for func, func_group in fallback_native_functions: - header = get_header(func, func_group) - if header is not None: - headers.append(header) - return "\n".join(sorted(set(headers))) - - extra_headers = ( - extra_cuda_headers if is_cuda_dispatch_key(dispatch_key) else "" - ) - # header files were checked in for ABI-compatiblilty checking header_file_name = f"c_shim_{dispatch_key.lower()}.h" new_header = gen_aoti_c_shim( fallback_native_functions, + structured_func_group_dict, dispatch_key, backend_indices, header=True, @@ -2442,10 +2416,25 @@ def headers_for_aoti() -> str: 
) # cpp files are always generated on-the-fly + def headers_for_aoti() -> str: + headers = [] + for func in fallback_native_functions: + header = get_header_for_aoti( + func, structured_func_group_dict, dispatch_key, backend_indices + ) + if header is not None: + headers.append(header) + return "\n".join(sorted(set(headers))) + + extra_headers = ( + extra_cuda_headers if is_cuda_dispatch_key(dispatch_key) else "" + ) + aoti_fm.write( f"c_shim_{dispatch_key.lower()}.cpp", lambda: gen_aoti_c_shim( fallback_native_functions, + structured_func_group_dict, dispatch_key, backend_indices, header=False, diff --git a/torchgen/gen_aoti_c_shim.py b/torchgen/gen_aoti_c_shim.py index 0d31bd14a5e6f..1f99e3a9f3fae 100644 --- a/torchgen/gen_aoti_c_shim.py +++ b/torchgen/gen_aoti_c_shim.py @@ -16,6 +16,7 @@ ListType, NativeFunction, NativeFunctionsGroup, + OperatorName, OptionalType, Type, ) @@ -209,7 +210,11 @@ def convert_return(typ: BaseType, val: str) -> str: ret_pointer_can_be_null = False unambiguous_name = schema.name.unambiguous_name() - for name in ["_scaled_dot_product_flash_attention", "convolution_backward"]: + for name in [ + "_scaled_dot_product_flash_attention", + "_scaled_dot_product_efficient_attention", + "convolution_backward", + ]: if name in unambiguous_name: ret_pointer_can_be_null = True break @@ -302,15 +307,17 @@ def gen_static_dispatch_backend_call( def get_backend_index_for_aoti( func: NativeFunction, - func_group: Optional[NativeFunctionsGroup], + func_group_mapping: Dict[OperatorName, NativeFunctionsGroup], dispatch_key: DispatchKey, backend_indices: Dict[DispatchKey, BackendIndex], ) -> Optional[BackendIndex]: backend_index = None if backend_indices[dispatch_key].has_kernel(func) or ( func.structured_delegate is not None - and func_group is not None - and backend_indices[dispatch_key].has_kernel(func_group) + and func.structured_delegate in func_group_mapping + and backend_indices[dispatch_key].has_kernel( + func_group_mapping[func.structured_delegate] + ) ): backend_index = backend_indices[dispatch_key] elif backend_indices[DispatchKey.CompositeExplicitAutograd].has_kernel(func): @@ -327,6 +334,22 @@ def get_backend_index_for_aoti( return backend_index +def get_header_for_aoti( + func: NativeFunction, + func_group_mapping: Dict[OperatorName, NativeFunctionsGroup], + dispatch_key: DispatchKey, + backend_indices: Dict[DispatchKey, BackendIndex], +) -> Optional[str]: + backend_index = get_backend_index_for_aoti( + func, func_group_mapping, dispatch_key, backend_indices + ) + return ( + None + if backend_index is None + else f"#include " + ) + + def get_fallback_op_name(func: NativeFunction) -> str: return ( f"{func.namespace}.{func.func.name.name}.{func.func.name.overload_name}" @@ -337,13 +360,13 @@ def get_fallback_op_name(func: NativeFunction) -> str: def gen_c_shim( func: NativeFunction, - func_group: Optional[NativeFunctionsGroup], + func_group_mapping: Dict[OperatorName, NativeFunctionsGroup], dispatch_key: DispatchKey, backend_indices: Dict[DispatchKey, BackendIndex], header: bool, ) -> Optional[str]: backend_index = get_backend_index_for_aoti( - func, func_group, dispatch_key, backend_indices + func, func_group_mapping, dispatch_key, backend_indices ) if backend_index is None: return None @@ -371,7 +394,7 @@ def gen_c_shim( @dataclass(frozen=True) class ShimGenerator: - func_group_mapping: Dict[str, Optional[NativeFunctionsGroup]] + func_group_mapping: Dict[OperatorName, NativeFunctionsGroup] dispatch_key: DispatchKey backend_indices: Dict[DispatchKey, BackendIndex] 
header: bool # True to generate .h and False to generate .cpp @@ -383,7 +406,7 @@ def __call__( ) -> Optional[str]: result = gen_c_shim( func, - self.func_group_mapping.get(get_fallback_op_name(func), None), + self.func_group_mapping, self.dispatch_key, self.backend_indices, self.header, @@ -392,22 +415,20 @@ def __call__( def gen_aoti_c_shim( - native_functions: Sequence[Tuple[NativeFunction, Optional[NativeFunctionsGroup]]], + native_functions: Sequence[NativeFunction], + func_group_mapping: Dict[OperatorName, NativeFunctionsGroup], dispatch_key: DispatchKey, backend_indices: Dict[DispatchKey, BackendIndex], header: bool, includes: str = "", ) -> str: - func_group_mapping = { - get_fallback_op_name(func): func_group for func, func_group in native_functions - } body = "\n".join( list( mapMaybe( ShimGenerator( func_group_mapping, dispatch_key, backend_indices, header ), - [func for func, _ in native_functions], + native_functions, ) ) )
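# Sketch with stand-in dataclasses (torchgen's real NativeFunction and
# NativeFunctionsGroup types are richer): this mirrors how gen.py above now keys the
# structured-group mapping by each function's structured_delegate (an OperatorName),
# so gen_aoti_c_shim() can look a fallback op's group up directly instead of being
# handed (func, group) pairs.
from dataclasses import dataclass, field
from typing import List, Optional

@dataclass(frozen=True)
class Func:
    name: str
    structured_delegate: Optional[str] = None  # stands in for OperatorName

@dataclass
class Group:
    funcs: List[Func] = field(default_factory=list)

    def functions(self) -> List[Func]:
        return self.funcs

groups = [Group([Func("add.Tensor", "add.out"), Func("add.out")])]
mapping = {}
for func_group in groups:
    for func in func_group.functions():
        if func.structured_delegate is not None:
            mapping[func.structured_delegate] = func_group
            break
print("add.out" in mapping)  # True: the group is found via the delegate's name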