diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index 426f4698c2b00..73e3f09394b72 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -84,13 +84,27 @@ fi # CMake 3.18 is needed to support CUDA17 language variant CMAKE_VERSION=3.18.5 -_UCX_COMMIT=00bcc6bb18fc282eb160623b4c0d300147f579af -_UCC_COMMIT=7cb07a76ccedad7e56ceb136b865eb9319c258ea +_UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb +_UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b # It's annoying to rename jobs every time you want to rewrite a # configuration, so we hardcode everything here rather than do it # from scratch case "$image" in + pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9) + CUDA_VERSION=12.4.0 + CUDNN_VERSION=8 + ANACONDA_PYTHON_VERSION=3.10 + GCC_VERSION=9 + PROTOBUF=yes + DB=yes + VISION=yes + KATEX=yes + UCX_COMMIT=${_UCX_COMMIT} + UCC_COMMIT=${_UCC_COMMIT} + CONDA_CMAKE=yes + TRITON=yes + ;; pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9) CUDA_VERSION=12.1.1 CUDNN_VERSION=8 @@ -105,6 +119,21 @@ case "$image" in CONDA_CMAKE=yes TRITON=yes ;; + pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9-inductor-benchmarks) + CUDA_VERSION=12.4.0 + CUDNN_VERSION=8 + ANACONDA_PYTHON_VERSION=3.10 + GCC_VERSION=9 + PROTOBUF=yes + DB=yes + VISION=yes + KATEX=yes + UCX_COMMIT=${_UCX_COMMIT} + UCC_COMMIT=${_UCC_COMMIT} + CONDA_CMAKE=yes + TRITON=yes + INDUCTOR_BENCHMARKS=yes + ;; pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks) CUDA_VERSION=12.1.1 CUDNN_VERSION=8 @@ -134,6 +163,20 @@ case "$image" in CONDA_CMAKE=yes TRITON=yes ;; + pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9) + CUDA_VERSION=12.4.0 + CUDNN_VERSION=8 + ANACONDA_PYTHON_VERSION=3.10 + GCC_VERSION=9 + PROTOBUF=yes + DB=yes + VISION=yes + KATEX=yes + UCX_COMMIT=${_UCX_COMMIT} + UCC_COMMIT=${_UCC_COMMIT} + CONDA_CMAKE=yes + TRITON=yes + ;; pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9) CUDA_VERSION=12.1.1 CUDNN_VERSION=8 @@ -226,7 +269,7 @@ case "$image" in PROTOBUF=yes DB=yes VISION=yes - BASEKIT_VERSION=2024.0.0-49522 + XPU_VERSION=0.5 NINJA_VERSION=1.9.0 CONDA_CMAKE=yes TRITON=yes @@ -403,7 +446,7 @@ docker build \ --build-arg "DOCS=${DOCS}" \ --build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \ --build-arg "EXECUTORCH=${EXECUTORCH}" \ - --build-arg "BASEKIT_VERSION=${BASEKIT_VERSION}" \ + --build-arg "XPU_VERSION=${XPU_VERSION}" \ --build-arg "ACL=${ACL:-}" \ --build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \ --build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \ diff --git a/.ci/docker/common/install_cudnn.sh b/.ci/docker/common/install_cudnn.sh index f654c9fee24e6..3afd2f28841f5 100644 --- a/.ci/docker/common/install_cudnn.sh +++ b/.ci/docker/common/install_cudnn.sh @@ -4,7 +4,10 @@ if [[ ${CUDNN_VERSION} == 8 ]]; then # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement mkdir tmp_cudnn pushd tmp_cudnn - if [[ ${CUDA_VERSION:0:4} == "12.1" ]]; then + if [[ ${CUDA_VERSION:0:4} == "12.4" ]]; then + CUDNN_NAME="cudnn-linux-x86_64-8.9.7.29_cuda12-archive" + curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${CUDNN_NAME}.tar.xz + elif [[ ${CUDA_VERSION:0:4} == "12.1" ]]; then CUDNN_NAME="cudnn-linux-x86_64-8.9.2.26_cuda12-archive" curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${CUDNN_NAME}.tar.xz elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then diff --git a/.ci/docker/common/install_cusparselt.sh b/.ci/docker/common/install_cusparselt.sh index d418f1c75610e..493982919f8a4 100644 --- 
a/.ci/docker/common/install_cusparselt.sh +++ b/.ci/docker/common/install_cusparselt.sh @@ -5,9 +5,14 @@ set -ex # cuSPARSELt license: https://docs.nvidia.com/cuda/cusparselt/license.html mkdir tmp_cusparselt && cd tmp_cusparselt -if [[ ${CUDA_VERSION:0:4} == "12.1" ]]; then - CUSPARSELT_NAME="libcusparse_lt-linux-x86_64-0.5.2.1-archive" - curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/${CUSPARSELT_NAME}.tar.xz +if [[ ${CUDA_VERSION:0:4} =~ ^12\.[1-4]$ ]]; then + arch_path='sbsa' + export TARGETARCH=${TARGETARCH:-$(uname -m)} + if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then + arch_path='x86_64' + fi + CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.5.2.1-archive" + curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then CUSPARSELT_NAME="libcusparse_lt-linux-x86_64-0.4.0.7-archive" curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/${CUSPARSELT_NAME}.tar.xz diff --git a/.ci/docker/common/install_triton.sh b/.ci/docker/common/install_triton.sh index de009c1a3adbf..d2db4cb76bfcf 100755 --- a/.ci/docker/common/install_triton.sh +++ b/.ci/docker/common/install_triton.sh @@ -15,7 +15,7 @@ conda_reinstall() { if [ -n "${ROCM_VERSION}" ]; then TRITON_REPO="https://github.com/openai/triton" TRITON_TEXT_FILE="triton-rocm" -elif [ -n "${BASEKIT_VERSION}" ]; then +elif [ -n "${XPU_VERSION}" ]; then TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton" TRITON_TEXT_FILE="triton-xpu" else diff --git a/.ci/docker/common/install_xpu.sh b/.ci/docker/common/install_xpu.sh index d98ad2049b47c..aa308010326a7 100644 --- a/.ci/docker/common/install_xpu.sh +++ b/.ci/docker/common/install_xpu.sh @@ -3,10 +3,7 @@ set -xe # Intel® software for general purpose GPU capabilities. -# Refer to https://dgpu-docs.intel.com/releases/LTS_803.29_20240131.html - -# Intel® oneAPI Base Toolkit (version 2024.0.0) has been updated to include functional and security updates. -# Refer to https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html +# Refer to https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html # Users should update to the latest version as it becomes available @@ -17,14 +14,16 @@ function install_ubuntu() { # Set up the repository. 
To do this, download the key to the system keyring wget -qO - https://repositories.intel.com/gpu/intel-graphics.key \ | gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg - wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \ | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null + wget -qO - https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \ | gpg --dearmor --output /usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg # Add the signed entry to APT sources and configure the APT client to use the Intel repository - echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" \ | tee /etc/apt/sources.list.d/intel-gpu-jammy.list + echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] \ + https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" \ | tee /etc/apt/sources.list.d/intel-gpu-jammy.list - echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" \ | tee /etc/apt/sources.list.d/oneAPI.list + echo "deb [signed-by=/usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg] \ + https://apt.repos.intel.com/intel-for-pytorch-gpu-dev all main" \ | tee /etc/apt/sources.list.d/intel-for-pytorch-gpu-dev.list # Update the packages list and repository index apt-get update @@ -40,11 +39,11 @@ function install_ubuntu() { mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo # Development Packages apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev - # Install Intel® oneAPI Base Toolkit - if [ -n "$BASEKIT_VERSION" ]; then - apt-get install intel-basekit=$BASEKIT_VERSION -y + # Install Intel Support Packages + if [ -n "$XPU_VERSION" ]; then + apt-get install -y intel-for-pytorch-gpu-dev-${XPU_VERSION} else - apt-get install intel-basekit -y + apt-get install -y intel-for-pytorch-gpu-dev fi # Cleanup diff --git a/.ci/docker/ubuntu-cuda/Dockerfile b/.ci/docker/ubuntu-cuda/Dockerfile index dcf7312c108fc..f96ee5e3b1070 100644 --- a/.ci/docker/ubuntu-cuda/Dockerfile +++ b/.ci/docker/ubuntu-cuda/Dockerfile @@ -152,6 +152,7 @@ RUN rm install_cusparselt.sh RUN if [ -h /usr/local/cuda-11.6/cuda-11.6 ]; then rm /usr/local/cuda-11.6/cuda-11.6; fi RUN if [ -h /usr/local/cuda-11.7/cuda-11.7 ]; then rm /usr/local/cuda-11.7/cuda-11.7; fi RUN if [ -h /usr/local/cuda-12.1/cuda-12.1 ]; then rm /usr/local/cuda-12.1/cuda-12.1; fi +RUN if [ -h /usr/local/cuda-12.4/cuda-12.4 ]; then rm /usr/local/cuda-12.4/cuda-12.4; fi USER jenkins CMD ["bash"] diff --git a/.ci/docker/ubuntu-xpu/Dockerfile b/.ci/docker/ubuntu-xpu/Dockerfile index e49358fcbd0d9..02cd1133a050c 100644 --- a/.ci/docker/ubuntu-xpu/Dockerfile +++ b/.ci/docker/ubuntu-xpu/Dockerfile @@ -62,7 +62,7 @@ RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_d RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt # Install XPU Dependencies -ARG BASEKIT_VERSION +ARG XPU_VERSION COPY ./common/install_xpu.sh install_xpu.sh RUN bash ./install_xpu.sh && rm install_xpu.sh diff --git a/.ci/pytorch/build.sh b/.ci/pytorch/build.sh index b81caa0513691..4aa5dc39d0f5f 100755 --- a/.ci/pytorch/build.sh +++ b/.ci/pytorch/build.sh @@ -44,11 +44,6 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then fi fi -if [[ ${BUILD_ENVIRONMENT} == *"caffe2"* ]]; then - echo "Caffe2 build is ON" - export BUILD_CAFFE2=ON -fi - if [[ ${BUILD_ENVIRONMENT} == *"paralleltbb"*
]]; then export ATEN_THREADING=TBB export USE_TBB=1 diff --git a/.ci/pytorch/docs-test.sh b/.ci/pytorch/docs-test.sh index 557f9d348772f..ffc00b623c14f 100755 --- a/.ci/pytorch/docs-test.sh +++ b/.ci/pytorch/docs-test.sh @@ -6,4 +6,4 @@ source "$(dirname "${BASH_SOURCE[0]}")/common.sh" echo "Testing pytorch docs" cd docs -make doctest +TERM=vt100 make doctest diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh index 19d28eeefd9a8..6af49bee7d05e 100755 --- a/.ci/pytorch/test.sh +++ b/.ci/pytorch/test.sh @@ -588,6 +588,15 @@ test_inductor_torchbench_smoketest_perf() { "$TEST_REPORTS_DIR/inductor_training_smoketest_$test.csv" \ --expected benchmarks/dynamo/expected_ci_perf_inductor_torchbench.csv done + + # Perform some "warm-start" runs for a few huggingface models. + for test in AlbertForQuestionAnswering AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do + python benchmarks/dynamo/huggingface.py --accuracy --training --amp --inductor --device cuda --warm-start-latency \ + --only $test --output "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv" + python benchmarks/dynamo/check_accuracy.py \ + --actual "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv" \ + --expected "benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_training.csv" + done } test_inductor_torchbench_cpu_smoketest_perf(){ diff --git a/.github/ci_commit_pins/audio.txt b/.github/ci_commit_pins/audio.txt index 9830a3ce9650e..98cd949f97130 100644 --- a/.github/ci_commit_pins/audio.txt +++ b/.github/ci_commit_pins/audio.txt @@ -1 +1 @@ -ea437b31ce316ea3d66fe73768c0dcb94edb79ad +1980f8af5bcd0bb2ce51965cf79d8d4c25dad8a0 diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml index 4bf7526e79141..ade85af096871 100644 --- a/.github/pytorch-probot.yml +++ b/.github/pytorch-probot.yml @@ -1,6 +1,5 @@ tracking_issue: 24422 ciflow_tracking_issue: 64124 -TD_rollout_issue: 123120 ciflow_push_tags: - ciflow/binaries - ciflow/binaries_conda diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 6d822165895eb..9f0dfe973dc9f 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -38,6 +38,8 @@ jobs: matrix: runner: [linux.12xlarge] docker-image-name: [ + pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9, + pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9-inductor-benchmarks, pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9, pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks, pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9, diff --git a/.lintrunner.toml b/.lintrunner.toml index 938e9521f72d2..988b1697c8455 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -1053,12 +1053,6 @@ exclude_patterns = [ 'test/quantization/fx/test_quantize_fx.py', 'test/quantization/fx/test_subgraph_rewriter.py', 'test/test_datapipe.py', - 'test/test_fake_tensor.py', - 'test/test_flop_counter.py', - 'test/test_function_schema.py', - 'test/test_functional_autograd_benchmark.py', - 'test/test_functional_optim.py', - 'test/test_functionalization_of_rng_ops.py', 'test/test_futures.py', 'test/test_fx.py', 'test/test_fx_experimental.py', @@ -1143,7 +1137,6 @@ exclude_patterns = [ 'test/test_transformers.py', 'test/test_type_promotion.py', 'test/test_unary_ufuncs.py', - 'test/test_utils.py', 'test/test_vulkan.py', 'test/test_xnnpack_integration.py', 'test/torch_np/numpy_test/**/*.py', diff --git a/CMakeLists.txt b/CMakeLists.txt index 79db67e7357b5..f7561d606cbdb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ 
-181,11 +181,7 @@ option(BUILD_BINARY "Build C++ binaries" OFF) option(BUILD_DOCS "Build Caffe2 documentation" OFF) option(BUILD_CUSTOM_PROTOBUF "Build and use Caffe2's own protobuf under third_party" ON) option(BUILD_PYTHON "Build Python binaries" ON) -option(BUILD_CAFFE2 "Master flag to build Caffe2" OFF) option(BUILD_LITE_INTERPRETER "Master flag to build Lite Interpreter" OFF) -cmake_dependent_option( - BUILD_CAFFE2_OPS "Build Caffe2 operators" ON - "BUILD_CAFFE2" OFF) option(BUILD_SHARED_LIBS "Build libcaffe2.so" ON) cmake_dependent_option( CAFFE2_LINK_LOCAL_PROTOBUF "If set, build protobuf inside libcaffe2.so." ON @@ -281,8 +277,8 @@ if(NOT DEFINED USE_VULKAN) endif() option(USE_SLEEF_FOR_ARM_VEC256 "Use sleef for arm" OFF) -option(USE_SOURCE_DEBUG_ON_MOBILE "Enable " ON) -option(USE_LITE_INTERPRETER_PROFILER "Enable " ON) +option(USE_SOURCE_DEBUG_ON_MOBILE "Enable" ON) +option(USE_LITE_INTERPRETER_PROFILER "Enable" ON) option(USE_VULKAN_FP16_INFERENCE "Vulkan - Use fp16 inference" OFF) option(USE_VULKAN_RELAXED_PRECISION "Vulkan - Use relaxed precision math in the kernels (mediump)" OFF) # option USE_XNNPACK: try to enable xnnpack by default. @@ -635,7 +631,6 @@ if(INTERN_BUILD_MOBILE) endif() set(BUILD_PYTHON OFF) set(BUILD_FUNCTORCH OFF) - set(BUILD_CAFFE2_OPS OFF) set(USE_DISTRIBUTED OFF) set(NO_API ON) set(USE_FBGEMM OFF) @@ -1208,13 +1203,6 @@ else() "shared libs.") endif() -# ---[ Modules -# If master flag for buildling Caffe2 is disabled, we also disable the -# build for Caffe2 related operator modules. -if(BUILD_CAFFE2) - add_subdirectory(modules) -endif() - # ---[ Binaries # Binaries will be built after the Caffe2 main libraries and the modules # are built. For the binaries, they will be linked to the Caffe2 main diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e665e3fb8bbf6..a37c5a3b405db 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -667,7 +667,6 @@ only interested in a specific component. - Working on a test binary? Run `(cd build && ninja bin/test_binary_name)` to rebuild only that test binary (without rerunning cmake). (Replace `ninja` with `make` if you don't have ninja installed). -- Don't need Caffe2? Pass `BUILD_CAFFE2=0` to disable Caffe2 build. On the initial build, you can also speed things up with the environment variables `DEBUG`, `USE_DISTRIBUTED`, `USE_MKLDNN`, `USE_CUDA`, `USE_FLASH_ATTENTION`, `USE_MEM_EFF_ATTENTION`, `BUILD_TEST`, `USE_FBGEMM`, `USE_NNPACK` and `USE_QNNPACK`. @@ -1196,7 +1195,7 @@ build_with_asan() LDFLAGS="-stdlib=libstdc++" \ CFLAGS="-fsanitize=address -fno-sanitize-recover=all -shared-libasan -pthread" \ CXX_FLAGS="-pthread" \ - USE_CUDA=0 USE_OPENMP=0 BUILD_CAFFE2_OPS=0 USE_DISTRIBUTED=0 DEBUG=1 \ + USE_CUDA=0 USE_OPENMP=0 USE_DISTRIBUTED=0 DEBUG=1 \ python setup.py develop } diff --git a/README.md b/README.md index 3ff42586109c3..eb291b1c97e00 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,9 @@ Our trunk health (Continuous Integration signals) can be found at [hud.pytorch.o - [NVIDIA Jetson Platforms](#nvidia-jetson-platforms) - [From Source](#from-source) - [Prerequisites](#prerequisites) + - [NVIDIA CUDA Support](#nvidia-cuda-support) + - [AMD ROCm Support](#amd-rocm-support) + - [Intel GPU Support](#intel-gpu-support) - [Install Dependencies](#install-dependencies) - [Get the PyTorch Source](#get-the-pytorch-source) - [Install PyTorch](#install-pytorch) @@ -162,6 +165,7 @@ If you are installing from source, you will need: We highly recommend installing an [Anaconda](https://www.anaconda.com/download) environment. 
You will get a high-quality BLAS library (MKL) and you get controlled dependency versions regardless of your Linux distro. +##### NVIDIA CUDA Support If you want to compile with CUDA support, [select a supported version of CUDA from our support matrix](https://pytorch.org/get-started/locally/), then install the following: - [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) - [NVIDIA cuDNN](https://developer.nvidia.com/cudnn) v8.5 or above @@ -174,6 +178,7 @@ Other potentially useful environment variables may be found in `setup.py`. If you are building for NVIDIA's Jetson platforms (Jetson Nano, TX1, TX2, AGX Xavier), Instructions to install PyTorch for Jetson Nano are [available here](https://devtalk.nvidia.com/default/topic/1049071/jetson-nano/pytorch-for-jetson-nano/) +##### AMD ROCm Support If you want to compile with ROCm support, install - [AMD ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html) 4.0 and above installation - ROCm is currently supported only for Linux systems. @@ -181,6 +186,14 @@ If you want to compile with ROCm support, install If you want to disable ROCm support, export the environment variable `USE_ROCM=0`. Other potentially useful environment variables may be found in `setup.py`. +##### Intel GPU Support +If you want to compile with Intel GPU support, follow these +- [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html) instructions. +- Intel GPU is currently supported only for Linux systems. + +If you want to disable Intel GPU support, export the environment variable `USE_XPU=0`. +Other potentially useful environment variables may be found in `setup.py`. + #### Install Dependencies **Common** @@ -379,7 +392,7 @@ You can also pass the `CMAKE_VARS="..."` environment variable to specify additio See [setup.py](./setup.py) for the list of available variables. 
```bash -CMAKE_VARS="BUILD_CAFFE2=ON BUILD_CAFFE2_OPS=ON" make -f docker.Makefile +make -f docker.Makefile ``` ### Building the Documentation diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index 583662e6c63d0..9ec458fda45e4 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -54,7 +54,7 @@ if(NOT BUILD_LITE_INTERPRETER) endif() EXCLUDE(ATen_CORE_SRCS "${ATen_CORE_SRCS}" ${ATen_CORE_TEST_SRCS}) # Exclude TensorImpl_test.cpp if compiling without Caffe2 -if(NOT BUILD_CAFFE2 AND NOT BUILD_LITE_INTERPRETER) +if(NOT BUILD_LITE_INTERPRETER) file(GLOB_RECURSE ATen_CORE_EXCLUDED_TEST_SRCS "core/TensorImpl_test.cpp") EXCLUDE(ATen_CORE_TEST_SRCS "${ATen_CORE_TEST_SRCS}" ${ATen_CORE_EXCLUDED_TEST_SRCS}) endif() diff --git a/aten/src/ATen/CPUGeneratorImpl.cpp b/aten/src/ATen/CPUGeneratorImpl.cpp index 2d086ebbe71fe..156a2b663c033 100644 --- a/aten/src/ATen/CPUGeneratorImpl.cpp +++ b/aten/src/ATen/CPUGeneratorImpl.cpp @@ -81,8 +81,8 @@ inline uint64_t make64BitsFrom32Bits(uint32_t hi, uint32_t lo) { CPUGeneratorImpl::CPUGeneratorImpl(uint64_t seed_in) : c10::GeneratorImpl{Device(DeviceType::CPU), DispatchKeySet(c10::DispatchKey::CPU)}, engine_{seed_in}, - next_float_normal_sample_{c10::optional()}, - next_double_normal_sample_{c10::optional()} { } + next_float_normal_sample_{std::optional()}, + next_double_normal_sample_{std::optional()} { } /** * Manually seeds the engine with the seed input @@ -151,8 +151,8 @@ void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) { detail::check_rng_state(new_state); at::mt19937 engine; - auto float_normal_sample = c10::optional(); - auto double_normal_sample = c10::optional(); + auto float_normal_sample = std::optional(); + auto double_normal_sample = std::optional(); // Construct the state of at::CPUGeneratorImpl based on input byte tensor size. CPUGeneratorImplStateLegacy* legacy_pod{nullptr}; @@ -160,7 +160,7 @@ void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) { if (new_state_size == size_legacy) { legacy_pod = (CPUGeneratorImplStateLegacy*)new_state.data(); // Note that in CPUGeneratorImplStateLegacy, we didn't have float version - // of normal sample and hence we leave the c10::optional as is + // of normal sample and hence we leave the std::optional as is // Update next_double_normal_sample. // Note that CPUGeneratorImplStateLegacy stores two uniform values (normal_x, normal_y) @@ -171,14 +171,14 @@ void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) { auto r = legacy_pod->normal_rho; auto theta = 2.0 * c10::pi * legacy_pod->normal_x; // we return the sin version of the normal sample when in caching mode - double_normal_sample = c10::optional(r * ::sin(theta)); + double_normal_sample = std::optional(r * ::sin(theta)); } } else if (new_state_size == size_current) { auto rng_state = (CPUGeneratorImplState*)new_state.data(); legacy_pod = &rng_state->legacy_pod; // update next_float_normal_sample if (rng_state->is_next_float_normal_sample_valid) { - float_normal_sample = c10::optional(rng_state->next_float_normal_sample); + float_normal_sample = std::optional(rng_state->next_float_normal_sample); } // Update next_double_normal_sample. @@ -186,7 +186,7 @@ void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) { // and if it's valid in normal_is_valid. The redundant normal_x and normal_rho // are squashed to 0.0. 
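The CPUGeneratorImpl hunks above switch the cached normal sample from c10::optional to std::optional; the set_state path rebuilds that cache from the legacy POD state via the Box-Muller identity (r * sin(theta)), as the surrounding code shows. A minimal standalone sketch of that idea, assuming a simplified LegacyState with only the fields visible in the diff; the real code also handles the float variant and the newer state layout.

```cpp
#include <cmath>
#include <optional>

// Simplified stand-in for CPUGeneratorImplStateLegacy; field names follow the diff.
struct LegacyState {
  bool normal_is_valid;
  double normal_x;    // cached uniform sample
  double normal_rho;  // cached radius
};

// Rebuild the cached Box-Muller sample as a std::optional<double>:
// empty when the legacy state has no valid cached normal.
std::optional<double> restore_cached_normal(const LegacyState& s) {
  if (!s.normal_is_valid) {
    return std::nullopt;
  }
  const double pi = 3.14159265358979323846;
  double theta = 2.0 * pi * s.normal_x;
  // The generator caches the sin() half of the Box-Muller pair.
  return s.normal_rho * std::sin(theta);
}
```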
if (legacy_pod->normal_is_valid) { - double_normal_sample = c10::optional(legacy_pod->normal_y); + double_normal_sample = std::optional(legacy_pod->normal_y); } } else { AT_ERROR("Expected either a CPUGeneratorImplStateLegacy of size ", size_legacy, @@ -283,14 +283,14 @@ uint64_t CPUGeneratorImpl::random64() { /** * Get the cached normal random in float */ -c10::optional CPUGeneratorImpl::next_float_normal_sample() { +std::optional CPUGeneratorImpl::next_float_normal_sample() { return next_float_normal_sample_; } /** * Get the cached normal random in double */ -c10::optional CPUGeneratorImpl::next_double_normal_sample() { +std::optional CPUGeneratorImpl::next_double_normal_sample() { return next_double_normal_sample_; } @@ -299,7 +299,7 @@ c10::optional CPUGeneratorImpl::next_double_normal_sample() { * * See Note [Acquire lock when using random generators] */ -void CPUGeneratorImpl::set_next_float_normal_sample(c10::optional randn) { +void CPUGeneratorImpl::set_next_float_normal_sample(std::optional randn) { next_float_normal_sample_ = randn; } @@ -308,7 +308,7 @@ void CPUGeneratorImpl::set_next_float_normal_sample(c10::optional randn) * * See Note [Acquire lock when using random generators] */ -void CPUGeneratorImpl::set_next_double_normal_sample(c10::optional randn) { +void CPUGeneratorImpl::set_next_double_normal_sample(std::optional randn) { next_double_normal_sample_ = randn; } diff --git a/aten/src/ATen/CPUGeneratorImpl.h b/aten/src/ATen/CPUGeneratorImpl.h index f74c42f44fda5..34dd33a475b91 100644 --- a/aten/src/ATen/CPUGeneratorImpl.h +++ b/aten/src/ATen/CPUGeneratorImpl.h @@ -24,18 +24,18 @@ struct TORCH_API CPUGeneratorImpl : public c10::GeneratorImpl { static c10::DeviceType device_type(); uint32_t random(); uint64_t random64(); - c10::optional next_float_normal_sample(); - c10::optional next_double_normal_sample(); - void set_next_float_normal_sample(c10::optional randn); - void set_next_double_normal_sample(c10::optional randn); + std::optional next_float_normal_sample(); + std::optional next_double_normal_sample(); + void set_next_float_normal_sample(std::optional randn); + void set_next_double_normal_sample(std::optional randn); at::mt19937 engine(); void set_engine(at::mt19937 engine); private: CPUGeneratorImpl* clone_impl() const override; at::mt19937 engine_; - c10::optional next_float_normal_sample_; - c10::optional next_double_normal_sample_; + std::optional next_float_normal_sample_; + std::optional next_double_normal_sample_; }; namespace detail { diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index b50f0479e2fab..a922bcd5922fc 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -59,7 +59,7 @@ class TORCH_API Context { } } const AcceleratorHooksInterface& getAcceleratorHooksInterface( - c10::optional opt_device_type = c10::nullopt) { + std::optional opt_device_type = c10::nullopt) { c10::DeviceType device_type = opt_device_type.has_value() ? 
opt_device_type.value() : at::getAccelerator(true).value(); @@ -395,7 +395,7 @@ class TORCH_API Context { bool release_original_weights = false; #endif bool display_vmap_fallback_warnings_ = false; - c10::optional quantized_engine = c10::nullopt; + std::optional quantized_engine = c10::nullopt; bool enable_sparse_tensor_invariant_checks = false; bool allow_fp16_reduction_cpu = false; diff --git a/aten/src/ATen/DeviceGuard.h b/aten/src/ATen/DeviceGuard.h index adc7f3efdbb6a..6c2f57e16c8ce 100644 --- a/aten/src/ATen/DeviceGuard.h +++ b/aten/src/ATen/DeviceGuard.h @@ -15,7 +15,7 @@ namespace at { // OptionalDeviceGuard guard(device_of(tensor)); /// Return the Device of a Tensor, if the Tensor is defined. -inline c10::optional device_of(const Tensor& t) { +inline std::optional device_of(const Tensor& t) { if (t.defined()) { return c10::make_optional(t.device()); } else { @@ -23,14 +23,14 @@ inline c10::optional device_of(const Tensor& t) { } } -inline c10::optional device_of(const c10::optional& t) { +inline std::optional device_of(const c10::optional& t) { return t.has_value() ? device_of(t.value()) : c10::nullopt; } /// Return the Device of a TensorList, if the list is non-empty and /// the first Tensor is defined. (This function implicitly assumes /// that all tensors in the list have the same device.) -inline c10::optional device_of(ITensorListRef t) { +inline std::optional device_of(ITensorListRef t) { if (!t.empty()) { return device_of(t.front()); } else { diff --git a/aten/src/ATen/EmptyTensor.cpp b/aten/src/ATen/EmptyTensor.cpp index 0b35fc67b53ac..1eb5c070b547c 100644 --- a/aten/src/ATen/EmptyTensor.cpp +++ b/aten/src/ATen/EmptyTensor.cpp @@ -163,7 +163,7 @@ TensorBase _empty_generic( c10::Allocator* allocator, c10::DispatchKeySet ks, ScalarType scalar_type, - c10::optional memory_format_opt) { + std::optional memory_format_opt) { at::detail::check_size_nonnegative(size); at::detail::raise_warning_for_complex_half(scalar_type); caffe2::TypeMeta dtype = scalarTypeToTypeMeta(scalar_type); @@ -197,7 +197,7 @@ TensorBase empty_generic( c10::Allocator* allocator, c10::DispatchKeySet ks, ScalarType scalar_type, - c10::optional memory_format_opt) { + std::optional memory_format_opt) { return _empty_generic(size, allocator, ks, scalar_type, memory_format_opt); } @@ -206,7 +206,7 @@ TensorBase empty_generic_symint( c10::Allocator* allocator, c10::DispatchKeySet ks, ScalarType scalar_type, - c10::optional memory_format_opt) { + std::optional memory_format_opt) { return _empty_generic(size, allocator, ks, scalar_type, memory_format_opt); } @@ -252,7 +252,7 @@ TensorBase empty_strided_symint_generic( } TensorBase empty_cpu(IntArrayRef size, ScalarType dtype, bool pin_memory, - c10::optional memory_format_opt) { + std::optional memory_format_opt) { auto allocator = GetCPUAllocatorMaybePinned(pin_memory); constexpr c10::DispatchKeySet cpu_ks(c10::DispatchKey::CPU); return empty_generic(size, allocator, cpu_ks, dtype, memory_format_opt); @@ -260,11 +260,11 @@ TensorBase empty_cpu(IntArrayRef size, ScalarType dtype, bool pin_memory, TensorBase empty_cpu( IntArrayRef size, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt, - c10::optional memory_format_opt) { + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt, + std::optional memory_format_opt) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device_or_default(device_opt).type() == DeviceType::CPU); 
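device_of in DeviceGuard.h now returns std::optional and gains an overload lifted over an optional input. A self-contained sketch of that shape, using hypothetical Device/Tensor stand-ins rather than the real c10/ATen types:

```cpp
#include <optional>
#include <string>

// Hypothetical stand-ins for c10::Device / at::Tensor, only to show the shape
// of the device_of overloads after the std::optional migration.
struct Device { std::string str; };
struct Tensor {
  bool defined() const { return has_device; }
  Device device() const { return Device{"cuda:0"}; }
  bool has_device = true;
};

// Mirrors at::device_of(const Tensor&): empty optional for an undefined tensor.
std::optional<Device> device_of(const Tensor& t) {
  if (t.defined()) {
    return t.device();
  }
  return std::nullopt;
}

// Mirrors the overload for an optional tensor: propagate emptiness.
std::optional<Device> device_of(const std::optional<Tensor>& t) {
  return t.has_value() ? device_of(*t) : std::nullopt;
}
```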
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout_or_default(layout_opt) == Layout::Strided); @@ -295,10 +295,10 @@ TensorBase empty_strided_cpu(IntArrayRef size, IntArrayRef stride, TensorBase empty_strided_cpu( IntArrayRef size, IntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt) { + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device_or_default(device_opt).type() == DeviceType::CPU); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout_or_default(layout_opt) == Layout::Strided); @@ -342,7 +342,7 @@ static MetaAllocator g_meta_alloc; REGISTER_ALLOCATOR(kMeta, &g_meta_alloc); TensorBase empty_meta(IntArrayRef size, ScalarType dtype, - c10::optional memory_format_opt) { + std::optional memory_format_opt) { auto *allocator = GetAllocator(kMeta); constexpr c10::DispatchKeySet meta_dks(c10::DispatchKey::Meta); return at::detail::empty_generic( @@ -351,11 +351,11 @@ TensorBase empty_meta(IntArrayRef size, ScalarType dtype, TensorBase empty_meta( IntArrayRef size, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt, - c10::optional memory_format_opt + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt, + std::optional memory_format_opt ) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device_or_default(device_opt).type() == DeviceType::Meta); // NB: because there is no SparseMeta (yet), non-strided layout is @@ -371,11 +371,11 @@ TensorBase empty_meta( TensorBase empty_symint_meta( SymIntArrayRef size, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt, - c10::optional memory_format_opt + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt, + std::optional memory_format_opt ) { auto *allocator = GetAllocator(kMeta); constexpr c10::DispatchKeySet ks(c10::DispatchKey::Meta); @@ -405,10 +405,10 @@ TensorBase empty_strided_meta(IntArrayRef size, IntArrayRef stride, TensorBase empty_strided_meta( IntArrayRef size, IntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt) { + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device_or_default(device_opt).type() == DeviceType::Meta); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout_or_default(layout_opt) == Layout::Strided); @@ -440,10 +440,10 @@ TensorBase empty_strided_symint_meta(SymIntArrayRef size, SymIntArrayRef stride, TensorBase empty_strided_symint_meta( SymIntArrayRef size, SymIntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt) { + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device_or_default(device_opt).type() == DeviceType::Meta); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout_or_default(layout_opt) == Layout::Strided); diff --git a/aten/src/ATen/EmptyTensor.h b/aten/src/ATen/EmptyTensor.h index f6e2e53bc99f5..e0e304ea8e8f6 100644 --- a/aten/src/ATen/EmptyTensor.h +++ b/aten/src/ATen/EmptyTensor.h @@ -49,14 +49,14 @@ TORCH_API TensorBase empty_generic( c10::Allocator* allocator, c10::DispatchKeySet ks, 
ScalarType scalar_type, - c10::optional memory_format_opt); + std::optional memory_format_opt); TORCH_API TensorBase empty_generic_symint( SymIntArrayRef size, c10::Allocator* allocator, c10::DispatchKeySet ks, ScalarType scalar_type, - c10::optional memory_format_opt); + std::optional memory_format_opt); TORCH_API TensorBase empty_strided_generic( IntArrayRef size, @@ -76,15 +76,15 @@ TORCH_API TensorBase empty_cpu( IntArrayRef size, ScalarType dtype, bool pin_memory = false, - c10::optional memory_format_opt = c10::nullopt); + std::optional memory_format_opt = c10::nullopt); TORCH_API TensorBase empty_cpu( IntArrayRef size, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt, - c10::optional memory_format_opt); + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt, + std::optional memory_format_opt); TORCH_API TensorBase empty_cpu(IntArrayRef size, const TensorOptions& options); @@ -97,10 +97,10 @@ TORCH_API TensorBase empty_strided_cpu( TORCH_API TensorBase empty_strided_cpu( IntArrayRef size, IntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt); + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt); TORCH_API TensorBase empty_strided_cpu( IntArrayRef size, @@ -110,23 +110,23 @@ TORCH_API TensorBase empty_strided_cpu( TORCH_API TensorBase empty_meta( IntArrayRef size, ScalarType dtype, - c10::optional memory_format_opt = c10::nullopt); + std::optional memory_format_opt = c10::nullopt); TORCH_API TensorBase empty_meta( IntArrayRef size, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt, - c10::optional memory_format_opt); + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt, + std::optional memory_format_opt); TORCH_API TensorBase empty_symint_meta( SymIntArrayRef size, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt, - c10::optional memory_format_opt); + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt, + std::optional memory_format_opt); TORCH_API TensorBase empty_meta(IntArrayRef size, const TensorOptions& options); @@ -136,10 +136,10 @@ empty_strided_meta(IntArrayRef size, IntArrayRef stride, ScalarType dtype); TORCH_API TensorBase empty_strided_meta( IntArrayRef size, IntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt); + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt); TORCH_API TensorBase empty_strided_meta( IntArrayRef size, @@ -154,10 +154,10 @@ TORCH_API TensorBase empty_strided_symint_meta( TORCH_API TensorBase empty_strided_symint_meta( SymIntArrayRef size, SymIntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt); + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt); TORCH_API TensorBase empty_strided_symint_meta( SymIntArrayRef size, diff --git a/aten/src/ATen/FunctionalInverses.cpp b/aten/src/ATen/FunctionalInverses.cpp index ebc24085a74a8..c70c8bd842f9e 100644 --- 
a/aten/src/ATen/FunctionalInverses.cpp +++ b/aten/src/ATen/FunctionalInverses.cpp @@ -145,7 +145,7 @@ Tensor FunctionalInverses::_neg_view_inverse(const Tensor& base, const Tensor& m } } -Tensor FunctionalInverses::as_strided_inverse(const Tensor& base, const Tensor& mutated_view, InverseReturnMode inverse_return_mode, at::SymIntArrayRef size, at::SymIntArrayRef stride, c10::optional storage_offset) { +Tensor FunctionalInverses::as_strided_inverse(const Tensor& base, const Tensor& mutated_view, InverseReturnMode inverse_return_mode, at::SymIntArrayRef size, at::SymIntArrayRef stride, std::optional storage_offset) { if (inverse_return_mode == InverseReturnMode::AlwaysView) { // NB: assumes mutated_view is a narrowed view of base. // We should NOT do this for functionalization @@ -220,7 +220,7 @@ Tensor FunctionalInverses::lift_fresh_inverse(const Tensor& base, const Tensor& return mutated_view; } -Tensor FunctionalInverses::slice_Tensor_inverse(const Tensor& base, const Tensor& mutated_view, InverseReturnMode inverse_return_mode, int64_t dim, c10::optional start, c10::optional end, c10::SymInt step) { +Tensor FunctionalInverses::slice_Tensor_inverse(const Tensor& base, const Tensor& mutated_view, InverseReturnMode inverse_return_mode, int64_t dim, std::optional start, c10::optional end, c10::SymInt step) { if (inverse_return_mode == InverseReturnMode::AlwaysView) { // NB: assumes mutated_view is a narrowed view of base. // We should NOT do this for functionalization diff --git a/aten/src/ATen/FunctionalTensorWrapper.cpp b/aten/src/ATen/FunctionalTensorWrapper.cpp index c9ef28dbf56e4..73edec07e2623 100644 --- a/aten/src/ATen/FunctionalTensorWrapper.cpp +++ b/aten/src/ATen/FunctionalTensorWrapper.cpp @@ -526,7 +526,7 @@ Tensor to_functional_tensor(const Tensor& tensor) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!isFunctionalTensor(tensor)); return at::detail::make_tensor(tensor); } -c10::optional to_functional_tensor(const c10::optional& tensor) { +std::optional to_functional_tensor(const c10::optional& tensor) { if (tensor.has_value()) { return c10::make_optional(to_functional_tensor(*tensor)); } @@ -564,7 +564,7 @@ Tensor from_functional_tensor(const Tensor& tensor, bool assert_functional) { return tensor; } } -c10::optional from_functional_tensor(const c10::optional& t, bool assert_functional) { +std::optional from_functional_tensor(const c10::optional& t, bool assert_functional) { if (t.has_value()) { return c10::make_optional(from_functional_tensor(*t, assert_functional)); } @@ -610,7 +610,7 @@ void sync(const Tensor& t) { auto functional_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(t); functional_impl->sync_(); } -void sync(const c10::optional& t) { +void sync(const std::optional& t) { if (t.has_value()) { sync(*t); } @@ -692,7 +692,7 @@ bool isFunctionalTensor(const at::Tensor& tensor) { return tensor.unsafeGetTensorImpl()->key_set().has(c10::DispatchKey::Functionalize); } -bool isFunctionalTensor(const c10::optional& t) { +bool isFunctionalTensor(const std::optional& t) { if (t.has_value()) { return isFunctionalTensor(*t); } else { diff --git a/aten/src/ATen/FunctionalTensorWrapper.h b/aten/src/ATen/FunctionalTensorWrapper.h index 95d6afe5f0be0..6ef890b772c1c 100644 --- a/aten/src/ATen/FunctionalTensorWrapper.h +++ b/aten/src/ATen/FunctionalTensorWrapper.h @@ -286,32 +286,32 @@ TORCH_API inline FunctionalTensorWrapper* unsafeGetFunctionalWrapper( } TORCH_API bool isFunctionalTensor(const at::Tensor& tensor); -TORCH_API bool isFunctionalTensor(const c10::optional& t); 
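Several of these wrappers follow one pattern: the optional-Tensor overload applies the underlying transform when a value is present and forwards the empty optional otherwise. A generic, standalone illustration of that pattern (map_optional is a made-up helper name, not an ATen API):

```cpp
#include <iostream>
#include <optional>

// Made-up helper capturing the pattern used by the optional overloads of
// to_functional_tensor / from_functional_tensor / sync: apply f when a value
// is present, otherwise keep the optional empty.
template <typename T, typename F>
auto map_optional(const std::optional<T>& value, F&& f)
    -> std::optional<decltype(f(*value))> {
  if (value.has_value()) {
    return f(*value);
  }
  return std::nullopt;
}

int main() {
  std::optional<int> some = 3, none;
  auto doubled = map_optional(some, [](int x) { return x * 2; });      // contains 6
  auto still_empty = map_optional(none, [](int x) { return x * 2; });  // empty
  std::cout << doubled.value_or(-1) << " " << still_empty.has_value() << "\n";
}
```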
+TORCH_API bool isFunctionalTensor(const std::optional& t); TORCH_API bool isFunctionalTensor( - const c10::List>& t_list); + const c10::List>& t_list); TORCH_API bool isFunctionalTensor(ITensorListRef list); TORCH_API Tensor to_functional_tensor(const Tensor& tensor); -TORCH_API c10::optional to_functional_tensor( - const c10::optional& tensor); -TORCH_API c10::List> to_functional_tensor( - const c10::List>& t_list); +TORCH_API std::optional to_functional_tensor( + const std::optional& tensor); +TORCH_API c10::List> to_functional_tensor( + const c10::List>& t_list); TORCH_API std::vector to_functional_tensor(ITensorListRef t_list); TORCH_API void freeze_functional_tensor(const Tensor& tensor); TORCH_API Tensor from_functional_tensor(const Tensor& tensor, bool assert_functional = true); -TORCH_API c10::optional from_functional_tensor( - const c10::optional& t, +TORCH_API std::optional from_functional_tensor( + const std::optional& t, bool assert_functional = true); -TORCH_API c10::List> from_functional_tensor( - const c10::List>& t_list); +TORCH_API c10::List> from_functional_tensor( + const c10::List>& t_list); TORCH_API std::vector from_functional_tensor(ITensorListRef t_list); TORCH_API void sync(const at::Tensor& t); -TORCH_API void sync(const c10::optional& t); -TORCH_API void sync(const c10::List>& t_list); +TORCH_API void sync(const std::optional& t); +TORCH_API void sync(const c10::List>& t_list); TORCH_API void sync(ITensorListRef t_list); TORCH_API void replace_(const Tensor& functional_tensor, const Tensor& other); diff --git a/aten/src/ATen/FunctionalizeFallbackKernel.cpp b/aten/src/ATen/FunctionalizeFallbackKernel.cpp index 8b26c875fc02c..1ffc268b7f79b 100644 --- a/aten/src/ATen/FunctionalizeFallbackKernel.cpp +++ b/aten/src/ATen/FunctionalizeFallbackKernel.cpp @@ -125,7 +125,7 @@ namespace { // - when we resize to a larger size, it acts as a mutation // - when we resize to a smaller size, it acts as a view // See Note [resize_ in Functionalization] for more dtails -static const at::Tensor & resize__functionalization(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, c10::optional memory_format) { +static const at::Tensor & resize__functionalization(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, std::optional memory_format) { // First unwrap the tensor arguments at::Tensor self_; if (at::functionalization::impl::isFunctionalTensor(self)) { @@ -216,7 +216,7 @@ static at::Tensor lift_fresh_functionalize_copy(const at::Tensor & self) { // in the local include TLS. As a result, when we redispatch here, // we will end up hitting PreDispatch stack first. So, we should // directly redispatch to the functionalize key manually. - static auto op = c10::Dispatcher::singleton().findSchemaOrThrow("aten::clone", "").typed)>(); + static auto op = c10::Dispatcher::singleton().findSchemaOrThrow("aten::clone", "").typed)>(); return op.redispatch(c10::DispatchKeySet({c10::DispatchKey::Functionalize}), self, c10::nullopt); } @@ -225,7 +225,7 @@ static at::Tensor lift_fresh_functionalize_copy(const at::Tensor & self) { return at::functionalization::impl::to_functional_tensor(out); } -static bool device_opted_into_functionalization(c10::Device self_device, c10::optional tgt_device) { +static bool device_opted_into_functionalization(c10::Device self_device, std::optional tgt_device) { // If the target device is empty, then the output tensor should be on the same device as the input auto real_tgt_device = tgt_device.has_value() ? 
tgt_device.value() : self_device; return real_tgt_device.type() == c10::DeviceType::XLA || real_tgt_device.type() == c10::DeviceType::Lazy; @@ -235,12 +235,12 @@ static bool device_opted_into_functionalization(c10::Device self_device, c10::op // We should probably get rid of this though. static at::Tensor _to_copy_functionalize( const at::Tensor & self, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, bool non_blocking, - c10::optional memory_format) { + std::optional memory_format) { at::Tensor self_; if (at::functionalization::impl::isFunctionalTensor(self)) { // sync any pending updates diff --git a/aten/src/ATen/InferSize.h b/aten/src/ATen/InferSize.h index caa8ec42003c9..411cf12d51341 100644 --- a/aten/src/ATen/InferSize.h +++ b/aten/src/ATen/InferSize.h @@ -23,7 +23,7 @@ inline void infer_size_impl( ResultVec& res) { NumelType newsize = 1; // N.B. this is an index, not a sym dim! - auto infer_dim = c10::optional(); + auto infer_dim = std::optional(); for (int64_t dim = 0, ndim = shape.size(); dim != ndim; dim++) { if (shape[dim] == -1) { if (infer_dim) { diff --git a/aten/src/ATen/LegacyBatchingRegistrations.cpp b/aten/src/ATen/LegacyBatchingRegistrations.cpp index bae40e3c8e51f..e0f7fce43f9e4 100644 --- a/aten/src/ATen/LegacyBatchingRegistrations.cpp +++ b/aten/src/ATen/LegacyBatchingRegistrations.cpp @@ -380,8 +380,8 @@ Tensor select_backward_batching_rule(const Tensor& grad, IntArrayRef input_sizes Tensor slice_batching_rule( const Tensor& self, int64_t dim, - c10::optional start, - c10::optional end, + std::optional start, + std::optional end, int64_t step) { auto self_physical = MultiBatchVmapTransform::logicalToPhysical(self); auto dim_physical = self_physical.getPhysicalDim(dim); @@ -996,10 +996,10 @@ Tensor new_zeros_batching_rule( Tensor new_empty_batching_rule( const Tensor& self, IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { auto physical_view = MultiBatchVmapTransform::logicalToPhysical(self); auto physical_size = physical_view.getPhysicalShape(size); auto result = physical_view.tensor().new_empty(physical_size, TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory)); @@ -1209,10 +1209,10 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) { BINARY_POINTWISE(mul); BINARY_POINTWISE(div); { - using Binop = Tensor (*)(const Tensor&, const Tensor&, c10::optional); - using Unop = Tensor (*)(const Tensor&, const Scalar&, c10::optional); - m.impl("div.Tensor_mode", binary_pointwise_batching_rule>); - m.impl("div.Scalar_mode", unwrap_and_call>); + using Binop = Tensor (*)(const Tensor&, const Tensor&, std::optional); + using Unop = Tensor (*)(const Tensor&, const Scalar&, std::optional); + m.impl("div.Tensor_mode", binary_pointwise_batching_rule>); + m.impl("div.Scalar_mode", unwrap_and_call>); } // at::pow has three out-of-place overloads diff --git a/aten/src/ATen/NamedTensorUtils.cpp b/aten/src/ATen/NamedTensorUtils.cpp index a76156c03402d..3e66ef7f74dea 100644 --- a/aten/src/ATen/NamedTensorUtils.cpp +++ b/aten/src/ATen/NamedTensorUtils.cpp @@ -128,7 +128,7 @@ static void assert_names_equal(DimnameList a, DimnameList b) { } const Tensor& propagate_names_if_present_and_nonempty(const Tensor& result, - c10::optional maybe_names, + 
std::optional maybe_names, bool validate_names) { auto maybe_name_list = maybe_names.value_or(at::ArrayRef{}); propagate_names_if_nonempty(result.unsafeGetTensorImpl(), maybe_name_list, validate_names); diff --git a/aten/src/ATen/NamedTensorUtils.h b/aten/src/ATen/NamedTensorUtils.h index c1443b7eaa01b..47dcd6dd76851 100644 --- a/aten/src/ATen/NamedTensorUtils.h +++ b/aten/src/ATen/NamedTensorUtils.h @@ -81,7 +81,7 @@ namespace namedinference { const Tensor& propagate_names_if_present_and_nonempty( const Tensor& result, - c10::optional maybe_names, + std::optional maybe_names, bool validate_names = false); // Propagates `names` to `result` if `names` is not empty. // `names` can be empty; see [NOTE] Writing name inference rules diff --git a/aten/src/ATen/NestedTensorImpl.cpp b/aten/src/ATen/NestedTensorImpl.cpp index 2f73b7b304ee3..534e4e71e657f 100644 --- a/aten/src/ATen/NestedTensorImpl.cpp +++ b/aten/src/ATen/NestedTensorImpl.cpp @@ -236,7 +236,7 @@ NestedTensorImpl::NestedTensorImpl( set_custom_sizes_strides(c10::TensorImpl::SizesStridesPolicy::CustomSizes); } -c10::optional NestedTensorImpl::opt_size(int64_t d) const { +std::optional NestedTensorImpl::opt_size(int64_t d) const { if (C10_UNLIKELY(!opt_sizes_.has_value())) { // Cache the metadata to avoid recomputing it each time. opt_sizes_ = c10::make_optional(construct_opt_sizes(nested_sizes_)); diff --git a/aten/src/ATen/NestedTensorImpl.h b/aten/src/ATen/NestedTensorImpl.h index 0bd3d98e73c5c..697969edbbd44 100644 --- a/aten/src/ATen/NestedTensorImpl.h +++ b/aten/src/ATen/NestedTensorImpl.h @@ -61,10 +61,10 @@ struct TORCH_API NestedTensorImpl : public c10::TensorImpl { // Returns nullopt if the ith dimension is irregular. The ith dimension // of a NestedTensor is regular if the unbound tensors match in // size at the (i-1)th dimension. - c10::optional opt_size(int64_t d) const; + std::optional opt_size(int64_t d) const; int64_t size(int64_t d) const { - c10::optional optional_size = this->opt_size(d); + std::optional optional_size = this->opt_size(d); TORCH_CHECK( optional_size.has_value(), "Given dimension ", @@ -171,7 +171,7 @@ struct TORCH_API NestedTensorImpl : public c10::TensorImpl { // Optional to allow it to be computed lazily from nested. 
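NestedTensorImpl::opt_size returns std::optional<int64_t>: a concrete size when every nested constituent agrees on that dimension, nullopt when the dimension is ragged, with the answers cached lazily. A simplified standalone sketch of the regular-vs-irregular check, omitting the real caching and the (i-1)-dim indexing details:

```cpp
#include <cstdint>
#include <optional>
#include <vector>

// Simplified illustration of the opt_size idea: a per-dimension size is only
// defined when all nested constituents agree on it; otherwise return nullopt.
struct RaggedSizes {
  std::vector<std::vector<int64_t>> per_tensor_sizes;  // one entry per nested tensor

  std::optional<int64_t> opt_size(size_t d) const {
    if (per_tensor_sizes.empty() || d >= per_tensor_sizes.front().size()) {
      return std::nullopt;
    }
    int64_t candidate = per_tensor_sizes.front()[d];
    for (const auto& sizes : per_tensor_sizes) {
      if (d >= sizes.size() || sizes[d] != candidate) {
        return std::nullopt;  // irregular along this dimension
      }
    }
    return candidate;
  }
};
```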
// TODO: maybe we can remove this metadata since // we can compute it from `nested_sizes_` - mutable c10::optional> opt_sizes_; + mutable std::optional> opt_sizes_; template c10::intrusive_ptr shallow_copy_and_detach_core( diff --git a/aten/src/ATen/SavedTensorHooks.cpp b/aten/src/ATen/SavedTensorHooks.cpp index c1c963409f40e..f2fb0642eb34c 100644 --- a/aten/src/ATen/SavedTensorHooks.cpp +++ b/aten/src/ATen/SavedTensorHooks.cpp @@ -35,7 +35,7 @@ void SavedTensorDefaultHooks::enable() { tls.disabled_error_message = c10::nullopt; } -const c10::optional& SavedTensorDefaultHooks::get_disabled_error_message() { +const std::optional& SavedTensorDefaultHooks::get_disabled_error_message() { return tls.disabled_error_message; } diff --git a/aten/src/ATen/SavedTensorHooks.h b/aten/src/ATen/SavedTensorHooks.h index af821cb908c6a..6ad46a8334c3f 100644 --- a/aten/src/ATen/SavedTensorHooks.h +++ b/aten/src/ATen/SavedTensorHooks.h @@ -21,7 +21,7 @@ struct TORCH_API SavedTensorDefaultHooksTLS { // disabled_error_message is nullopt IFF Saved Tensor hooks is enabled // We did this for efficiency (so we didn't have to keep a separate bool // around) - c10::optional disabled_error_message; + std::optional disabled_error_message; }; } // namespace impl @@ -46,7 +46,7 @@ struct TORCH_API SavedTensorDefaultHooks { static void disable(const std::string& error_message); static void enable(); static bool is_enabled(); - static const c10::optional& get_disabled_error_message(); + static const std::optional& get_disabled_error_message(); }; } // namespace at diff --git a/aten/src/ATen/ScalarOps.cpp b/aten/src/ATen/ScalarOps.cpp index 13a1754fa53a1..f931af0ad445e 100644 --- a/aten/src/ATen/ScalarOps.cpp +++ b/aten/src/ATen/ScalarOps.cpp @@ -23,7 +23,7 @@ Tensor& scalar_fill(Tensor& self, const Scalar& value) { return self; } -Tensor scalar_tensor_static(const Scalar& s, c10::optional dtype_opt, c10::optional device_opt) { +Tensor scalar_tensor_static(const Scalar& s, std::optional dtype_opt, c10::optional device_opt) { at::tracer::impl::NoTracerDispatchMode tracer_guard; at::AutoDispatchBelowAutograd mode; Tensor result = at::detail::empty_cpu( diff --git a/aten/src/ATen/ScalarOps.h b/aten/src/ATen/ScalarOps.h index 943ac161d4c18..ed591955dd876 100644 --- a/aten/src/ATen/ScalarOps.h +++ b/aten/src/ATen/ScalarOps.h @@ -18,8 +18,8 @@ namespace at::detail { Tensor& scalar_fill(Tensor& self, const Scalar& value); TORCH_API Tensor scalar_tensor_static( const Scalar& s, - c10::optional dtype_opt, - c10::optional device_opt); + std::optional dtype_opt, + std::optional device_opt); } // namespace at::detail // This is in the c10 namespace because we use ADL to find the functions in it. 
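The SavedTensorDefaultHooks TLS keeps the enabled/disabled state and the error message in a single std::optional<std::string> (nullopt iff enabled), avoiding a separate bool, as the comment in the diff notes. A small standalone sketch of that idiom (FeatureGate is an illustrative name, not a PyTorch class):

```cpp
#include <optional>
#include <stdexcept>
#include <string>
#include <utility>

// One optional doubles as the enabled/disabled flag and the error payload:
// empty means enabled, a stored message means disabled with that reason.
class FeatureGate {
 public:
  void disable(std::string message) { disabled_message_ = std::move(message); }
  void enable() { disabled_message_ = std::nullopt; }
  bool is_enabled() const { return !disabled_message_.has_value(); }
  void check() const {
    if (disabled_message_.has_value()) {
      throw std::runtime_error(*disabled_message_);
    }
  }

 private:
  std::optional<std::string> disabled_message_;
};
```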
diff --git a/aten/src/ATen/TensorIndexing.h b/aten/src/ATen/TensorIndexing.h index eb29b4d5ad739..b2ef33ffc058d 100644 --- a/aten/src/ATen/TensorIndexing.h +++ b/aten/src/ATen/TensorIndexing.h @@ -39,9 +39,9 @@ TORCH_API extern const EllipsisIndexType Ellipsis; struct TORCH_API Slice final { public: Slice( - c10::optional start_index = c10::nullopt, - c10::optional stop_index = c10::nullopt, - c10::optional step_index = c10::nullopt) { + std::optional start_index = c10::nullopt, + std::optional stop_index = c10::nullopt, + std::optional step_index = c10::nullopt) { if (!step_index.has_value()) { step_ = c10::SymInt(1); } else { @@ -205,7 +205,7 @@ static inline Tensor applySlice( c10::SymInt step, bool disable_slice_optimization, const at::Device& self_device, - const c10::optional& self_sizes) { + const std::optional& self_sizes) { // TODO: implement negative step TORCH_CHECK_VALUE(step > 0, "step must be greater than zero"); @@ -233,7 +233,7 @@ static inline Tensor applySelect( SymInt index, int64_t real_dim, const at::Device& /*self_device*/, - const c10::optional& self_sizes) { + const std::optional& self_sizes) { // See NOTE [nested tensor size for indexing] if (self_sizes.has_value()) { auto maybe_index = index.maybe_as_int(); @@ -431,7 +431,7 @@ static inline Tensor handleDimInMultiDimIndexing( std::vector& outIndices, bool disable_slice_optimization, const at::Device& original_tensor_device, - const c10::optional& prev_dim_result_sizes) { + const std::optional& prev_dim_result_sizes) { if (index.is_integer()) { return impl::applySelect( prev_dim_result, @@ -515,7 +515,7 @@ static inline Tensor applySlicing( std::vector& outIndices, bool disable_slice_optimization, const at::Device& self_device, - const c10::optional& self_sizes) { + const std::optional& self_sizes) { int64_t dim = 0; int64_t specified_dims = impl::count_specified_dimensions(indices); @@ -531,9 +531,9 @@ static inline Tensor applySlicing( for (const auto i : c10::irange(indices.size())) { auto& obj = indices[i]; // See NOTE [nested tensor size for indexing] - c10::optional result_sizes = result.is_nested() - ? c10::optional(c10::nullopt) - : c10::optional(result.sym_sizes()); + std::optional result_sizes = result.is_nested() + ? std::optional(c10::nullopt) + : std::optional(result.sym_sizes()); result = handleDimInMultiDimIndexing( /*prev_dim_result=*/result, /*original_tensor=*/self, @@ -607,9 +607,9 @@ static inline Tensor get_item( // nested tensor does not have a size (yet) so for now we represent its size // as null may need to be changed after we reach a better solution for nested // tensor size - c10::optional self_sizes = self.is_nested() - ? c10::optional(c10::nullopt) - : c10::optional(self.sym_sizes()); + std::optional self_sizes = self.is_nested() + ? std::optional(c10::nullopt) + : std::optional(self.sym_sizes()); // handle simple types: integers, slices, none, ellipsis, bool if (indices.size() == 1) { diff --git a/aten/src/ATen/TensorIterator.h b/aten/src/ATen/TensorIterator.h index a241244a5744c..fb61ca65146a3 100644 --- a/aten/src/ATen/TensorIterator.h +++ b/aten/src/ATen/TensorIterator.h @@ -147,7 +147,7 @@ struct TORCH_API OperandInfo { /// promotion target_dtype value can become different from tensor's dtype /// also, during type promotion target_dtype and device can be set for an /// undefined tensor so that tensor can be properly constructed later. 
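TensorIndexing's Slice constructor takes optional start/stop/step arguments and resolves them to concrete values up front (the diff shows the step branch defaulting to 1). A simplified standalone version using plain int64_t instead of c10::SymInt, and ignoring the negative-step handling of the real class:

```cpp
#include <cstdint>
#include <limits>
#include <optional>

// Simplified take on Slice: optional indices are resolved to concrete defaults
// at construction time (step -> 1, start -> 0, stop -> INT64_MAX), so the rest
// of the indexing code never re-checks has_value().
struct SimpleSlice {
  int64_t start;
  int64_t stop;
  int64_t step;

  explicit SimpleSlice(
      std::optional<int64_t> start_index = std::nullopt,
      std::optional<int64_t> stop_index = std::nullopt,
      std::optional<int64_t> step_index = std::nullopt)
      : start(start_index.value_or(0)),
        stop(stop_index.value_or(std::numeric_limits<int64_t>::max())),
        step(step_index.value_or(1)) {}
};
```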
- c10::optional device = c10::nullopt; + std::optional device = c10::nullopt; ScalarType target_dtype = ScalarType::Undefined; // Caches dtype of the tensor, because scalar_type is an expensive operation // If dtype of the tensor is changed (e.g. as a result of type promotion or in @@ -971,9 +971,9 @@ class TORCH_API TensorIteratorConfig final { int num_outputs_ = 0; int num_inputs_ = 0; - c10::optional static_shape_ = c10::nullopt; - c10::optional static_dtype_ = c10::nullopt; - c10::optional static_device_ = c10::nullopt; + std::optional static_shape_ = c10::nullopt; + std::optional static_dtype_ = c10::nullopt; + std::optional static_device_ = c10::nullopt; bool check_mem_overlap_ = true; bool allow_cpu_scalars_ = false; bool is_reduction_ = false; diff --git a/aten/src/ATen/TensorSubclassLikeUtils.h b/aten/src/ATen/TensorSubclassLikeUtils.h index a9a0b4ecdcf8b..10c26dfe35eca 100644 --- a/aten/src/ATen/TensorSubclassLikeUtils.h +++ b/aten/src/ATen/TensorSubclassLikeUtils.h @@ -61,7 +61,7 @@ inline bool areAnyTensorSubclassLike(TensorList tensors) { } inline bool areAnyOptionalTensorSubclassLike( - const c10::List>& tensors) { + const c10::List>& tensors) { if (c10::impl::dispatch_mode_enabled()) return true; return std::any_of( diff --git a/aten/src/ATen/TensorUtils.cpp b/aten/src/ATen/TensorUtils.cpp index e425a0a8ed130..14e81d6504179 100644 --- a/aten/src/ATen/TensorUtils.cpp +++ b/aten/src/ATen/TensorUtils.cpp @@ -327,7 +327,7 @@ std::vector defaultStrides(IntArrayRef sizes) { // see overloads of computeStride() below. // template -inline c10::optional computeStride_impl( +inline std::optional computeStride_impl( const NewShapeVec& oldshape, const NewShapeVec& oldstride, const NewShapeVec& newshape, @@ -395,7 +395,7 @@ inline c10::optional computeStride_impl( return newstride; } -c10::optional> computeStride( +std::optional> computeStride( IntArrayRef oldshape, IntArrayRef oldstride, IntArrayRef newshape) { @@ -403,7 +403,7 @@ c10::optional> computeStride( return computeStride_impl, IntArrayRef, int64_t>(oldshape, oldstride, newshape, toResult); } -c10::optional computeStride( +std::optional computeStride( c10::SymIntArrayRef oldshape, c10::SymIntArrayRef oldstride, c10::SymIntArrayRef newshape) { @@ -411,7 +411,7 @@ c10::optional computeStride( return computeStride_impl(oldshape, oldstride, newshape, toResult); } -c10::optional computeStride( +std::optional computeStride( IntArrayRef oldshape, IntArrayRef oldstride, const DimVector& newshape) { diff --git a/aten/src/ATen/TensorUtils.h b/aten/src/ATen/TensorUtils.h index 4615ab50606ee..4a81dc280e242 100644 --- a/aten/src/ATen/TensorUtils.h +++ b/aten/src/ATen/TensorUtils.h @@ -171,17 +171,17 @@ TORCH_API void check_dim_size( namespace detail { TORCH_API std::vector defaultStrides(IntArrayRef sizes); -TORCH_API c10::optional> computeStride( +TORCH_API std::optional> computeStride( IntArrayRef oldshape, IntArrayRef oldstride, IntArrayRef newshape); -TORCH_API c10::optional computeStride( +TORCH_API std::optional computeStride( c10::SymIntArrayRef oldshape, c10::SymIntArrayRef oldstride, c10::SymIntArrayRef newshape); -TORCH_API c10::optional computeStride( +TORCH_API std::optional computeStride( IntArrayRef oldshape, IntArrayRef oldstride, const DimVector& newshape); diff --git a/aten/src/ATen/VmapModeRegistrations.cpp b/aten/src/ATen/VmapModeRegistrations.cpp index ab4556c8c4155..3b6198778a353 100644 --- a/aten/src/ATen/VmapModeRegistrations.cpp +++ b/aten/src/ATen/VmapModeRegistrations.cpp @@ -39,7 +39,7 @@ TORCH_LIBRARY_IMPL(aten, 
VmapMode, m) { // CppFunction::makeNamedNotSupported() to avoid listing out the types of everything. // However, registering e.g. CppFunction::makeNamedNotSupported() as an implementation // only works for operators that support boxing. -#define TENSOROPTIONS c10::optional, c10::optional, c10::optional, c10::optional +#define TENSOROPTIONS std::optional, c10::optional, c10::optional, c10::optional // random operations (out-of-place) m.impl("bernoulli", unsupportedRandomOp>); diff --git a/aten/src/ATen/ZeroTensorFallback.cpp b/aten/src/ATen/ZeroTensorFallback.cpp index bc012f8cde909..329216cf3789f 100644 --- a/aten/src/ATen/ZeroTensorFallback.cpp +++ b/aten/src/ATen/ZeroTensorFallback.cpp @@ -16,7 +16,7 @@ namespace at { const auto num_arguments = arguments.size(); const auto stack_start = stack->size() - num_arguments; - c10::optional is_write; + std::optional is_write; for (const auto i : c10::irange(num_arguments)) { const auto& alias_info = arguments[i].alias_info(); if (alias_info != nullptr) { diff --git a/aten/src/ATen/autocast_mode.cpp b/aten/src/ATen/autocast_mode.cpp index c233f17b44580..2d01bdeca500b 100644 --- a/aten/src/ATen/autocast_mode.cpp +++ b/aten/src/ATen/autocast_mode.cpp @@ -144,7 +144,7 @@ Tensor cached_cast(at::ScalarType to_type, const Tensor& arg, DeviceType device_ Banned functions *******************************/ -static Tensor binary_cross_entropy_banned(const Tensor &, const Tensor &, const c10::optional&, int64_t) { +static Tensor binary_cross_entropy_banned(const Tensor &, const Tensor &, const std::optional&, int64_t) { AT_ERROR("torch.nn.functional.binary_cross_entropy and torch.nn.BCELoss are unsafe to autocast.\n" "Many models use a sigmoid layer right before the binary cross entropy layer.\n" "In this case, combine the two layers using torch.nn.functional.binary_cross_entropy_with_logits\n" diff --git a/aten/src/ATen/autocast_mode.h b/aten/src/ATen/autocast_mode.h index 59a91848a5175..c36030db5b048 100644 --- a/aten/src/ATen/autocast_mode.h +++ b/aten/src/ATen/autocast_mode.h @@ -297,9 +297,9 @@ TORCH_API Tensor cached_cast( c10::DeviceType device_type = c10::DeviceType::CUDA); // Overload to process optional -inline c10::optional cached_cast( +inline std::optional cached_cast( at::ScalarType to_type, - const c10::optional& arg, + const std::optional& arg, c10::DeviceType device_type = c10::DeviceType::CUDA) { if (arg.has_value()) { return cached_cast(to_type, *arg, device_type); @@ -353,9 +353,9 @@ Otherwise, set it to the autocast type. ********************************************************/ // Overload to catch dtype flags -c10::optional inline set_opt_dtype( +std::optional inline set_opt_dtype( at::ScalarType to_type, - const c10::optional& dtype) { + const std::optional& dtype) { return dtype.has_value() ? dtype : to_type; } @@ -392,7 +392,7 @@ enum class CastPolicy : uint8_t { fp32, // Cast all inputs to at::kFloat before running the op. fp32_set_opt_dtype, // Treats functions (like softmax) that // 1. we'd like to run in fp32 and - // 2. have a c10::optional arg that controls + // 2. have a std::optional arg that controls // the output type. // fp32_set_opt_dtype wrappers' policy is: if the output // type is already set, don't touch it, otherwise, set @@ -865,24 +865,24 @@ copy pasted in from VariableTypeEverything.cpp with appropriate substitutions. 
_(ADD_NS(norm), \ "norm.Scalar", \ Tensor(const Tensor&, const Scalar&), \ - Tensor(const Tensor&, const c10::optional&, ScalarType), \ + Tensor(const Tensor&, const std::optional&, ScalarType), \ fp32_append_dtype) \ _(ADD_NS(norm), \ "norm.ScalarOpt_dim", \ - Tensor(const Tensor&, const c10::optional&, IntArrayRef, bool), \ + Tensor(const Tensor&, const std::optional&, IntArrayRef, bool), \ Tensor( \ const Tensor&, \ - const c10::optional&, \ + const std::optional&, \ IntArrayRef, \ bool, \ ScalarType), \ fp32_append_dtype) \ _(ADD_NS(norm), \ "norm.names_ScalarOpt_dim", \ - Tensor(const Tensor&, const c10::optional&, DimnameList, bool), \ + Tensor(const Tensor&, const std::optional&, DimnameList, bool), \ Tensor( \ const Tensor&, \ - const c10::optional&, \ + const std::optional&, \ DimnameList, \ bool, \ ScalarType), \ @@ -895,6 +895,7 @@ copy pasted in from VariableTypeEverything.cpp with appropriate substitutions. _(bilinear) \ _(cross) \ _(dot) \ + _(vdot) \ _(grid_sampler) \ _(index_put) \ _(tensordot) \ diff --git a/aten/src/ATen/core/CachingHostAllocator.h b/aten/src/ATen/core/CachingHostAllocator.h index d04cb1c6b8a70..449f8d743157b 100644 --- a/aten/src/ATen/core/CachingHostAllocator.h +++ b/aten/src/ATen/core/CachingHostAllocator.h @@ -152,7 +152,7 @@ struct CachingHostAllocatorImpl { // do not need to look up the ctx in blocks_. auto* block = reinterpret_cast(ctx); - c10::optional> events; + std::optional> events; { std::lock_guard g(block->mutex_); block->allocated_ = false; @@ -263,7 +263,7 @@ struct CachingHostAllocatorImpl { // Avoid calling cudaEventDestroy while holding a mutex, so move // intermediate events out of the lock into this object. // process the last event - c10::optional> processed; + std::optional> processed; { std::lock_guard g(events_mutex_); if (!events_.empty()) { @@ -324,7 +324,7 @@ struct CachingHostAllocatorImpl { } // Record an event on stream and store event into events. 
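The two std::optional<std::vector<...>> locals changed above exist so that pending events can be moved out from under the allocator's mutex and then processed or destroyed without holding the lock. A generic sketch of that idiom, using only standard types (take_pending is an illustrative helper, not the ATen API):

#include <mutex>
#include <optional>
#include <vector>

template <typename Event>
std::optional<std::vector<Event>> take_pending(std::mutex& m,
                                               std::vector<Event>& pending) {
  std::lock_guard<std::mutex> g(m);
  if (pending.empty()) {
    return std::nullopt;
  }
  std::optional<std::vector<Event>> out(std::in_place);
  out->swap(pending);  // contents are now handled outside the lock
  return out;
}

The virtual record_stream hook declared next follows the same convention: it receives the optional event list by reference and may populate it.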
- virtual void record_stream(c10::optional>& events, S stream) { + virtual void record_stream(std::optional>& events, S stream) { TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for record_stream"); } diff --git a/aten/src/ATen/core/CheckMemoryFormat.h b/aten/src/ATen/core/CheckMemoryFormat.h index 442889e2eec6f..8add9509f4d5f 100644 --- a/aten/src/ATen/core/CheckMemoryFormat.h +++ b/aten/src/ATen/core/CheckMemoryFormat.h @@ -2,10 +2,10 @@ namespace c10::impl { -inline c10::optional +inline std::optional check_tensor_options_and_extract_memory_format( const TensorOptions& options, - c10::optional memory_format) { + std::optional memory_format) { TORCH_CHECK( options.requires_grad_opt() == c10::nullopt || options.requires_grad_opt().value() == false, diff --git a/aten/src/ATen/core/DeprecatedTypeProperties.cpp b/aten/src/ATen/core/DeprecatedTypeProperties.cpp index 15231f965aefd..a97a6828571e7 100644 --- a/aten/src/ATen/core/DeprecatedTypeProperties.cpp +++ b/aten/src/ATen/core/DeprecatedTypeProperties.cpp @@ -14,7 +14,7 @@ Storage DeprecatedTypeProperties::unsafeStorageFromTH(void * th_pointer, bool re return at::unsafeStorageFromTH(th_pointer, retain); } -Tensor DeprecatedTypeProperties::copy(const Tensor & src, bool non_blocking, c10::optional to_device) const { +Tensor DeprecatedTypeProperties::copy(const Tensor & src, bool non_blocking, std::optional to_device) const { if (to_device) { return src.to(src.options().dtype(scalarType()).device(to_device), non_blocking, /*copy=*/true); } diff --git a/aten/src/ATen/core/DeprecatedTypeProperties.h b/aten/src/ATen/core/DeprecatedTypeProperties.h index 222465eac56f2..a945761e8ff97 100644 --- a/aten/src/ATen/core/DeprecatedTypeProperties.h +++ b/aten/src/ATen/core/DeprecatedTypeProperties.h @@ -107,7 +107,7 @@ class TORCH_API DeprecatedTypeProperties { /// Constructs the `TensorOptions` from a type and a Device. Asserts that /// the device type matches the device type of the type. 
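The overload declared below takes the device as std::optional<Device> and does little more than the usual has_value()/dereference dance before delegating. A reduced sketch of that dispatch (resolve_device_index is an illustrative name, not the real member):

#include <optional>
#include <c10/core/Device.h>
#include <c10/util/Exception.h>

int16_t resolve_device_index(std::optional<c10::Device> device_opt,
                             c10::DeviceType expected_type) {
  if (!device_opt.has_value()) {
    return -1;  // sentinel for "current device", matching the integer overload
  }
  TORCH_CHECK(device_opt->type() == expected_type, "device type mismatch");
  return device_opt->index();
}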
- TensorOptions options(c10::optional device_opt) const { + TensorOptions options(std::optional device_opt) const { if (!device_opt.has_value()) { return options(-1); } else { @@ -129,7 +129,7 @@ class TORCH_API DeprecatedTypeProperties { Tensor unsafeTensorFromTH(void * th_pointer, bool retain) const; Storage unsafeStorageFromTH(void * th_pointer, bool retain) const; - Tensor copy(const Tensor & src, bool non_blocking=false, c10::optional to_device={}) const; + Tensor copy(const Tensor & src, bool non_blocking=false, std::optional to_device={}) const; private: Backend backend_; diff --git a/aten/src/ATen/core/Dimname.h b/aten/src/ATen/core/Dimname.h index e53db14732c89..d3bc5a45abb7a 100644 --- a/aten/src/ATen/core/Dimname.h +++ b/aten/src/ATen/core/Dimname.h @@ -21,7 +21,7 @@ struct TORCH_API Dimname { bool isWildcard() const { return type_ == NameType::WILDCARD; } bool matches(Dimname other) const; - c10::optional unify(Dimname other) const; + std::optional unify(Dimname other) const; private: Dimname(Symbol name) diff --git a/aten/src/ATen/core/DistributionsHelper.h b/aten/src/ATen/core/DistributionsHelper.h index 8b399510e94aa..a46608200e5b9 100644 --- a/aten/src/ATen/core/DistributionsHelper.h +++ b/aten/src/ATen/core/DistributionsHelper.h @@ -144,7 +144,7 @@ template next_##TYPE##_normal_sample()) { \ *ret = *(generator->next_##TYPE##_normal_sample()); \ - generator->set_next_##TYPE##_normal_sample(c10::optional()); \ + generator->set_next_##TYPE##_normal_sample(std::optional()); \ return true; \ } \ return false; \ diff --git a/aten/src/ATen/core/Generator.h b/aten/src/ATen/core/Generator.h index b237c571b22d3..6b76db5d06864 100644 --- a/aten/src/ATen/core/Generator.h +++ b/aten/src/ATen/core/Generator.h @@ -150,7 +150,7 @@ Generator make_generator(Args&&... args) { * the backend generator type (CPU/CUDAGeneratorImpl etc.) */ template -static inline T * check_generator(c10::optional gen) { +static inline T * check_generator(std::optional gen) { TORCH_CHECK(gen.has_value(), "Expected Generator but received nullopt"); TORCH_CHECK(gen->defined(), "Generator with undefined implementation is not allowed"); TORCH_CHECK(T::device_type() == gen->device().type(), "Expected a '", T::device_type(), "' device type for generator but found '", gen->device().type(), "'"); @@ -164,7 +164,7 @@ static inline T * check_generator(c10::optional gen) { * the backend generator type (CPU/CUDAGeneratorImpl etc.) */ template -static inline T* get_generator_or_default(const c10::optional& gen, const Generator& default_gen) { +static inline T* get_generator_or_default(const std::optional& gen, const Generator& default_gen) { return gen.has_value() && gen->defined() ? 
check_generator(gen) : check_generator(default_gen); } diff --git a/aten/src/ATen/core/GeneratorForPrivateuseone.cpp b/aten/src/ATen/core/GeneratorForPrivateuseone.cpp index 1e8d8daa9fc8f..35b1dd9fdd4eb 100644 --- a/aten/src/ATen/core/GeneratorForPrivateuseone.cpp +++ b/aten/src/ATen/core/GeneratorForPrivateuseone.cpp @@ -5,8 +5,8 @@ namespace at { static std::mutex _generator_mutex_lock; -c10::optional& GetGeneratorPrivate() { - static c10::optional generator_privateuse1 = c10::nullopt; +std::optional& GetGeneratorPrivate() { + static std::optional generator_privateuse1 = c10::nullopt; return generator_privateuse1; } diff --git a/aten/src/ATen/core/GeneratorForPrivateuseone.h b/aten/src/ATen/core/GeneratorForPrivateuseone.h index 9b84f162a7652..747c77897ff9b 100644 --- a/aten/src/ATen/core/GeneratorForPrivateuseone.h +++ b/aten/src/ATen/core/GeneratorForPrivateuseone.h @@ -7,7 +7,7 @@ namespace at { using GeneratorFuncType = std::function; -c10::optional& GetGeneratorPrivate(); +std::optional& GetGeneratorPrivate(); class TORCH_API _GeneratorRegister { public: diff --git a/aten/src/ATen/core/List.h b/aten/src/ATen/core/List.h index 68ecf5ed343f8..53560b9666ae3 100644 --- a/aten/src/ATen/core/List.h +++ b/aten/src/ATen/core/List.h @@ -58,10 +58,10 @@ struct ListElementConstReferenceTraits { using const_reference = typename c10::detail::ivalue_to_const_ref_overload_return::type; }; -// There is no to() overload for c10::optional. +// There is no to() overload for std::optional. template<> -struct ListElementConstReferenceTraits> { - using const_reference = c10::optional>; +struct ListElementConstReferenceTraits> { + using const_reference = std::optional>; }; template diff --git a/aten/src/ATen/core/List_inl.h b/aten/src/ATen/core/List_inl.h index f8ce73eb3f9cc..64760b5f782b4 100644 --- a/aten/src/ATen/core/List_inl.h +++ b/aten/src/ATen/core/List_inl.h @@ -168,8 +168,8 @@ list_element_to_const_ref(const IValue& element) { } template<> -inline typename ListElementConstReferenceTraits>::const_reference -list_element_to_const_ref>(const IValue& element) { +inline typename ListElementConstReferenceTraits>::const_reference +list_element_to_const_ref>(const IValue& element) { return element.toOptionalStringRef(); } diff --git a/aten/src/ATen/core/List_test.cpp b/aten/src/ATen/core/List_test.cpp index 56da3cf299e90..808cbe2d8b63a 100644 --- a/aten/src/ATen/core/List_test.cpp +++ b/aten/src/ATen/core/List_test.cpp @@ -1127,13 +1127,13 @@ TEST(ListTest, canAccessStringByReference) { } TEST(ListTest, canAccessOptionalStringByReference) { - List> list({"one", "two", c10::nullopt}); + List> list({"one", "two", c10::nullopt}); const auto& listRef = list; static_assert( - std::is_same_v>>, - "List> access should be by const reference"); - c10::optional str1 = list[1]; - c10::optional str2 = list[2]; + std::is_same_v>>, + "List> access should be by const reference"); + std::optional str1 = list[1]; + std::optional str2 = list[2]; decltype(auto) strRef1 = listRef[1]; decltype(auto) strRef2 = listRef[2]; // NOLINTNEXTLINE(bugprone-unchecked-optional-access) diff --git a/aten/src/ATen/core/NamedTensor.h b/aten/src/ATen/core/NamedTensor.h index d6ff30ce00838..7eed27e4f1a61 100644 --- a/aten/src/ATen/core/NamedTensor.h +++ b/aten/src/ATen/core/NamedTensor.h @@ -100,7 +100,7 @@ void check_names_valid_for(const TensorBase& tensor, DimnameList names); void check_names_valid_for(size_t tensor_dim, DimnameList names); // Sets the names of `tensor` to be `names`. 
-TORCH_API const TensorBase& internal_set_names_inplace(const TensorBase& tensor, c10::optional names); +TORCH_API const TensorBase& internal_set_names_inplace(const TensorBase& tensor, std::optional names); TORCH_API const TensorBase& internal_set_names_inplace(const TensorBase& tensor, std::vector&& names, bool validate_names); constexpr size_t kMaxNamedTensorDim = 64; @@ -111,7 +111,7 @@ namespace impl { // Some helper functions on TensorImpl. Useful for working with names in TH. // XXX: Ideally these would exist as methods on TensorImpl -TORCH_API void internal_set_names_inplace(TensorImpl* impl, c10::optional names, bool validate_names); +TORCH_API void internal_set_names_inplace(TensorImpl* impl, std::optional names, bool validate_names); TORCH_API void internal_set_names_inplace(TensorImpl* impl, std::vector&& names, bool validate_names); void check_names_valid_for(TensorImpl* impl, DimnameList names); @@ -132,7 +132,7 @@ TORCH_API DimnameList get_names(const TensorImpl* impl); // Returns the names of the tensor if they have been allocated; returns nullopt // instead if the haven't been. The names of a tensor are not allocated if a // tensor is constructed with names=None. -TORCH_API c10::optional get_opt_names(const TensorImpl* impl); +TORCH_API std::optional get_opt_names(const TensorImpl* impl); } // namespace impl diff --git a/aten/src/ATen/core/NestedIntSymNodeImpl.cpp b/aten/src/ATen/core/NestedIntSymNodeImpl.cpp index b703f76773b46..7cdc7aa2cbe8f 100644 --- a/aten/src/ATen/core/NestedIntSymNodeImpl.cpp +++ b/aten/src/ATen/core/NestedIntSymNodeImpl.cpp @@ -7,7 +7,7 @@ namespace c10 { namespace { bool _eq(const char* op, c10::SymNodeImpl* lhs, c10::SymNodeImpl* rhs) { TORCH_INTERNAL_ASSERT(lhs->is_nested_int()); - c10::optional c = rhs->nested_int(); + std::optional c = rhs->nested_int(); return ( c.has_value() && lhs->nested_int() == *c && lhs->nested_int_coeff() == rhs->nested_int_coeff()); @@ -68,7 +68,7 @@ c10::SymNode NestedIntSymNodeImpl::le(const c10::SymNode& other) { c10::SymNode NestedIntSymNodeImpl::mul(const c10::SymNode& other) { TORCH_CHECK(!other->nested_int(), "nested int cannot be multiplied by nested int"); - c10::optional c = other->constant_int(); + std::optional c = other->constant_int(); TORCH_CHECK(c.has_value()); return SymNode(c10::make_intrusive(val_, coeff_ * *c)); } diff --git a/aten/src/ATen/core/NestedIntSymNodeImpl.h b/aten/src/ATen/core/NestedIntSymNodeImpl.h index 228f4310a38fc..786464c4c3ea8 100644 --- a/aten/src/ATen/core/NestedIntSymNodeImpl.h +++ b/aten/src/ATen/core/NestedIntSymNodeImpl.h @@ -134,11 +134,11 @@ class TORCH_API NestedIntSymNodeImpl : public SymNodeImpl { c10::SymNode le(const c10::SymNode& other) override; c10::SymNode mul(const c10::SymNode& other) override; - c10::optional nested_int() override { + std::optional nested_int() override { return val_; } - c10::optional nested_int_coeff() override { + std::optional nested_int_coeff() override { return coeff_; } diff --git a/aten/src/ATen/core/PythonFallbackKernel.cpp b/aten/src/ATen/core/PythonFallbackKernel.cpp index a34341b4a9437..caef951ed1268 100644 --- a/aten/src/ATen/core/PythonFallbackKernel.cpp +++ b/aten/src/ATen/core/PythonFallbackKernel.cpp @@ -14,7 +14,7 @@ namespace { // To achieve this, we ensure that the tls is empty by default and emptied again both when // we call into user torch_dispatch or returning back to python after this call. 
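The thread_local declared below is the whole mechanism: the ambient dispatch-key TLS is stashed into the optional on entry and reset to std::nullopt before control returns to Python. A stripped-down sketch of that stash/clear idiom (StashGuard is an illustrative name; the guard actually used in this file is not reproduced here):

#include <optional>
#include <c10/core/impl/LocalDispatchKeySet.h>

thread_local std::optional<c10::impl::LocalDispatchKeySet> saved_tls;

struct StashGuard {
  StashGuard() {
    // stash the caller's TLS so the Python handler starts from a clean slate
    saved_tls = c10::impl::tls_local_dispatch_key_set();
  }
  ~StashGuard() {
    // emptied again when control is handed back to Python
    saved_tls = std::nullopt;
  }
};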
-thread_local c10::optional tls_on_entry; +thread_local std::optional tls_on_entry; c10::impl::LocalDispatchKeySet safe_get_tls_on_entry() { TORCH_CHECK(tls_on_entry.has_value(), "Accessing torch dispatch state outside of '__torch_dispatch__' " diff --git a/aten/src/ATen/core/Tensor.cpp b/aten/src/ATen/core/Tensor.cpp index ed19144d0eaff..2ddd9b4e65bac 100644 --- a/aten/src/ATen/core/Tensor.cpp +++ b/aten/src/ATen/core/Tensor.cpp @@ -42,7 +42,7 @@ TensorBase TensorBase::to( at::TensorOptions options, bool non_blocking, bool copy, - c10::optional memory_format) const { + std::optional memory_format) const { Tensor self(*this); return at::_ops::to_dtype_layout::call( self, optTypeMetaToScalarType(options.dtype_opt()), @@ -134,8 +134,8 @@ bool TensorBase::retains_grad() const { } void Tensor::_backward(TensorList inputs, - const c10::optional& gradient, - c10::optional keep_graph, + const std::optional& gradient, + std::optional keep_graph, bool create_graph) const { return impl::GetVariableHooks()->_backward(*this, inputs, gradient, keep_graph, create_graph); } diff --git a/aten/src/ATen/core/TensorBase.h b/aten/src/ATen/core/TensorBase.h index e03c6bdf2bd10..87d5937cf9ebc 100644 --- a/aten/src/ATen/core/TensorBase.h +++ b/aten/src/ATen/core/TensorBase.h @@ -147,7 +147,7 @@ class TORCH_API TensorBase { const TensorBase& fill_(const c10::Scalar& scalar) const; const TensorBase& zero_() const; - TensorBase to(at::TensorOptions options={}, bool non_blocking=false, bool copy=false, c10::optional memory_format=c10::nullopt) const; + TensorBase to(at::TensorOptions options={}, bool non_blocking=false, bool copy=false, std::optional memory_format=c10::nullopt) const; bool is_complex() const { return at::isComplexType(this->scalar_type()); @@ -249,7 +249,7 @@ class TORCH_API TensorBase { return impl_->strides(); } // See impl::get_opt_names in ATen/NamedTensor.h for docs. - c10::optional opt_names() const { + std::optional opt_names() const { return impl::get_opt_names(unsafeGetTensorImpl()); } // See impl::get_names in ATen/NamedTensor.h for docs. @@ -712,7 +712,7 @@ class TORCH_API TensorBase { /// // f requires grad, has no operation creating it /// @endcode - /// \fn void backward(const Tensor & gradient={}, c10::optional retain_graph=c10::nullopt, bool create_graph=false, c10::optional inputs=c10::nullopt) const; + /// \fn void backward(const Tensor & gradient={}, std::optional retain_graph=c10::nullopt, bool create_graph=false, c10::optional inputs=c10::nullopt) const; /// /// Computes the gradient of current tensor with respect to graph leaves. /// @@ -1010,7 +1010,7 @@ struct ExclusivelyOwnedTraits : public c10::ExclusivelyOwnedTens namespace at { inline c10::MaybeOwned borrow_from_optional_tensor( - const c10::optional& opt) { + const std::optional& opt) { return opt.has_value() ? 
c10::MaybeOwned::borrowed(*opt) : c10::MaybeOwned::owned(std::in_place); diff --git a/aten/src/ATen/core/TorchDispatchUtils.cpp b/aten/src/ATen/core/TorchDispatchUtils.cpp index 8f666e5a476ab..32085a9f70627 100644 --- a/aten/src/ATen/core/TorchDispatchUtils.cpp +++ b/aten/src/ATen/core/TorchDispatchUtils.cpp @@ -17,7 +17,7 @@ bool tensorlist_has_dispatch(at::ITensorListRef li) { return false; } -bool tensorlist_has_dispatch(const c10::List>& li) { +bool tensorlist_has_dispatch(const c10::List>& li) { for (auto i : c10::irange(li.size())) { auto t = li.get(i); if (t && tensor_has_dispatch(*t)) { diff --git a/aten/src/ATen/core/TorchDispatchUtils.h b/aten/src/ATen/core/TorchDispatchUtils.h index 0ead779360097..4f5d9e22e4692 100644 --- a/aten/src/ATen/core/TorchDispatchUtils.h +++ b/aten/src/ATen/core/TorchDispatchUtils.h @@ -10,7 +10,7 @@ namespace at::impl { TORCH_API bool tensor_has_dispatch(const at::Tensor& t); TORCH_API bool tensorlist_has_dispatch(at::ITensorListRef li); -TORCH_API bool tensorlist_has_dispatch(const c10::List>& li); +TORCH_API bool tensorlist_has_dispatch(const c10::List>& li); using c10::impl::dispatch_mode_enabled; } diff --git a/aten/src/ATen/core/VariableHooksInterface.h b/aten/src/ATen/core/VariableHooksInterface.h index 47d74f5433ac2..f9c0aa4a5fc14 100644 --- a/aten/src/ATen/core/VariableHooksInterface.h +++ b/aten/src/ATen/core/VariableHooksInterface.h @@ -60,8 +60,8 @@ struct TORCH_API VariableHooksInterface { virtual void _backward( const Tensor&, TensorList, - const c10::optional&, - c10::optional, + const std::optional&, + std::optional, bool) const = 0; virtual void requires_grad_(const TensorBase&, bool) const = 0; virtual void basic_autograd_not_implemented_fallback( diff --git a/aten/src/ATen/core/boxing/KernelFunction.h b/aten/src/ATen/core/boxing/KernelFunction.h index c950f4c80ffc7..7b55c2323a2ff 100644 --- a/aten/src/ATen/core/boxing/KernelFunction.h +++ b/aten/src/ATen/core/boxing/KernelFunction.h @@ -22,7 +22,7 @@ using has_symint = std::is_same, std::is_same, std::is_same, - std::is_same, T> + std::is_same, T> >; template @@ -46,8 +46,8 @@ struct remove_symint { }; template <> -struct remove_symint> { - using type = c10::optional; +struct remove_symint> { + using type = std::optional; }; diff --git a/aten/src/ATen/core/boxing/KernelFunction_impl.h b/aten/src/ATen/core/boxing/KernelFunction_impl.h index 0d6149c8090a9..0ad79b00be56b 100644 --- a/aten/src/ATen/core/boxing/KernelFunction_impl.h +++ b/aten/src/ATen/core/boxing/KernelFunction_impl.h @@ -71,7 +71,7 @@ inline typename remove_symint::type unpackSymInt(c10::SymIn } template <> -inline typename remove_symint>::type unpackSymInt(c10::optional x) { +inline typename remove_symint>::type unpackSymInt(c10::optional x) { return x.has_value() ? 
c10::make_optional(x->guard_int(__FILE__, __LINE__)) : c10::nullopt; } diff --git a/aten/src/ATen/core/boxing/KernelFunction_test.cpp b/aten/src/ATen/core/boxing/KernelFunction_test.cpp index 6453e5e00b5c4..a0f990e87aafe 100644 --- a/aten/src/ATen/core/boxing/KernelFunction_test.cpp +++ b/aten/src/ATen/core/boxing/KernelFunction_test.cpp @@ -6,7 +6,7 @@ using std::vector; using std::tuple; -using c10::optional; +using std::optional; using c10::IValue; using c10::OperatorKernel; using c10::OperatorHandle; diff --git a/aten/src/ATen/core/boxing/impl/kernel_function_legacy_test.cpp b/aten/src/ATen/core/boxing/impl/kernel_function_legacy_test.cpp index 7eb0137b283fc..fa562c1d7ca4f 100644 --- a/aten/src/ATen/core/boxing/impl/kernel_function_legacy_test.cpp +++ b/aten/src/ATen/core/boxing/impl/kernel_function_legacy_test.cpp @@ -207,15 +207,15 @@ TEST(OperatorRegistrationTestLegacyFunctionBasedKernel, givenKernelWithIntListOu EXPECT_EQ(6, result[0].toIntVector()[2]); } -std::tuple, c10::optional, Dict> kernelWithMultipleOutputs(Tensor) { +std::tuple, std::optional, Dict> kernelWithMultipleOutputs(Tensor) { Dict dict; dict.insert("first", dummyTensor(DispatchKey::CPU)); dict.insert("second", dummyTensor(DispatchKey::CUDA)); - return std::tuple, c10::optional, Dict>( + return std::tuple, std::optional, Dict>( dummyTensor(DispatchKey::CUDA), 5, {dummyTensor(DispatchKey::CPU), dummyTensor(DispatchKey::CUDA)}, - c10::optional(std::in_place, 0), + std::optional(std::in_place, 0), dict ); } @@ -808,11 +808,11 @@ TEST(OperatorRegistrationTestLegacyFunctionBasedKernel, givenFallbackKernelWitho EXPECT_EQ(4, outputs[0].toInt()); } -c10::optional called_arg2 = c10::nullopt; -c10::optional called_arg3 = c10::nullopt; -c10::optional called_arg4 = c10::nullopt; +std::optional called_arg2 = c10::nullopt; +std::optional called_arg3 = c10::nullopt; +std::optional called_arg4 = c10::nullopt; -void kernelWithOptInputWithoutOutput(Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { +void kernelWithOptInputWithoutOutput(Tensor arg1, const std::optional& arg2, c10::optional arg3, c10::optional arg4) { called = true; called_arg2 = arg2; called_arg3 = arg3; @@ -846,7 +846,7 @@ TEST(OperatorRegistrationTestLegacyFunctionBasedKernel, givenKernelWithOptionalI EXPECT_FALSE(called_arg4.has_value()); } -c10::optional kernelWithOptInputWithOutput(Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { +std::optional kernelWithOptInputWithOutput(Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { called = true; called_arg2 = arg2; called_arg3 = arg3; @@ -883,8 +883,8 @@ TEST(OperatorRegistrationTestLegacyFunctionBasedKernel, givenKernelWithOptionalI EXPECT_FALSE(called_arg4.has_value()); } -std::tuple, c10::optional, c10::optional> -kernelWithOptInputWithMultipleOutputs(Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { +std::tuple, c10::optional, c10::optional> +kernelWithOptInputWithMultipleOutputs(Tensor arg1, const std::optional& arg2, c10::optional arg3, c10::optional arg4) { return std::make_tuple(arg2, arg3, arg4); } @@ -936,7 +936,7 @@ TEST(OperatorRegistrationTestLegacyFunctionBasedKernel, givenKernel_whenRegister auto op = c10::Dispatcher::singleton().findSchema({"_test::no_schema_specified", ""}); ASSERT_TRUE(op.has_value()); - c10::optional differences = c10::findSchemaDifferences(torch::jit::parseSchema("_test::no_schema_specified(Tensor arg1, int arg2, Tensor[] arg3) -> (int, Tensor)"), 
op->schema()); + std::optional differences = c10::findSchemaDifferences(torch::jit::parseSchema("_test::no_schema_specified(Tensor arg1, int arg2, Tensor[] arg3) -> (int, Tensor)"), op->schema()); EXPECT_FALSE(differences.has_value()); } diff --git a/aten/src/ATen/core/boxing/impl/kernel_function_test.cpp b/aten/src/ATen/core/boxing/impl/kernel_function_test.cpp index 15f7caae529b4..ed448d054c713 100644 --- a/aten/src/ATen/core/boxing/impl/kernel_function_test.cpp +++ b/aten/src/ATen/core/boxing/impl/kernel_function_test.cpp @@ -223,15 +223,15 @@ TEST(OperatorRegistrationTestFunctionBasedKernel, givenKernelWithIntListOutput_w EXPECT_EQ(6, result[0].toIntVector()[2]); } -std::tuple, c10::optional, Dict> kernelWithMultipleOutputs(Tensor) { +std::tuple, std::optional, Dict> kernelWithMultipleOutputs(Tensor) { Dict dict; dict.insert("first", dummyTensor(DispatchKey::CPU)); dict.insert("second", dummyTensor(DispatchKey::CUDA)); - return std::tuple, c10::optional, Dict>( + return std::tuple, std::optional, Dict>( dummyTensor(DispatchKey::CUDA), 5, c10::List({dummyTensor(DispatchKey::CPU), dummyTensor(DispatchKey::CUDA)}), - c10::optional(std::in_place, 0), + std::optional(std::in_place, 0), dict ); } @@ -550,11 +550,11 @@ TEST(OperatorRegistrationTestFunctionBasedKernel, givenFallbackKernelWithoutTens EXPECT_EQ(4, outputs[0].toInt()); } -c10::optional called_arg2 = c10::nullopt; -c10::optional called_arg3 = c10::nullopt; -c10::optional called_arg4 = c10::nullopt; +std::optional called_arg2 = c10::nullopt; +std::optional called_arg3 = c10::nullopt; +std::optional called_arg4 = c10::nullopt; -void kernelWithOptInputWithoutOutput(Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { +void kernelWithOptInputWithoutOutput(Tensor arg1, const std::optional& arg2, c10::optional arg3, c10::optional arg4) { called = true; called_arg2 = arg2; called_arg3 = arg3; @@ -588,7 +588,7 @@ TEST(OperatorRegistrationTestFunctionBasedKernel, givenKernelWithOptionalInputs_ EXPECT_FALSE(called_arg4.has_value()); } -c10::optional kernelWithOptInputWithOutput(Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { +std::optional kernelWithOptInputWithOutput(Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { called = true; called_arg2 = arg2; called_arg3 = arg3; @@ -625,8 +625,8 @@ TEST(OperatorRegistrationTestFunctionBasedKernel, givenKernelWithOptionalInputs_ EXPECT_FALSE(called_arg4.has_value()); } -std::tuple, c10::optional, c10::optional> -kernelWithOptInputWithMultipleOutputs(Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { +std::tuple, c10::optional, c10::optional> +kernelWithOptInputWithMultipleOutputs(Tensor arg1, const std::optional& arg2, c10::optional arg3, c10::optional arg4) { return std::make_tuple(arg2, arg3, arg4); } @@ -690,7 +690,7 @@ TEST(OperatorRegistrationTestFunctionBasedKernel, givenKernel_whenRegisteredWith auto op = c10::Dispatcher::singleton().findSchema({"_test::no_schema_specified", ""}); ASSERT_TRUE(op.has_value()); - c10::optional differences = c10::findSchemaDifferences(torch::jit::parseSchema("_test::no_schema_specified(Tensor arg1, int arg2, Tensor[] arg3) -> (int, Tensor)"), op->schema()); + std::optional differences = c10::findSchemaDifferences(torch::jit::parseSchema("_test::no_schema_specified(Tensor arg1, int arg2, Tensor[] arg3) -> (int, Tensor)"), op->schema()); EXPECT_FALSE(differences.has_value()); } diff --git 
a/aten/src/ATen/core/boxing/impl/kernel_lambda_legacy_test.cpp b/aten/src/ATen/core/boxing/impl/kernel_lambda_legacy_test.cpp index a1a1b37e2d83e..22203b7326f38 100644 --- a/aten/src/ATen/core/boxing/impl/kernel_lambda_legacy_test.cpp +++ b/aten/src/ATen/core/boxing/impl/kernel_lambda_legacy_test.cpp @@ -188,15 +188,15 @@ TEST(OperatorRegistrationTestLegacyLambdaBasedKernel, givenKernelWithIntListOutp TEST(OperatorRegistrationTestLegacyLambdaBasedKernel, givenKernelWithMultipleOutputs_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::multiple_outputs(Tensor dummy) -> (Tensor, int, Tensor[], int?, Dict(str, Tensor))", [] (Tensor) -> std::tuple, c10::optional, Dict> { + .op("_test::multiple_outputs(Tensor dummy) -> (Tensor, int, Tensor[], int?, Dict(str, Tensor))", [] (Tensor) -> std::tuple, std::optional, Dict> { Dict dict; dict.insert("first", dummyTensor(DispatchKey::CPU)); dict.insert("second", dummyTensor(DispatchKey::CUDA)); - return std::tuple, c10::optional, Dict>( + return std::tuple, std::optional, Dict>( dummyTensor(DispatchKey::CUDA), 5, {dummyTensor(DispatchKey::CPU), dummyTensor(DispatchKey::CUDA)}, - c10::optional(std::in_place, 0), + std::optional(std::in_place, 0), dict ); }); @@ -733,13 +733,13 @@ TEST(OperatorRegistrationTestLegacyLambdaBasedKernel, givenFallbackKernelWithout TEST(OperatorRegistrationTestLegacyLambdaBasedKernel, givenKernelWithOptionalInputs_withoutOutput_whenRegistered_thenCanBeCalled) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) bool called; - c10::optional called_arg2 = c10::nullopt; - c10::optional called_arg3 = c10::nullopt; - c10::optional called_arg4 = c10::nullopt; + std::optional called_arg2 = c10::nullopt; + std::optional called_arg3 = c10::nullopt; + std::optional called_arg4 = c10::nullopt; auto registrar = RegisterOperators().op( "_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> ()", - [&] (Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { + [&] (Tensor arg1, const std::optional& arg2, c10::optional arg3, c10::optional arg4) { called = true; called_arg2 = arg2; called_arg3 = arg3; @@ -773,13 +773,13 @@ TEST(OperatorRegistrationTestLegacyLambdaBasedKernel, givenKernelWithOptionalInp TEST(OperatorRegistrationTestLegacyLambdaBasedKernel, givenKernelWithOptionalInputs_withOutput_whenRegistered_thenCanBeCalled) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) bool called; - c10::optional called_arg2 = c10::nullopt; - c10::optional called_arg3 = c10::nullopt; - c10::optional called_arg4 = c10::nullopt; + std::optional called_arg2 = c10::nullopt; + std::optional called_arg3 = c10::nullopt; + std::optional called_arg4 = c10::nullopt; auto registrar = RegisterOperators().op( "_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? 
arg4) -> Tensor?", - [&] (Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { + [&] (Tensor arg1, const std::optional& arg2, c10::optional arg3, c10::optional arg4) { called = true; called_arg2 = arg2; called_arg3 = arg3; @@ -816,13 +816,13 @@ TEST(OperatorRegistrationTestLegacyLambdaBasedKernel, givenKernelWithOptionalInp TEST(OperatorRegistrationTestLegacyLambdaBasedKernel, givenKernelWithOptionalInputs_withMultipleOutputs_whenRegistered_thenCanBeCalled) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) bool called; - c10::optional called_arg2 = c10::nullopt; - c10::optional called_arg3 = c10::nullopt; - c10::optional called_arg4 = c10::nullopt; + std::optional called_arg2 = c10::nullopt; + std::optional called_arg3 = c10::nullopt; + std::optional called_arg4 = c10::nullopt; auto registrar = RegisterOperators().op( "_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> (Tensor?, int?, str?)", - [] (Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { + [] (Tensor arg1, const std::optional& arg2, c10::optional arg3, c10::optional arg4) { return std::make_tuple(arg2, arg3, arg4); }); auto op = c10::Dispatcher::singleton().findSchema({"_test::opt_input", ""}); @@ -866,7 +866,7 @@ TEST(OperatorRegistrationTestLegacyLambdaBasedKernel, givenKernel_whenRegistered auto op = c10::Dispatcher::singleton().findSchema({"_test::no_schema_specified", ""}); ASSERT_TRUE(op.has_value()); - c10::optional differences = c10::findSchemaDifferences(torch::jit::parseSchema("_test::no_schema_specified(Tensor arg1, int arg2, Tensor[] arg3) -> (int, Tensor)"), op->schema()); + std::optional differences = c10::findSchemaDifferences(torch::jit::parseSchema("_test::no_schema_specified(Tensor arg1, int arg2, Tensor[] arg3) -> (int, Tensor)"), op->schema()); EXPECT_FALSE(differences.has_value()); } diff --git a/aten/src/ATen/core/boxing/impl/kernel_lambda_test.cpp b/aten/src/ATen/core/boxing/impl/kernel_lambda_test.cpp index dc463cb3fe180..ea06bbccc7bd6 100644 --- a/aten/src/ATen/core/boxing/impl/kernel_lambda_test.cpp +++ b/aten/src/ATen/core/boxing/impl/kernel_lambda_test.cpp @@ -187,15 +187,15 @@ TEST(OperatorRegistrationTestLambdaBasedKernel, givenKernelWithIntListOutput_whe TEST(OperatorRegistrationTestLambdaBasedKernel, givenKernelWithMultipleOutputs_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() .op("_test::multiple_outputs(Tensor dummy) -> (Tensor, int, Tensor[], int?, Dict(str, Tensor))", - RegisterOperators::options().kernel(DispatchKey::CPU, [] (Tensor) -> std::tuple, c10::optional, Dict> { + RegisterOperators::options().kernel(DispatchKey::CPU, [] (Tensor) -> std::tuple, std::optional, Dict> { Dict dict; dict.insert("first", dummyTensor(DispatchKey::CPU)); dict.insert("second", dummyTensor(DispatchKey::CUDA)); - return std::tuple, c10::optional, Dict>( + return std::tuple, std::optional, Dict>( dummyTensor(DispatchKey::CUDA), 5, c10::List({dummyTensor(DispatchKey::CPU), dummyTensor(DispatchKey::CUDA)}), - c10::optional(std::in_place, 0), + std::optional(std::in_place, 0), dict ); })); @@ -466,14 +466,14 @@ TEST(OperatorRegistrationTestLambdaBasedKernel, givenFallbackKernelWithoutTensor EXPECT_EQ(4, outputs[0].toInt()); } -c10::optional called_arg2 = c10::nullopt; -c10::optional called_arg3 = c10::nullopt; -c10::optional called_arg4 = c10::nullopt; +std::optional called_arg2 = c10::nullopt; +std::optional called_arg3 = c10::nullopt; +std::optional called_arg4 = c10::nullopt; 
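The lambda-kernel tests that follow all register the same shape of operator: schema arguments spelled Tensor?, int?, and str? arrive in C++ as std::optional parameters. A condensed, self-contained version of that registration (the _illustrative:: operator name is made up; the real tests use _test::opt_input together with their own fixtures and assertions):

#include <optional>
#include <string>
#include <ATen/core/Tensor.h>
#include <ATen/core/op_registration/op_registration.h>

static auto registry = c10::RegisterOperators().op(
    "_illustrative::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> ()",
    c10::RegisterOperators::options().kernel(
        c10::DispatchKey::CPU,
        [](at::Tensor /*arg1*/,
           const std::optional<at::Tensor>& /*arg2*/,
           std::optional<int64_t> /*arg3*/,
           std::optional<std::string> /*arg4*/) {}));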
TEST(OperatorRegistrationTestLambdaBasedKernel, givenKernelWithOptionalInputs_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators().op( "_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> ()", - RegisterOperators::options().kernel(DispatchKey::CPU, [] (Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { + RegisterOperators::options().kernel(DispatchKey::CPU, [] (Tensor arg1, const std::optional& arg2, c10::optional arg3, c10::optional arg4) { called = true; called_arg2 = arg2; called_arg3 = arg3; @@ -507,7 +507,7 @@ TEST(OperatorRegistrationTestLambdaBasedKernel, givenKernelWithOptionalInputs_wi TEST(OperatorRegistrationTestLambdaBasedKernel, givenKernelWithOptionalInputs_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators().op( "_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> Tensor?", - RegisterOperators::options().kernel(DispatchKey::CPU, [] (Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { + RegisterOperators::options().kernel(DispatchKey::CPU, [] (Tensor arg1, const std::optional& arg2, c10::optional arg3, c10::optional arg4) { called = true; called_arg2 = arg2; called_arg3 = arg3; @@ -544,7 +544,7 @@ TEST(OperatorRegistrationTestLambdaBasedKernel, givenKernelWithOptionalInputs_wi TEST(OperatorRegistrationTestLambdaBasedKernel, givenKernelWithOptionalInputs_withMultipleOutputs_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators().op( "_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> (Tensor?, int?, str?)", - RegisterOperators::options().kernel(DispatchKey::CPU, [] (Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { + RegisterOperators::options().kernel(DispatchKey::CPU, [] (Tensor arg1, const std::optional& arg2, c10::optional arg3, c10::optional arg4) { return std::make_tuple(arg2, arg3, arg4); })); auto op = c10::Dispatcher::singleton().findSchema({"_test::opt_input", ""}); @@ -588,7 +588,7 @@ TEST(OperatorRegistrationTestLambdaBasedKernel, givenKernel_whenRegisteredWithou auto op = c10::Dispatcher::singleton().findSchema({"_test::no_schema_specified", ""}); ASSERT_TRUE(op.has_value()); - c10::optional differences = c10::findSchemaDifferences(torch::jit::parseSchema("_test::no_schema_specified(Tensor arg1, int arg2, Tensor[] arg3) -> (int, Tensor)"), op->schema()); + std::optional differences = c10::findSchemaDifferences(torch::jit::parseSchema("_test::no_schema_specified(Tensor arg1, int arg2, Tensor[] arg3) -> (int, Tensor)"), op->schema()); EXPECT_FALSE(differences.has_value()); } diff --git a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h index ccd94ff1de2be..4642be5d689a5 100644 --- a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h +++ b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h @@ -116,7 +116,7 @@ namespace impl { }; template - struct assert_is_valid_input_type, AllowDeprecatedTypes> + struct assert_is_valid_input_type, AllowDeprecatedTypes> : assert_is_valid_input_type {}; template @@ -226,7 +226,7 @@ namespace impl { }; template - struct assert_is_valid_output_type, AllowDeprecatedTypes> + struct assert_is_valid_output_type, AllowDeprecatedTypes> : assert_is_valid_output_type {}; template diff --git a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor_test.cpp 
b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor_test.cpp index 337f0d4c0cad3..1609e014f43f0 100644 --- a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor_test.cpp +++ b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor_test.cpp @@ -205,15 +205,15 @@ TEST(OperatorRegistrationTestFunctorBasedKernel, givenKernelWithIntListOutput_wh } struct KernelWithMultipleOutputs final : OperatorKernel { - std::tuple, c10::optional, Dict> operator()(Tensor) { + std::tuple, std::optional, Dict> operator()(Tensor) { Dict dict; dict.insert("first", dummyTensor(DispatchKey::CPU)); dict.insert("second", dummyTensor(DispatchKey::CUDA)); - return std::tuple, c10::optional, Dict>( + return std::tuple, std::optional, Dict>( dummyTensor(DispatchKey::CUDA), 5, c10::List({dummyTensor(DispatchKey::CPU), dummyTensor(DispatchKey::CUDA)}), - c10::optional(std::in_place, 0), + std::optional(std::in_place, 0), dict ); } @@ -679,12 +679,12 @@ TEST(OperatorRegistrationTestFunctorBasedKernel, givenFallbackKernelWithoutTenso EXPECT_EQ(4, outputs[0].toInt()); } -c10::optional called_arg2 = c10::nullopt; -c10::optional called_arg3 = c10::nullopt; -c10::optional called_arg4 = c10::nullopt; +std::optional called_arg2 = c10::nullopt; +std::optional called_arg3 = c10::nullopt; +std::optional called_arg4 = c10::nullopt; struct KernelWithOptInputWithoutOutput final : OperatorKernel { - void operator()(Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { + void operator()(Tensor arg1, const std::optional& arg2, c10::optional arg3, c10::optional arg4) { called = true; called_arg2 = arg2; called_arg3 = arg3; @@ -720,7 +720,7 @@ TEST(OperatorRegistrationTestFunctorBasedKernel, givenKernelWithOptionalInputs_w } struct KernelWithOptInputWithOutput final : OperatorKernel { - c10::optional operator()(Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { + std::optional operator()(Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { called = true; called_arg2 = arg2; called_arg3 = arg3; @@ -759,8 +759,8 @@ TEST(OperatorRegistrationTestFunctorBasedKernel, givenKernelWithOptionalInputs_w } struct KernelWithOptInputWithMultipleOutputs final : OperatorKernel { - std::tuple, c10::optional, c10::optional> - operator()(Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { + std::tuple, c10::optional, c10::optional> + operator()(Tensor arg1, const std::optional& arg2, c10::optional arg3, c10::optional arg4) { return std::make_tuple(arg2, arg3, arg4); } }; @@ -821,7 +821,7 @@ TEST(OperatorRegistrationTestFunctorBasedKernel, givenKernel_whenRegisteredWitho auto op = c10::Dispatcher::singleton().findSchema({"_test::no_schema_specified", ""}); ASSERT_TRUE(op.has_value()); - c10::optional differences = c10::findSchemaDifferences(torch::jit::parseSchema("_test::no_schema_specified(Tensor arg1, int arg2, Tensor[] arg3) -> (int, Tensor)"), op->schema()); + std::optional differences = c10::findSchemaDifferences(torch::jit::parseSchema("_test::no_schema_specified(Tensor arg1, int arg2, Tensor[] arg3) -> (int, Tensor)"), op->schema()); EXPECT_FALSE(differences.has_value()); } @@ -832,7 +832,7 @@ TEST(OperatorRegistrationTestFunctorBasedKernel, givenKernel_whenRegisteredCatch auto op = c10::Dispatcher::singleton().findSchema({"_test::no_schema_specified", ""}); ASSERT_TRUE(op.has_value()); - c10::optional differences = c10::findSchemaDifferences(torch::jit::parseSchema("_test::no_schema_specified(Tensor arg1, 
int arg2, Tensor[] arg3) -> (int, Tensor)"), op->schema()); + std::optional differences = c10::findSchemaDifferences(torch::jit::parseSchema("_test::no_schema_specified(Tensor arg1, int arg2, Tensor[] arg3) -> (int, Tensor)"), op->schema()); EXPECT_FALSE(differences.has_value()); } diff --git a/aten/src/ATen/core/builtin_function.h b/aten/src/ATen/core/builtin_function.h index b25ca55c16851..9aef3a0f62cf5 100644 --- a/aten/src/ATen/core/builtin_function.h +++ b/aten/src/ATen/core/builtin_function.h @@ -63,7 +63,7 @@ struct BuiltinOpFunction : public Function { bool call( Stack& stack, - c10::optional, + std::optional, c10::function_ref) override { run(stack); return false; diff --git a/aten/src/ATen/core/class_type.cpp b/aten/src/ATen/core/class_type.cpp index b4ef2979738f9..0a9a8074067ee 100644 --- a/aten/src/ATen/core/class_type.cpp +++ b/aten/src/ATen/core/class_type.cpp @@ -469,7 +469,7 @@ bool ClassType::isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const { } ClassTypePtr ClassType::create( - c10::optional qualifiedName, + std::optional qualifiedName, std::weak_ptr cu, bool is_module, std::string doc_string, @@ -483,7 +483,7 @@ ClassTypePtr ClassType::create( } ClassType::ClassType( - c10::optional name, + std::optional name, std::weak_ptr cu, bool is_module, std::string doc_string, @@ -620,7 +620,7 @@ IValue ClassType::getConstant(size_t slot) const { return constantValues_[slot]; } -c10::optional ClassType::findConstant(const std::string& name) const { +std::optional ClassType::findConstant(const std::string& name) const { TORCH_INTERNAL_ASSERT(constantNames_.size() == constantValues_.size()); size_t pos = 0; for (const auto& c : constantNames_) { @@ -652,7 +652,7 @@ std::shared_ptr ClassType::compilation_unit() const { return cu; } -c10::optional ClassType::getProperty(const std::string& name) { +std::optional ClassType::getProperty(const std::string& name) { for (auto& prop : properties_) { if (name == prop.name) { return prop; @@ -667,7 +667,7 @@ void ClassType::addProperty(const std::string& name, torch::jit::Function* gette properties_.push_back({name, getter, setter}); } -c10::optional ClassType::findConstantSlot(const std::string& name) const { +std::optional ClassType::findConstantSlot(const std::string& name) const { TORCH_CHECK(constantNames_.size() == constantValues_.size()); size_t slot = 0; for (const auto& constant : constantNames_) { diff --git a/aten/src/ATen/core/class_type.h b/aten/src/ATen/core/class_type.h index 99fd27bba5426..b137f0ed208a1 100644 --- a/aten/src/ATen/core/class_type.h +++ b/aten/src/ATen/core/class_type.h @@ -74,7 +74,7 @@ struct TORCH_API ClassType : public NamedType { // Create a class type with name `name` and its methods stored in `cu`. static ClassTypePtr create( - c10::optional qualifiedName, + std::optional qualifiedName, std::weak_ptr cu, bool is_module = false, std::string doc_string = "", @@ -152,7 +152,7 @@ struct TORCH_API ClassType : public NamedType { // Attributes are stored in a specific slot at runtime for effiency. // When emitting instructions we specify the slot so that attribute access is // a constant lookup - c10::optional findAttributeSlot(const std::string& name) const { + std::optional findAttributeSlot(const std::string& name) const { size_t slot = 0; for (const auto& attr : attributes_) { if (name == attr.getName()) { @@ -239,7 +239,7 @@ struct TORCH_API ClassType : public NamedType { } // Get the property with the given \p name, if it exists on the class. 
- c10::optional getProperty(const std::string& name); + std::optional getProperty(const std::string& name); // Add a property named \p name with \p getter and \p setter as its getter and setter. void addProperty(const std::string& name, torch::jit::Function* getter, torch::jit::Function* setter); // Get a list of all properties. @@ -257,7 +257,7 @@ struct TORCH_API ClassType : public NamedType { size_t addConstant(const std::string& name, const IValue& value); - c10::optional findConstantSlot(const std::string& name) const; + std::optional findConstantSlot(const std::string& name) const; size_t getConstantSlot(const std::string& name) const { if (auto r = findConstantSlot(name)) { @@ -281,7 +281,7 @@ struct TORCH_API ClassType : public NamedType { IValue getConstant(size_t slot) const; - c10::optional findConstant(const std::string& name) const; + std::optional findConstant(const std::string& name) const; size_t numConstants() const; @@ -384,7 +384,7 @@ struct TORCH_API ClassType : public NamedType { private: ClassType( - c10::optional name, + std::optional name, std::weak_ptr cu, bool is_module = false, std::string doc_string = "", diff --git a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h index 33e910591de0a..46c291bada308 100644 --- a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h +++ b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h @@ -56,7 +56,7 @@ namespace detail { void operator()(const at::Tensor& x) { ts = ts | x.key_set(); } - void operator()(const c10::optional& x) { + void operator()(const std::optional& x) { if (x.has_value()) { ts = ts | x->key_set(); } @@ -67,8 +67,8 @@ namespace detail { } } // Tensor?[] translates to this case. - void operator()(const c10::List>& xs) { - for (c10::optional x : xs) { + void operator()(const c10::List>& xs) { + for (std::optional x : xs) { if (x.has_value()) { ts = ts | x.value().key_set(); } @@ -80,7 +80,7 @@ namespace detail { ts = ts | x.key_set(); } } - [[noreturn]] void operator()(at::ArrayRef>) { + [[noreturn]] void operator()(at::ArrayRef>) { // Just checking that the handling of Tensor?[] didn't change. 
TORCH_INTERNAL_ASSERT(false); } @@ -89,7 +89,7 @@ namespace detail { ts = ts | gen.key_set(); } } - void operator()(const c10::optional& gen) { + void operator()(const std::optional& gen) { if (gen.has_value() && gen->defined()) { ts = ts | gen->key_set(); } diff --git a/aten/src/ATen/core/dispatch/Dispatcher.cpp b/aten/src/ATen/core/dispatch/Dispatcher.cpp index 6077ac8e34cc8..85897f7653ee6 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.cpp +++ b/aten/src/ATen/core/dispatch/Dispatcher.cpp @@ -76,8 +76,8 @@ C10_EXPORT Dispatcher& Dispatcher::realSingleton() { return _singleton; } -c10::optional Dispatcher::findOp(const OperatorName& overload_name) { - return operatorLookupTable_.read([&] (const ska::flat_hash_map& operatorLookupTable) -> c10::optional { +std::optional Dispatcher::findOp(const OperatorName& overload_name) { + return operatorLookupTable_.read([&] (const ska::flat_hash_map& operatorLookupTable) -> std::optional { auto found = operatorLookupTable.find(overload_name); if (found == operatorLookupTable.end()) { return c10::nullopt; @@ -103,7 +103,7 @@ void Dispatcher::waitForDef(const FunctionSchema& schema) { "the same dependencies."); } -void Dispatcher::waitForImpl(const OperatorName& op_name, c10::optional maybe_dk) { +void Dispatcher::waitForImpl(const OperatorName& op_name, std::optional maybe_dk) { using namespace std::chrono_literals; std::unique_lock lock(guard_->mutex); auto dk = maybe_dk.value_or(DispatchKey::CompositeImplicitAutograd); @@ -121,7 +121,7 @@ void Dispatcher::waitForImpl(const OperatorName& op_name, c10::optional Dispatcher::findSchema(const OperatorName& overload_name) { +std::optional Dispatcher::findSchema(const OperatorName& overload_name) { auto it = findOp(overload_name); if (it.has_value()) { if (it->hasSchema()) { @@ -275,7 +275,7 @@ PythonModuleMapType& pythonModulesSingleton() { } -c10::optional> Dispatcher::getPyStub(OperatorName op_name) { +std::optional> Dispatcher::getPyStub(OperatorName op_name) { std::lock_guard lock(guard_->mutex); auto found = pythonModulesSingleton().find(op_name); if (found == pythonModulesSingleton().end()) { @@ -332,9 +332,9 @@ void Dispatcher::throwIfHasPythonModule(OperatorName op_name) { RegistrationHandleRAII Dispatcher::registerImpl( OperatorName op_name, - c10::optional dispatch_key, + std::optional dispatch_key, KernelFunction kernel, - c10::optional cpp_signature, + std::optional cpp_signature, std::unique_ptr inferred_function_schema, std::string debug ) { @@ -364,7 +364,7 @@ RegistrationHandleRAII Dispatcher::registerImpl( }); } -void Dispatcher::deregisterImpl_(const OperatorHandle& op, const OperatorName& op_name, c10::optional dispatch_key, impl::OperatorEntry::AnnotatedKernelContainerIterator handle) { +void Dispatcher::deregisterImpl_(const OperatorHandle& op, const OperatorName& op_name, std::optional dispatch_key, impl::OperatorEntry::AnnotatedKernelContainerIterator handle) { op.operatorDef_->op.deregisterKernel_(*this, dispatch_key, handle); TORCH_INTERNAL_ASSERT(op.operator_name() == op_name); @@ -486,7 +486,7 @@ std::vector Dispatcher::findDanglingImpls() const { }); } -std::vector Dispatcher::getRegistrationsForDispatchKey(c10::optional k) const { +std::vector Dispatcher::getRegistrationsForDispatchKey(std::optional k) const { return operatorLookupTable_.read([&] (const ska::flat_hash_map& operatorLookupTable) -> std::vector { std::vector op_names; for (const auto& op : operatorLookupTable) { diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h 
index caf73d7cebb21..6e679992a9f2d 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.h +++ b/aten/src/ATen/core/dispatch/Dispatcher.h @@ -137,7 +137,7 @@ class TORCH_API Dispatcher final { * and returns it if it is registered WITH A SCHEMA. * Returns nullopt otherwise. */ - c10::optional findSchema(const OperatorName& operator_name); + std::optional findSchema(const OperatorName& operator_name); /** * Variant of findSchema that results in less code generated at the call site. @@ -155,7 +155,7 @@ class TORCH_API Dispatcher final { OperatorHandle findSchemaOrThrow(const char* name, const char* overload_name); // Like findSchema, but also returns OperatorHandle even if there is no schema - c10::optional findOp(const OperatorName& operator_name); + std::optional findOp(const OperatorName& operator_name); // Returns a list of all operator names present in the operatorLookupTable_ const std::vector getAllOpNames(); @@ -196,7 +196,7 @@ class TORCH_API Dispatcher final { // Used by torchdeploy/multipy for multiple interpreters racing. void waitForDef(const FunctionSchema& schema); - void waitForImpl(const OperatorName& op_name, c10::optional dispatch_key); + void waitForImpl(const OperatorName& op_name, std::optional dispatch_key); // ------------------------------------------------------------------------ // @@ -221,7 +221,7 @@ class TORCH_API Dispatcher final { */ // NB: steals the inferred function schema, as we may need to hold on to // it for a bit until the real schema turns up - RegistrationHandleRAII registerImpl(OperatorName op_name, c10::optional dispatch_key, KernelFunction kernel, c10::optional cpp_signature, std::unique_ptr inferred_function_schema, std::string debug); + RegistrationHandleRAII registerImpl(OperatorName op_name, std::optional dispatch_key, KernelFunction kernel, c10::optional cpp_signature, std::unique_ptr inferred_function_schema, std::string debug); /** * Given an operator, tells the Dispatcher that we have implemented a fake impl @@ -234,7 +234,7 @@ class TORCH_API Dispatcher final { */ void throwIfHasPythonModule(OperatorName op_name); - c10::optional> getPyStub(OperatorName op_name); + std::optional> getPyStub(OperatorName op_name); /** * Register a new operator by name. @@ -299,7 +299,7 @@ class TORCH_API Dispatcher final { * Returns the names of all operators with a kernel registered for the specified DispatchKey. * If no DispatchKey is specified, it returns all registered operators. 
*/ - std::vector getRegistrationsForDispatchKey(c10::optional k) const; + std::vector getRegistrationsForDispatchKey(std::optional k) const; private: Dispatcher(); @@ -321,7 +321,7 @@ class TORCH_API Dispatcher final { void deregisterImpl_( const OperatorHandle& op, const OperatorName& op_name, - c10::optional dispatch_key, + std::optional dispatch_key, impl::OperatorEntry::AnnotatedKernelContainerIterator kernel_handle); void deregisterName_(const OperatorHandle& op, const OperatorName& op_name); void deregisterFallback_(DispatchKey dispatchKey); diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp index 5f4538f2c9790..74e5a7e2cf955 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp +++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp @@ -7,7 +7,7 @@ namespace c10 { namespace impl { namespace { - std::string toString(c10::optional k) { + std::string toString(std::optional k) { if (k.has_value()) { return toString(*k); } else { @@ -39,7 +39,7 @@ namespace { // TODO: figure out if we can just directly save real schema at def time FunctionSchema from_def = from_def_.cloneWithRealTypes(kernel.isValidSymUnboxed()); FunctionSchema inferred = inferred_.cloneWithRealTypes(); - c10::optional schema_difference = findSchemaDifferences(from_def, inferred); + std::optional schema_difference = findSchemaDifferences(from_def, inferred); if (schema_difference.has_value()) { TORCH_CHECK(false, "Inferred operator schema for a C++ kernel function doesn't match the expected function schema.\n" @@ -101,9 +101,9 @@ void OperatorEntry::deregisterSchema() { OperatorEntry::AnnotatedKernelContainerIterator OperatorEntry::registerKernel( const c10::Dispatcher& dispatcher, - c10::optional dispatch_key, + std::optional dispatch_key, KernelFunction kernel, - c10::optional cpp_signature, + std::optional cpp_signature, std::unique_ptr inferred_function_schema, std::string debug ) { @@ -181,7 +181,7 @@ OperatorEntry::AnnotatedKernelContainerIterator OperatorEntry::registerKernel( void OperatorEntry::deregisterKernel_( const c10::Dispatcher& dispatcher, - c10::optional dispatch_key, + std::optional dispatch_key, AnnotatedKernelContainerIterator kernel ) { // Redirect catchAll deregistrations to CompositeImplicitAutograd. 
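That last comment is the key to the std::optional<DispatchKey> parameters threaded through registerImpl, registerKernel and deregisterKernel_ above: an empty optional means "no specific key", i.e. a catch-all registration, which the dispatcher folds into CompositeImplicitAutograd. A small sketch of the convention, in the spirit of the file-local toString() changed earlier (describe_key is an illustrative name):

#include <optional>
#include <string>
#include <c10/core/DispatchKey.h>

std::string describe_key(std::optional<c10::DispatchKey> k) {
  // nullopt is the "register for every key" case
  return k.has_value() ? std::string(c10::toString(*k))
                       : std::string("(catch all)");
}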
diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.h b/aten/src/ATen/core/dispatch/OperatorEntry.h index 903ff043799b2..873b385845ed3 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.h +++ b/aten/src/ATen/core/dispatch/OperatorEntry.h @@ -129,9 +129,9 @@ class TORCH_API OperatorEntry final { // Postcondition: caller is responsible for disposing of the kernel AnnotatedKernelContainerIterator registerKernel( const Dispatcher& dispatcher, - c10::optional dispatch_key, + std::optional dispatch_key, KernelFunction kernel, - c10::optional cpp_signature, + std::optional cpp_signature, std::unique_ptr inferred_function_schema, std::string debug ); @@ -139,7 +139,7 @@ class TORCH_API OperatorEntry final { // Precondition: Dispatcher::mutex_ is held void deregisterKernel_( const Dispatcher& dispatcher, - c10::optional dispatch_key, + std::optional dispatch_key, AnnotatedKernelContainerIterator kernel ); @@ -221,7 +221,7 @@ class TORCH_API OperatorEntry final { private: OperatorName name_; - c10::optional schema_; + std::optional schema_; #ifndef C10_MOBILE std::vector tags_; #endif @@ -282,10 +282,10 @@ class TORCH_API OperatorEntry final { struct CppSignatureWithDebug { CppSignature signature; std::string debug; - c10::optional dispatch_key; + std::optional dispatch_key; }; - c10::optional cpp_signature_; - c10::optional sym_cpp_signature_; + std::optional cpp_signature_; + std::optional sym_cpp_signature_; // A Python custom error handler for OperatorEntry::reportError std::unique_ptr report_error_callback_; diff --git a/aten/src/ATen/core/dynamic_type.h b/aten/src/ATen/core/dynamic_type.h index 25b75b9e51114..fe4f0b4dfe602 100644 --- a/aten/src/ATen/core/dynamic_type.h +++ b/aten/src/ATen/core/dynamic_type.h @@ -121,7 +121,7 @@ class DynamicType : public SharedType { * A implementation detail to support NamedTuple. */ struct LabeledDynamicType { - c10::optional label; + std::optional label; DynamicTypePtr ty; explicit LabeledDynamicType(DynamicTypePtr t) : ty(std::move(t)) {} @@ -163,7 +163,7 @@ class DynamicType : public SharedType { Tag tag() const { return tag_; } - const c10::optional& name() const { + const std::optional& name() const { return name_; } const Arguments& arguments() const { @@ -200,7 +200,7 @@ class DynamicType : public SharedType { } Tag tag_; - c10::optional name_; + std::optional name_; union { Arguments arguments_; ClassTypePtr class_; diff --git a/aten/src/ATen/core/function.h b/aten/src/ATen/core/function.h index f55e15e50b4fa..01e395bcf6106 100644 --- a/aten/src/ATen/core/function.h +++ b/aten/src/ATen/core/function.h @@ -97,7 +97,7 @@ struct TORCH_API Function { // executor. 
virtual bool call( Stack&, - c10::optional, + std::optional, c10::function_ref) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false); return false; diff --git a/aten/src/ATen/core/function_schema.cpp b/aten/src/ATen/core/function_schema.cpp index 6e119ae25cc72..6f6cc8ed68557 100644 --- a/aten/src/ATen/core/function_schema.cpp +++ b/aten/src/ATen/core/function_schema.cpp @@ -30,7 +30,7 @@ FunctionSchema FunctionSchema::cloneWithRealTypes(bool with_symint) const { // NB: keep this in sync with unpackSymInt in KernelFunction_impl.h if ( *a.real_type() == *getTypePtr() || - *a.real_type() == *getTypePtr>() || + *a.real_type() == *getTypePtr>() || *a.real_type() == *getTypePtr() || *a.real_type() == *getTypePtr() ) { @@ -53,7 +53,7 @@ FunctionSchema FunctionSchema::cloneWithRealTypes(bool with_symint) const { is_varret()); } -bool FunctionSchema::canAliasTypeSetsAlias(const c10::optional &lhs, const c10::optional &rhs) const { +bool FunctionSchema::canAliasTypeSetsAlias(const std::optional &lhs, const c10::optional &rhs) const { if (!lhs || !rhs) { return false; } @@ -67,7 +67,7 @@ bool FunctionSchema::canAliasTypeSetsAlias(const c10::optional &lh return false; } -c10::optional FunctionSchema::getAliasTypeSetContainedTypes(const c10::optional &aliasTypeSet) const { +std::optional FunctionSchema::getAliasTypeSetContainedTypes(const c10::optional &aliasTypeSet) const { if (!aliasTypeSet) { return c10::nullopt; } @@ -95,7 +95,7 @@ c10::optional FunctionSchema::getAliasTypeSetContainedTypes(const return AliasTypeSet(containedTypes.begin(), containedTypes.end()); } -c10::optional FunctionSchema::mapTypeToAliasTypeSet(const TypePtr& type) const { +std::optional FunctionSchema::mapTypeToAliasTypeSet(const TypePtr& type) const { switch(type->kind()) { case TypeKind::ListType: case TypeKind::DictType: @@ -155,8 +155,8 @@ bool FunctionSchema::may_alias(const SchemaArgument& lhs, const SchemaArgument& const Argument lhsArg = getCorrectList(lhs.type)[lhs.index]; const Argument rhsArg = getCorrectList(rhs.type)[rhs.index]; - c10::optional lhsTypes = mapTypeToAliasTypeSet(lhsArg.type()); - c10::optional rhsTypes = mapTypeToAliasTypeSet(rhsArg.type()); + std::optional lhsTypes = mapTypeToAliasTypeSet(lhsArg.type()); + std::optional rhsTypes = mapTypeToAliasTypeSet(rhsArg.type()); // Check to see if lhs and rhs have the same alias set if (canAliasTypeSetsAlias(lhsTypes, rhsTypes)) { @@ -182,10 +182,10 @@ bool FunctionSchema::may_contain_alias(const SchemaArgument& lhs, const SchemaAr const c10::Argument lhsArg = getCorrectList(lhs.type)[lhs.index]; const c10::Argument rhsArg = getCorrectList(rhs.type)[rhs.index]; - c10::optional lhsTypes = mapTypeToAliasTypeSet(lhsArg.type()); - c10::optional rhsTypes = mapTypeToAliasTypeSet(rhsArg.type()); - c10::optional lhsContainedTypes = getAliasTypeSetContainedTypes(lhsTypes); - c10::optional rhsContainedTypes = getAliasTypeSetContainedTypes(rhsTypes); + std::optional lhsTypes = mapTypeToAliasTypeSet(lhsArg.type()); + std::optional rhsTypes = mapTypeToAliasTypeSet(rhsArg.type()); + std::optional lhsContainedTypes = getAliasTypeSetContainedTypes(lhsTypes); + std::optional rhsContainedTypes = getAliasTypeSetContainedTypes(rhsTypes); // Checks if one side is wildcard and the other side is a container of the same type bool lhsWildcard = lhsArg.alias_info() && lhsArg.alias_info()->isWildcardAfter() && canAliasTypeSetsAlias(lhsTypes, rhsContainedTypes); diff --git a/aten/src/ATen/core/function_schema.h b/aten/src/ATen/core/function_schema.h index 79e7ffed1a14f..801bd43c84c01 100644 --- 
a/aten/src/ATen/core/function_schema.h +++ b/aten/src/ATen/core/function_schema.h @@ -29,20 +29,20 @@ struct Argument { Argument( std::string name = "", const TypePtr& type = nullptr, - c10::optional N = c10::nullopt, - c10::optional default_value = c10::nullopt, + std::optional N = c10::nullopt, + std::optional default_value = c10::nullopt, bool kwarg_only = false, - c10::optional alias_info = c10::nullopt) + std::optional alias_info = c10::nullopt) : Argument(std::move(name), type, type, N, std::move(default_value), kwarg_only, std::move(alias_info)) {} Argument( std::string name, TypePtr fake_type, TypePtr real_type, - c10::optional N = c10::nullopt, - c10::optional default_value = c10::nullopt, + std::optional N = c10::nullopt, + std::optional default_value = c10::nullopt, bool kwarg_only = false, - c10::optional alias_info = c10::nullopt) + std::optional alias_info = c10::nullopt) : name_(std::move(name)), type_(fake_type ? std::move(fake_type) : TensorType::get()), real_type_(real_type ? std::move(real_type) : type_), @@ -94,10 +94,10 @@ struct Argument { const TypePtr& real_type() const { return real_type_; } - c10::optional N() const { + std::optional N() const { return N_; } - const c10::optional& default_value() const { + const std::optional& default_value() const { return default_value_; } bool kwarg_only() const { @@ -150,7 +150,7 @@ struct Argument { N_, default_value_, kwarg_only_, - alias_info_ ? c10::optional(*alias_info_) : c10::nullopt); + alias_info_ ? std::optional(*alias_info_) : c10::nullopt); } // this function checks whether this Argument is backward compatible with @@ -179,9 +179,9 @@ struct Argument { // e.g. for int[3]: type = ListType::ofInts(), N = 3 // If present, this will allow scalars to be broadcast to this length to // become a list. - c10::optional N_; + std::optional N_; - c10::optional default_value_; + std::optional default_value_; // AliasInfo is huge, so let's only allocate memory for it if // necessary (which it isn't during schema parsing on startup, to // give a pertinent example). @@ -322,7 +322,7 @@ struct TORCH_API FunctionSchema { // alias information should we infer? // NB: due to alias analysis kind merging, this may be nullopt. Eventually // this should always be set no matter what - c10::optional alias_kind_; + std::optional alias_kind_; template void checkArg(const IValue& value, const Argument& argument, optional pos) const; @@ -395,7 +395,7 @@ struct TORCH_API FunctionSchema { return aliasInfo && aliasInfo->isWrite(); } bool is_mutable(c10::string_view name) const { - c10::optional index = argumentIndexWithName(name); + std::optional index = argumentIndexWithName(name); TORCH_INTERNAL_ASSERT( index != c10::nullopt, "Schema has no argument named ", name); @@ -416,22 +416,22 @@ struct TORCH_API FunctionSchema { // Returns whether the two AliasTypeSets contain any similarities // ie: whether the two type sets can alias. - bool canAliasTypeSetsAlias(const c10::optional &lhs, const c10::optional &rhs) const; + bool canAliasTypeSetsAlias(const std::optional &lhs, const c10::optional &rhs) const; // Recursively Finds all contained types within the AliasTypeSet. - c10::optional getAliasTypeSetContainedTypes(const c10::optional &aliasTypeSet) const; + std::optional getAliasTypeSetContainedTypes(const c10::optional &aliasTypeSet) const; // Similar to mapTypeToAliasTypeSet defined in alias_analysis.cpp. // Used to map types to a type such that all types that can alias will be mapped to the same type. 
// For example, calling this method on 'Optional[List[int]]' is the same as calling this method // on 'List[int]'. - c10::optional mapTypeToAliasTypeSet(const TypePtr& type) const; + std::optional mapTypeToAliasTypeSet(const TypePtr& type) const; // Returns either arguments() or returns() depending on the SchemaArgType // output => returns(), input => arguments() const std::vector& getCorrectList(SchemaArgType type) const; - c10::optional argumentIndexWithName(c10::string_view name) const { + std::optional argumentIndexWithName(c10::string_view name) const { for (const auto i : c10::irange(arguments().size())) { if(name == arguments()[i].name()) return i; @@ -470,8 +470,8 @@ struct TORCH_API FunctionSchema { std::string formatTypeMismatchMsg( const Argument& expected, const std::string& actual_type, - c10::optional position = c10::nullopt, - c10::optional value = c10::nullopt) const; + std::optional position = c10::nullopt, + std::optional value = c10::nullopt) const; FunctionSchema cloneWithRemappedTypes( const std::function type_map) const; @@ -514,7 +514,7 @@ struct TORCH_API FunctionSchema { alias_kind_ = v; } - c10::optional getNamespace() const { + std::optional getNamespace() const { return name_.getNamespace(); } diff --git a/aten/src/ATen/core/function_schema_inl.h b/aten/src/ATen/core/function_schema_inl.h index a6959c661af15..182d7a181cde4 100644 --- a/aten/src/ATen/core/function_schema_inl.h +++ b/aten/src/ATen/core/function_schema_inl.h @@ -162,8 +162,8 @@ inline bool Argument::isForwardCompatibleWith( inline std::string FunctionSchema::formatTypeMismatchMsg( const Argument& expected, const std::string& actual_type, - c10::optional position, - c10::optional value) const { + std::optional position, + std::optional value) const { std::string position_str; if (position) { position_str = c10::str("Position: ", *position, "\n"); diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp index 7343d66fcb97d..6c505f8b656cf 100644 --- a/aten/src/ATen/core/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -471,7 +471,7 @@ bool IValue::isOptionalTensorList() const { return false; } const auto& ty = static_cast(payload.u.as_intrusive_ptr)->elementType; - const auto& expected_ty = c10::getTypePtr>(); + const auto& expected_ty = c10::getTypePtr>(); return expected_ty == ty; } @@ -886,14 +886,14 @@ c10::intrusive_ptr ivalue::Object::create( StrongTypePtr(nullptr, std::move(classType)), numSlots); } -IValue IValue::deepcopy(c10::optional device) const { +IValue IValue::deepcopy(std::optional device) const { IValue::HashAliasedIValueMap memo; return deepcopy(memo, device); } IValue IValue::deepcopy( IValue::HashAliasedIValueMap& memo, - c10::optional device) const { + std::optional device) const { if (memo.count(*this)) { return memo.at(*this); } @@ -1027,14 +1027,14 @@ c10::intrusive_ptr ivalue::Object::copy_to_weak_compilation_ref( } c10::intrusive_ptr ivalue::Object::deepcopy( - c10::optional device) const { + std::optional device) const { IValue::HashAliasedIValueMap memo; return deepcopy(memo, device); } c10::intrusive_ptr ivalue::Object::deepcopy( IValue::HashAliasedIValueMap& memo, - c10::optional device) const { + std::optional device) const { auto cu = type_.cu_; auto object = ivalue::Object::create(WeakOrStrongTypePtr(type_.cu_, type_.type_), type()->numAttributes()); for (const auto i : c10::irange(slots_.size())) { diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index 07e85677c3c75..7715ffbe3c31d 100644 --- a/aten/src/ATen/core/ivalue.h +++ 
b/aten/src/ATen/core/ivalue.h @@ -86,20 +86,20 @@ struct StreamData3Holder : c10::intrusive_ptr_target { } // namespace ivalue -// This is an owning wrapper for a c10::optional> +// This is an owning wrapper for a std::optional> // that can be implicitly converted to a (non-owning) optional>. // Its purpose is to be used in generated code to keep the vector alive // either until the end of a statement (as a temporary), or as a saved arg // in autograd. template struct OptionalArray { - c10::optional> list; + std::optional> list; OptionalArray() = default; OptionalArray(std::vector val) : list(std::move(val)) {} // Used when saving an argument for the backwards pass. - OptionalArray& operator=(c10::optional> ref) { + OptionalArray& operator=(std::optional> ref) { if (ref) { list = std::vector(ref->begin(), ref->end()); } else { @@ -118,7 +118,7 @@ struct OptionalArray { return *this; } - operator c10::optional>() { + operator std::optional>() { if (!list) { return nullopt; } @@ -697,7 +697,7 @@ struct TORCH_API IValue final { c10::intrusive_ptr toString() &&; c10::intrusive_ptr toString() const&; const std::string& toStringRef() const; - c10::optional> toOptionalStringRef() + std::optional> toOptionalStringRef() const; c10::string_view toStringView() const; @@ -726,9 +726,9 @@ struct TORCH_API IValue final { // OptionalTensorList bool isOptionalTensorList() const; - c10::List> toOptionalTensorList() &&; - c10::List> toOptionalTensorList() const&; - std::vector> toOptionalTensorVector() const; + c10::List> toOptionalTensorList() &&; + c10::List> toOptionalTensorList() const&; + std::vector> toOptionalTensorVector() const; // GenericList IValue(c10::List v); @@ -817,7 +817,7 @@ struct TORCH_API IValue final { IValue(std::unordered_map v); template = nullptr> - IValue(c10::optional v); + IValue(std::optional v); template = nullptr> IValue(c10::OptionalArrayRef v); IValue(c10::nullopt_t); @@ -1128,10 +1128,10 @@ struct TORCH_API IValue final { // TODO: There are several places that recurse over IValue. This is fragile. // This visitor should be used to recurse over ivalues. void visit(const std::function& visitor) const; - IValue deepcopy(c10::optional device = c10::nullopt) const; + IValue deepcopy(std::optional device = c10::nullopt) const; IValue deepcopy( HashAliasedIValueMap& memo, - c10::optional device = c10::nullopt) const; + std::optional device = c10::nullopt) const; private: static c10::intrusive_ptr_target* null_to_undefined_tensor( @@ -1530,8 +1530,8 @@ struct WeakOrStrongCompilationUnit { return holdingStrongRef() && *strong_ptr_ == nullptr; } - c10::optional> strong_ptr_; - c10::optional> weak_ptr_; + std::optional> strong_ptr_; + std::optional> weak_ptr_; }; // An Object will hold a non-owning Compilation Unit reference if it is a diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h index 3e3525c274118..b1124c12cfb34 100644 --- a/aten/src/ATen/core/ivalue_inl.h +++ b/aten/src/ATen/core/ivalue_inl.h @@ -909,7 +909,7 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target { using WeakStorage = c10::weak_intrusive_ptr; void markCompleted( IValue value, - c10::optional> storages = c10::nullopt) { + std::optional> storages = c10::nullopt) { // Start by performing all steps that can throw, before setting any field. // Do this before even acquiring the mutex, because extractStorages might // acquire the GIL, which could lead to a lock inversion with our mutex. 
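The OptionalArray comment in the ivalue.h hunk above describes an owning std::optional<std::vector<T>> that can hand out a non-owning optional view. A simplified sketch of that pattern — Span and OwningOptionalArray are hypothetical stand-ins for c10::ArrayRef and the real c10::OptionalArray, trimmed down to show only the ownership/view round trip:

#include <cstddef>
#include <iostream>
#include <optional>
#include <vector>

template <class T>
struct Span {  // non-owning view, analogous to ArrayRef
  const T* data = nullptr;
  std::size_t size = 0;
};

template <class T>
struct OwningOptionalArray {
  std::optional<std::vector<T>> list;

  OwningOptionalArray() = default;
  explicit OwningOptionalArray(std::vector<T> val) : list(std::move(val)) {}

  // Copy a non-owning view into owned storage (the "saved arg" direction).
  OwningOptionalArray& operator=(std::optional<Span<T>> ref) {
    if (ref) {
      list = std::vector<T>(ref->data, ref->data + ref->size);
    } else {
      list = std::nullopt;
    }
    return *this;
  }

  // Hand out a non-owning view over the owned storage.
  operator std::optional<Span<T>>() const {
    if (!list) {
      return std::nullopt;
    }
    return Span<T>{list->data(), list->size()};
  }
};

int main() {
  OwningOptionalArray<int> arr(std::vector<int>{1, 2, 3});
  std::optional<Span<int>> view = arr;  // non-owning, valid while arr lives
  std::cout << (view ? view->size : 0) << '\n';  // prints: 3
  return 0;
}

Keeping the owning optional inside the wrapper is what lets generated code bind a temporary view for the duration of a statement, or stash the values as a saved autograd argument, as the comment above states.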
@@ -1586,11 +1586,11 @@ struct C10_EXPORT ivalue::Object final : c10::intrusive_ptr_target { c10::intrusive_ptr copy() const; c10::intrusive_ptr deepcopy( - c10::optional device = c10::nullopt) const; + std::optional device = c10::nullopt) const; c10::intrusive_ptr deepcopy( IValue::HashAliasedIValueMap& memo, - c10::optional device = c10::nullopt) const; + std::optional device = c10::nullopt) const; bool is_weak_compilation_ref() const { return !type_.holds_strong_ref(); @@ -1613,7 +1613,7 @@ struct ivalue::PyObjectHolder : c10::intrusive_ptr_target { public: virtual PyObject* getPyObject() = 0; virtual c10::InferredType tryToInferType() = 0; - virtual IValue toIValue(const TypePtr& type, c10::optional N = c10::nullopt) = 0; + virtual IValue toIValue(const TypePtr& type, std::optional N = c10::nullopt) = 0; virtual std::string toStr() = 0; virtual std::vector extractTensors() = 0; @@ -1909,7 +1909,7 @@ std::unordered_map generic_to( } template -c10::optional generic_to(IValue ivalue, _fake_type>) { +std::optional generic_to(IValue ivalue, _fake_type>) { if (ivalue.isNone()) { return c10::nullopt; } @@ -1946,11 +1946,11 @@ inline T IValue::to() && { } template <> -inline c10::optional IValue::to() && { +inline std::optional IValue::to() && { // In the default implementation, the IValue is destroyed with std::move. // But if the unboxed type is optional we cannot destroy // the IValue. - return generic_to(*this, _fake_type>{}); + return generic_to(*this, _fake_type>{}); } template @@ -2046,20 +2046,20 @@ inline std::vector IValue::toTensorVector() const { return createVectorFromList( static_cast(payload.u.as_intrusive_ptr)); } -inline c10::List> IValue::toOptionalTensorList() && { +inline c10::List> IValue::toOptionalTensorList() && { AT_ASSERT(isOptionalTensorList(), "Expected OptionalTensorList but got ", tagKind()); - return c10::List>(moveToIntrusivePtr()); + return c10::List>(moveToIntrusivePtr()); } -inline c10::List> IValue::toOptionalTensorList() const& { +inline c10::List> IValue::toOptionalTensorList() const& { AT_ASSERT(isOptionalTensorList(), "Expected OptionalTensorList but got ", tagKind()); - return c10::List>(toIntrusivePtr()); + return c10::List>(toIntrusivePtr()); } -inline std::vector> IValue::toOptionalTensorVector() const { +inline std::vector> IValue::toOptionalTensorVector() const { AT_ASSERT(isOptionalTensorList(), "Expected OptionalTensorList but got ", tagKind()); TORCH_INTERNAL_ASSERT_DEBUG_ONLY( payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(), "called toOptionalTensorVector on null intrusive_ptr IValue"); - return createVectorFromList>( + return createVectorFromList>( static_cast(payload.u.as_intrusive_ptr)); } inline c10::List IValue::toList() && { @@ -2274,7 +2274,7 @@ inline IValue::IValue(std::unordered_map v) } template > -inline IValue::IValue(c10::optional v) : IValue() { +inline IValue::IValue(std::optional v) : IValue() { if (v.has_value()) { *this = IValue(std::move(*v)); } @@ -2360,7 +2360,7 @@ inline const std::string& IValue::toStringRef() const { payload.u.as_intrusive_ptr) ->string(); } -inline c10::optional> IValue:: +inline std::optional> IValue:: toOptionalStringRef() const { if (isNone()) { return c10::nullopt; diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index 05f7242855417..be4414e8fe5b0 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -32,7 +32,7 @@ class Dict; struct IValue; struct FunctionSchema; struct NamedType; -using OptNameList = c10::optional>; +using 
OptNameList = std::optional>; void standardizeVectorForUnion(std::vector& reference, std::vector* to_fill); void standardizeVectorForUnion(std::vector* to_flatten); @@ -164,9 +164,9 @@ struct TORCH_API UnionType : public SharedType { return has_free_variables_; } - c10::optional toOptional() const; + std::optional toOptional() const; - c10::optional subtractTypeSet(std::vector& to_subtract) const; + std::optional subtractTypeSet(std::vector& to_subtract) const; protected: explicit UnionType(std::vector types, TypeKind kind=TypeKind::UnionType); @@ -247,13 +247,13 @@ struct TORCH_API OptionalType : public UnionType { }; template -inline c10::optional merge_primitive( - const c10::optional& a, - const c10::optional& b) { +inline std::optional merge_primitive( + const std::optional& a, + const std::optional& b) { if (a.has_value() && b.has_value() && a.value() == b.value()) { return a; } - return c10::optional{}; + return std::optional{}; } // If we see `a + b + c` and know that a, b, and c are the same size and have @@ -274,9 +274,9 @@ inline c10::optional merge_primitive( struct TORCH_API Stride { Stride() = default; Stride( - const c10::optional& stride_index, - c10::optional contiguous, - const c10::optional& stride) + const std::optional& stride_index, + std::optional contiguous, + const std::optional& stride) : stride_index_(stride_index), contiguous_(contiguous), stride_(stride) {} bool operator==(const Stride& b) const { @@ -288,17 +288,17 @@ struct TORCH_API Stride { return stride_index_ && contiguous_ && stride_; } - c10::optional stride_index_; - c10::optional contiguous_; - c10::optional stride_; + std::optional stride_index_; + std::optional contiguous_; + std::optional stride_; }; template <> -inline c10::optional merge_primitive( - const c10::optional& a, - const c10::optional& b) { - c10::optional left = a; - c10::optional right = b; +inline std::optional merge_primitive( + const std::optional& a, + const std::optional& b) { + std::optional left = a; + std::optional right = b; if (!left.has_value()) { left = {Stride()}; } @@ -314,7 +314,7 @@ inline c10::optional merge_primitive( // normalize if (!r.stride_index_.has_value() && !r.contiguous_.has_value() && !r.stride_.has_value()) { - return c10::optional{}; + return std::optional{}; } return r; @@ -375,7 +375,7 @@ struct TORCH_API SymbolicShape { SymbolicShape() : dims_(c10::nullopt) {} // Known rank but unknown dimentions. - SymbolicShape(c10::optional rank) : dims_(c10::nullopt) { + SymbolicShape(std::optional rank) : dims_(c10::nullopt) { if(!rank) { return; } @@ -389,10 +389,10 @@ struct TORCH_API SymbolicShape { } // Mix of known and unknown ranks - SymbolicShape(const std::vector>& dims) { + SymbolicShape(const std::vector>& dims) { std::vector shape_symbols; shape_symbols.reserve(dims.size()); - for(c10::optional dim: dims) { + for(std::optional dim: dims) { if(!dim) { shape_symbols.push_back(ShapeSymbol::newSymbol()); } else { @@ -430,18 +430,18 @@ struct TORCH_API SymbolicShape { } // Returns rank or nullopt in case of unranked shape. 
- c10::optional rank() const { + std::optional rank() const { if(!dims_) { return c10::nullopt; } return dims_->size(); } - c10::optional> sizes() const { + std::optional> sizes() const { return dims_; } - c10::optional> symbolicDims() const { + std::optional> symbolicDims() const { if (!dims_) { return c10::nullopt; } @@ -482,7 +482,7 @@ struct TORCH_API SymbolicShape { } private: - c10::optional> dims_; + std::optional> dims_; }; namespace detail { @@ -498,14 +498,14 @@ inline bool isComplete(const T& /*t*/) { template struct VaryingShape { - using ListOfOptionalElements = std::vector>; + using ListOfOptionalElements = std::vector>; VaryingShape(const std::vector& vec) : VaryingShape(ListOfOptionalElements(vec.begin(), vec.end())) {} VaryingShape(c10::ArrayRef vec) : VaryingShape(ListOfOptionalElements(vec.begin(), vec.end())) {} - VaryingShape(c10::optional size = c10::nullopt) : dims_(c10::nullopt) { + VaryingShape(std::optional size = c10::nullopt) : dims_(c10::nullopt) { if (size) { dims_ = ListOfOptionalElements(*size); } @@ -513,20 +513,20 @@ struct VaryingShape { VaryingShape(ListOfOptionalElements dims) : dims_(std::move(dims)) {} - VaryingShape(size_t size) : VaryingShape(c10::optional(size)) {} + VaryingShape(size_t size) : VaryingShape(std::optional(size)) {} bool operator==(const VaryingShape& other) const { return dims_ == other.dims_; } - const c10::optional &operator[](size_t i) const { + const std::optional &operator[](size_t i) const { if (!dims_) { throw std::runtime_error("Rank isn't fixed"); } return (*dims_).at(i); } - c10::optional size() const { + std::optional size() const { if (!dims_) { return c10::nullopt; } @@ -534,13 +534,13 @@ struct VaryingShape { return dims.size(); } - const c10::optional& sizes() const { + const std::optional& sizes() const { return dims_; } TORCH_API VaryingShape merge(const VaryingShape& other) const; - c10::optional> concrete_sizes() const { + std::optional> concrete_sizes() const { if (!dims_) { return c10::nullopt; } @@ -568,7 +568,7 @@ struct VaryingShape { } private: - c10::optional dims_; + std::optional dims_; }; struct TensorType; @@ -581,27 +581,27 @@ struct TORCH_API TensorType : public SharedType { // used by TensorType::create(size_t dim) which in turn used by // shape_analysis.cpp static TensorTypePtr create( - c10::optional scalar_type, - c10::optional device, + std::optional scalar_type, + std::optional device, const VaryingShape& sizes, const VaryingShape& strides, - c10::optional requires_grad, - c10::optional undefined = false, + std::optional requires_grad, + std::optional undefined = false, bool tensor_contiguity = false); static TensorTypePtr create( - c10::optional scalar_type, - c10::optional device, + std::optional scalar_type, + std::optional device, const SymbolicShape& sizes, const VaryingShape& stride_, - c10::optional requires_grad, - c10::optional undefined = false); + std::optional requires_grad, + std::optional undefined = false); static TensorTypePtr create( - c10::optional scalar_type, - c10::optional device, - c10::optional dim, - c10::optional requires_grad); + std::optional scalar_type, + std::optional device, + std::optional dim, + std::optional requires_grad); // overloaded create variadic template argument as it could not distinguish // initializer list @@ -613,7 +613,7 @@ struct TORCH_API TensorType : public SharedType { static TypePtr fromNumberType(const Type& typ); static TypePtr fromBoolType(); - c10::optional dim() const { + std::optional dim() const { return sizes().size(); } @@ -625,13 
+625,13 @@ struct TORCH_API TensorType : public SharedType { return strides_; } - c10::optional device() const { + std::optional device() const { return device_; } - c10::optional scalarType() const { + std::optional scalarType() const { return scalar_type_; } - c10::optional requiresGrad() const { + std::optional requiresGrad() const { return requires_grad_; } bool requires_grad() const override { @@ -651,32 +651,32 @@ struct TORCH_API TensorType : public SharedType { } } - c10::optional numel() const { + std::optional numel() const { size_t prod = 1; const auto& shape = sizes(); for (size_t i = 0; i < shape.size(); i++) { if (!shape[i]) { - return c10::optional{}; + return std::optional{}; } prod *= shape[i].value(); } return prod; } - TensorTypePtr withRequiresGrad(c10::optional s) { + TensorTypePtr withRequiresGrad(std::optional s) { auto copy = clone(); copy->requires_grad_ = s; return copy; } - TensorTypePtr withScalarType(c10::optional st) { + TensorTypePtr withScalarType(std::optional st) { auto copy = clone(); copy->scalar_type_ = st; return copy; } - TensorTypePtr withDim(c10::optional d) { + TensorTypePtr withDim(std::optional d) { auto copy = clone(); // withDim is only used by the legacy executor // that only cares about the rank, so create dummy symbols)) : @@ -712,7 +712,7 @@ struct TORCH_API TensorType : public SharedType { sizes, contiguousStridesOf(sizes)); } - TensorTypePtr withDevice(const c10::optional device) const { + TensorTypePtr withDevice(const std::optional device) const { auto copy = clone(); copy->device_ = device; return copy; @@ -784,7 +784,7 @@ struct TORCH_API TensorType : public SharedType { return r; } - c10::optional undefined() const { return undefined_; } + std::optional undefined() const { return undefined_; } static const TensorTypePtr& get(); @@ -824,12 +824,12 @@ struct TORCH_API TensorType : public SharedType { private: TensorType( - c10::optional scalar_type, - c10::optional device, + std::optional scalar_type, + std::optional device, SymbolicShape sizes, VaryingShape strides, - c10::optional requires_grad, - c10::optional undefined = false); + std::optional requires_grad, + std::optional undefined = false); TensorTypePtr clone() const { return TensorTypePtr(new TensorType( @@ -841,11 +841,11 @@ struct TORCH_API TensorType : public SharedType { at::IntArrayRef strides, bool tensor_contiguity = false); - c10::optional scalar_type_; - c10::optional device_; + std::optional scalar_type_; + std::optional device_; SymbolicShape sizes_; VaryingShape strides_; - c10::optional requires_grad_; + std::optional requires_grad_; // we exploit the fact certain tensors must be zero in the autograd to // optimize gradient computation. Such zero tensors are currently implemented // with `UndefinedTensorImpl.` They can be handled only by special operators @@ -857,7 +857,7 @@ struct TORCH_API TensorType : public SharedType { // undefined_ may become `c10::nullopt` if the tensor was observed to be both // defined and undefined. However, no tensor type starts out with // `undefined_` set to `c10::nullopt` - c10::optional undefined_; + std::optional undefined_; // Represents whether or not this type was inferred. 
bool is_inferred_ = false; }; @@ -1144,16 +1144,16 @@ using NameList = std::vector; // This type represents a Tuple struct TORCH_API TupleType : public NamedType { - static TupleTypePtr createNamed(const c10::optional& name, + static TupleTypePtr createNamed(const std::optional& name, const std::vector& field_names, const std::vector& field_types, std::vector& field_defaults); - static TupleTypePtr createNamed(const c10::optional& name, + static TupleTypePtr createNamed(const std::optional& name, const std::vector& field_names, const std::vector& field_types); - static TupleTypePtr createNamed(const c10::optional& name, + static TupleTypePtr createNamed(const std::optional& name, const std::vector& field_names, const std::vector& field_types); @@ -1190,21 +1190,21 @@ struct TORCH_API TupleType : public NamedType { const std::shared_ptr& schema() const { return schema_; } - c10::optional> names() const; + std::optional> names() const; static const TypeKind Kind = TypeKind::TupleType; private: template static TupleTypePtr createWithSpec( - const c10::optional& name, + const std::optional& name, const std::vector& field_names, const std::vector& field_types, std::vector& field_defaults); TupleType( std::vector elements_, - c10::optional name, + std::optional name, std::shared_ptr schema); bool compare( @@ -1747,7 +1747,7 @@ inline TypePtr TensorType::fromBoolType() { return TensorType::createContiguous(at::kBool, at::kCPU, {}); } -inline c10::optional tryScalarTypeFromJitType(const Type& type) { +inline std::optional tryScalarTypeFromJitType(const Type& type) { if (type == *FloatType::get()) { return at::typeMetaToScalarType(c10::get_default_dtype()); } else if (type == *IntType::get()) { @@ -1782,13 +1782,13 @@ inline at::ScalarType scalarTypeFromJitType(const Type& type) { // If `type_hint` is an `InterfaceType`, then we can use that as a // potential supertype for `ClassType`s in the list. Otherwise, we have // no way to find and use some common interface type -TORCH_API c10::optional unifyTypes( +TORCH_API std::optional unifyTypes( const TypePtr& t1, const TypePtr& t2, bool default_to_union = false, const TypePtr& type_hint = nullptr); -TORCH_API c10::optional unifyTypeList( +TORCH_API std::optional unifyTypeList( at::ArrayRef elements, std::ostream& why_not, bool default_to_union = false, @@ -2132,7 +2132,7 @@ struct MatchTypeReturn { private: MatchTypeReturn() : reason_(c10::nullopt) {} - c10::optional reason_; // is there is no match, this contains the reason + std::optional reason_; // is there is no match, this contains the reason }; // attempt to match the type variables in formal to actual, adding them to type_env. diff --git a/aten/src/ATen/core/jit_type_base.h b/aten/src/ATen/core/jit_type_base.h index 21692db56dd87..ac2cb0528245c 100644 --- a/aten/src/ATen/core/jit_type_base.h +++ b/aten/src/ATen/core/jit_type_base.h @@ -75,7 +75,7 @@ struct SharedType; // Use this to customize how a Type is printed using `annotation_str()`. If // c10::nullopt is returned, `annotation_str()` falls through to its default // implementation. 
-using TypePrinter = std::function(const Type&)>; +using TypePrinter = std::function(const Type&)>; namespace detail { template @@ -688,7 +688,7 @@ using NamedTypePtr = std::shared_ptr; using ConstNamedTypePtr = std::shared_ptr; struct TORCH_API NamedType : public SharedType { - NamedType(TypeKind tk, c10::optional name) + NamedType(TypeKind tk, std::optional name) : SharedType(tk), name_(std::move(name)) { TORCH_INTERNAL_ASSERT( tk == TypeKind::TupleType || tk == TypeKind::FunctionType || @@ -700,12 +700,12 @@ struct TORCH_API NamedType : public SharedType { // Fully qualified name of type // Looks like: "foo.bar.Baz". - const c10::optional& name() const { + const std::optional& name() const { return name_; } private: - c10::optional name_; + std::optional name_; }; } // namespace c10 diff --git a/aten/src/ATen/core/library.cpp b/aten/src/ATen/core/library.cpp index fd349da2f8b0c..6a910d7b60a57 100644 --- a/aten/src/ATen/core/library.cpp +++ b/aten/src/ATen/core/library.cpp @@ -42,7 +42,7 @@ namespace { constexpr auto CatchAll = c10::DispatchKey::CatchAll; } // anonymous namespace -CppFunction::CppFunction(c10::KernelFunction func, c10::optional cpp_signature, std::unique_ptr schema) +CppFunction::CppFunction(c10::KernelFunction func, std::optional cpp_signature, std::unique_ptr schema) : func_(std::move(func)) , cpp_signature_(cpp_signature) , schema_(std::move(schema)) @@ -57,10 +57,10 @@ void Library::reset() { #define ERROR_CONTEXT "(Error occurred while processing ", toString(kind_), " block at ", file_, ":", line_, ")" -Library::Library(Kind kind, std::string ns, c10::optional k, const char* file, uint32_t line) +Library::Library(Kind kind, std::string ns, std::optional k, const char* file, uint32_t line) : kind_(kind) , ns_(ns == "_" ? c10::nullopt : c10::make_optional(std::move(ns))) - , dispatch_key_(k.value_or(CatchAll) == CatchAll ? c10::optional() : k) + , dispatch_key_(k.value_or(CatchAll) == CatchAll ? 
std::optional() : k) , file_(file) , line_(line) { diff --git a/aten/src/ATen/core/op_registration/infer_schema.cpp b/aten/src/ATen/core/op_registration/infer_schema.cpp index 7e0fd28f9a7b1..e280bb140220b 100644 --- a/aten/src/ATen/core/op_registration/infer_schema.cpp +++ b/aten/src/ATen/core/op_registration/infer_schema.cpp @@ -43,7 +43,7 @@ FunctionSchema make_function_schema( } // namespace infer_schema } // namespace detail -c10::optional findSchemaDifferences( +std::optional findSchemaDifferences( const FunctionSchema& lhs, const FunctionSchema& rhs) { if (lhs.arguments().size() != rhs.arguments().size()) { diff --git a/aten/src/ATen/core/op_registration/infer_schema.h b/aten/src/ATen/core/op_registration/infer_schema.h index 57409442950f2..2f845f7c4c10f 100644 --- a/aten/src/ATen/core/op_registration/infer_schema.h +++ b/aten/src/ATen/core/op_registration/infer_schema.h @@ -155,6 +155,6 @@ FunctionSchema inferFunctionSchemaSingleReturn(std::string&& name, std::string&& return detail::infer_schema::createFunctionSchemaFromTraitsSingleReturn>(std::move(name), std::move(overload_name)); } -TORCH_API c10::optional findSchemaDifferences(const FunctionSchema& inferred, const FunctionSchema& specified); +TORCH_API std::optional findSchemaDifferences(const FunctionSchema& inferred, const FunctionSchema& specified); } diff --git a/aten/src/ATen/core/op_registration/op_registration.cpp b/aten/src/ATen/core/op_registration/op_registration.cpp index 8a516e68bd0dc..0a64e0f44d7e5 100644 --- a/aten/src/ATen/core/op_registration/op_registration.cpp +++ b/aten/src/ATen/core/op_registration/op_registration.cpp @@ -17,9 +17,9 @@ void build_feature_required_feature_not_available(const char* feature) { } // namespace impl static_assert(std::is_nothrow_move_constructible< - c10::optional>::value); + std::optional>::value); static_assert(std::is_nothrow_move_assignable< - c10::optional>::value); + std::optional>::value); void RegisterOperators::checkSchemaAndRegisterOp_(Options&& options) { TORCH_CHECK( @@ -71,7 +71,7 @@ c10::FunctionSchema RegisterOperators::inferSchemaFromKernels_( opName, " because there is no kernel specified."); - c10::optional inferred_schema = c10::nullopt; + std::optional inferred_schema = c10::nullopt; for (const auto& kernel : options.kernels) { if (nullptr != kernel.inferred_function_schema.get()) { if (!inferred_schema.has_value()) { diff --git a/aten/src/ATen/core/op_registration/op_registration.h b/aten/src/ATen/core/op_registration/op_registration.h index 0b083dc6b6759..b1b1e2c47bc45 100644 --- a/aten/src/ATen/core/op_registration/op_registration.h +++ b/aten/src/ATen/core/op_registration/op_registration.h @@ -399,7 +399,7 @@ class TORCH_API RegisterOperators final { } private: - Options&& kernel(c10::optional dispatch_key, KernelFunction&& func, c10::optional cpp_signature, std::unique_ptr&& inferred_function_schema) && { + Options&& kernel(std::optional dispatch_key, KernelFunction&& func, c10::optional cpp_signature, std::unique_ptr&& inferred_function_schema) && { KernelRegistrationConfig config; config.dispatch_key = dispatch_key; config.func = std::move(func); @@ -425,13 +425,13 @@ class TORCH_API RegisterOperators final { , inferred_function_schema(nullptr) {} - c10::optional dispatch_key; + std::optional dispatch_key; KernelFunction func; - c10::optional cpp_signature; + std::optional cpp_signature; std::unique_ptr inferred_function_schema; }; - c10::optional> schemaOrName_; + std::optional> schemaOrName_; std::vector kernels; optional aliasAnalysisKind_; diff --git 
a/aten/src/ATen/core/op_registration/op_registration_test.cpp b/aten/src/ATen/core/op_registration/op_registration_test.cpp index 377cb403cdcfd..d1305ac6d9491 100644 --- a/aten/src/ATen/core/op_registration/op_registration_test.cpp +++ b/aten/src/ATen/core/op_registration/op_registration_test.cpp @@ -882,56 +882,56 @@ TEST(OperatorRegistrationTest, testAvailableArgTypes) { // optional types (with has_value() == true) - testArgTypes>::test( - c10::optional(1.5), [] (const c10::optional& v) {EXPECT_EQ(1.5, v.value());}, - c10::optional(2.5), [] (const IValue& v) {EXPECT_EQ(2.5, v.toDouble());}, + testArgTypes>::test( + std::optional(1.5), [] (const c10::optional& v) {EXPECT_EQ(1.5, v.value());}, + std::optional(2.5), [] (const IValue& v) {EXPECT_EQ(2.5, v.toDouble());}, "(float? a) -> float?"); - testArgTypes>::test( - c10::optional(1), [] (const c10::optional& v) {EXPECT_EQ(1, v.value());}, - c10::optional(2), [] (const IValue& v) {EXPECT_EQ(2, v.toInt());}, + testArgTypes>::test( + std::optional(1), [] (const c10::optional& v) {EXPECT_EQ(1, v.value());}, + std::optional(2), [] (const IValue& v) {EXPECT_EQ(2, v.toInt());}, "(int? a) -> int?"); - testArgTypes>::test( - c10::optional(true), [] (const c10::optional& v) {EXPECT_EQ(true, v.value());}, - c10::optional(false), [] (const IValue& v) {EXPECT_EQ(false, v.toBool());}, + testArgTypes>::test( + std::optional(true), [] (const c10::optional& v) {EXPECT_EQ(true, v.value());}, + std::optional(false), [] (const IValue& v) {EXPECT_EQ(false, v.toBool());}, "(bool? a) -> bool?"); - testArgTypes>::test( - c10::optional(false), [] (const c10::optional& v) {EXPECT_EQ(false, v.value());}, - c10::optional(true), [] (const IValue& v) {EXPECT_EQ(true, v.toBool());}, + testArgTypes>::test( + std::optional(false), [] (const c10::optional& v) {EXPECT_EQ(false, v.value());}, + std::optional(true), [] (const IValue& v) {EXPECT_EQ(true, v.toBool());}, "(bool? a) -> bool?"); - testArgTypes>::test( - c10::optional("string1"), [] (const c10::optional& v) {EXPECT_EQ("string1", v.value());}, - c10::optional("string2"), [] (const IValue& v) {EXPECT_EQ("string2", v.toStringRef());}, + testArgTypes>::test( + std::optional("string1"), [] (const c10::optional& v) {EXPECT_EQ("string1", v.value());}, + std::optional("string2"), [] (const IValue& v) {EXPECT_EQ("string2", v.toStringRef());}, "(str? a) -> str?"); - testArgTypes>::test( - c10::optional(dummyTensor(c10::DispatchKey::CPU)), [] (const c10::optional& v) {EXPECT_EQ(c10::DispatchKey::CPU, extractDispatchKey(v.value()));}, - c10::optional(dummyTensor(c10::DispatchKey::CUDA)), [] (const IValue& v) {EXPECT_EQ(c10::DispatchKey::CUDA, extractDispatchKey(v.toTensor()));}, + testArgTypes>::test( + std::optional(dummyTensor(c10::DispatchKey::CPU)), [] (const c10::optional& v) {EXPECT_EQ(c10::DispatchKey::CPU, extractDispatchKey(v.value()));}, + std::optional(dummyTensor(c10::DispatchKey::CUDA)), [] (const IValue& v) {EXPECT_EQ(c10::DispatchKey::CUDA, extractDispatchKey(v.toTensor()));}, "(Tensor? a) -> Tensor?"); // optional types (with has_value() == false) - testArgTypes>::test( - c10::optional(c10::nullopt), [] (const c10::optional& v) {EXPECT_FALSE(v.has_value());}, - c10::optional(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());}, + testArgTypes>::test( + std::optional(c10::nullopt), [] (const c10::optional& v) {EXPECT_FALSE(v.has_value());}, + std::optional(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());}, "(float? 
a) -> float?"); - testArgTypes>::test( - c10::optional(c10::nullopt), [] (const c10::optional& v) {EXPECT_FALSE(v.has_value());}, - c10::optional(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());}, + testArgTypes>::test( + std::optional(c10::nullopt), [] (const c10::optional& v) {EXPECT_FALSE(v.has_value());}, + std::optional(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());}, "(int? a) -> int?"); - testArgTypes>::test( - c10::optional(c10::nullopt), [] (const c10::optional& v) {EXPECT_FALSE(v.has_value());}, - c10::optional(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());}, + testArgTypes>::test( + std::optional(c10::nullopt), [] (const c10::optional& v) {EXPECT_FALSE(v.has_value());}, + std::optional(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());}, "(bool? a) -> bool?"); - testArgTypes>::test( - c10::optional(c10::nullopt), [] (const c10::optional& v) {EXPECT_FALSE(v.has_value());}, - c10::optional(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());}, + testArgTypes>::test( + std::optional(c10::nullopt), [] (const c10::optional& v) {EXPECT_FALSE(v.has_value());}, + std::optional(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());}, "(bool? a) -> bool?"); - testArgTypes>::test( - c10::optional(c10::nullopt), [] (const c10::optional& v) {EXPECT_FALSE(v.has_value());}, - c10::optional(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());}, + testArgTypes>::test( + std::optional(c10::nullopt), [] (const c10::optional& v) {EXPECT_FALSE(v.has_value());}, + std::optional(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());}, "(str? a) -> str?"); - testArgTypes>::test( - c10::optional(c10::nullopt), [] (const c10::optional& v) {EXPECT_FALSE(v.has_value());}, - c10::optional(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());}, + testArgTypes>::test( + std::optional(c10::nullopt), [] (const c10::optional& v) {EXPECT_FALSE(v.has_value());}, + std::optional(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());}, "(Tensor? a) -> Tensor?"); @@ -1136,21 +1136,21 @@ TEST(OperatorRegistrationTest, testAvailableArgTypes) { "(Tensor[] a) -> Tensor[]"); // Test optional of list (with nullopt) - testArgTypes>>::test( - c10::optional>(c10::nullopt), [] (const c10::optional>& v) {EXPECT_FALSE(v.has_value());}, - c10::optional>(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());}, + testArgTypes>>::test( + std::optional>(c10::nullopt), [] (const c10::optional>& v) {EXPECT_FALSE(v.has_value());}, + std::optional>(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());}, "(int[]? a) -> int[]?"); // Test optional of list (with empty list) - testArgTypes>>::test( - c10::optional>(c10::List({})), [] (const c10::optional>& v) {EXPECT_EQ(0, v.value().size());}, - c10::optional>(c10::List({})), [] (const IValue& v) {EXPECT_EQ(0, v.to>().size());}, + testArgTypes>>::test( + std::optional>(c10::List({})), [] (const c10::optional>& v) {EXPECT_EQ(0, v.value().size());}, + std::optional>(c10::List({})), [] (const IValue& v) {EXPECT_EQ(0, v.to>().size());}, "(int[]? 
a) -> int[]?"); // Test optional of list (with values) - testArgTypes>>::test( - c10::optional>(c10::List({1, 2})), [] (const c10::optional>& v) {expectListEquals({1, 2}, v.value());}, - c10::optional>(c10::List({3, 4})), [] (const IValue& v) {expectListEquals({3, 4}, v.to>());}, + testArgTypes>>::test( + std::optional>(c10::List({1, 2})), [] (const c10::optional>& v) {expectListEquals({1, 2}, v.value());}, + std::optional>(c10::List({3, 4})), [] (const IValue& v) {expectListEquals({3, 4}, v.to>());}, "(int[]? a) -> int[]?"); // Test list of optional (with empty list) @@ -1161,8 +1161,8 @@ TEST(OperatorRegistrationTest, testAvailableArgTypes) { // Test list of optional (with values) testArgTypes>>::test( - c10::List<::std::optional>(c10::List<::std::optional>({3, c10::nullopt, 2})), [] (const c10::List<::std::optional>& v) {expectListEquals>({3, c10::nullopt, 2}, v);}, - c10::List<::std::optional>(c10::List<::std::optional>({3, c10::nullopt, 2})), [] (const IValue& v) {expectListEquals>({3, c10::nullopt, 2}, v.to>>());}, + c10::List<::std::optional>(c10::List<::std::optional>({3, c10::nullopt, 2})), [] (const c10::List<::std::optional>& v) {expectListEquals>({3, c10::nullopt, 2}, v);}, + c10::List<::std::optional>(c10::List<::std::optional>({3, c10::nullopt, 2})), [] (const IValue& v) {expectListEquals>({3, c10::nullopt, 2}, v.to>>());}, "(int?[] a) -> int?[]"); // dict types diff --git a/aten/src/ATen/core/operator_name.h b/aten/src/ATen/core/operator_name.h index 6440a695b55ec..5ba01b4a7df58 100644 --- a/aten/src/ATen/core/operator_name.h +++ b/aten/src/ATen/core/operator_name.h @@ -23,7 +23,7 @@ struct OperatorName final { // Return the namespace of this OperatorName, if it exists. The // returned string_view is only live as long as the OperatorName // exists and name is not mutated - c10::optional getNamespace() const { + std::optional getNamespace() const { auto pos = name.find("::"); if (pos == std::string::npos) { return c10::nullopt; diff --git a/aten/src/ATen/core/tensor_type.cpp b/aten/src/ATen/core/tensor_type.cpp index c7f8c8b05f91e..9110b4261d396 100644 --- a/aten/src/ATen/core/tensor_type.cpp +++ b/aten/src/ATen/core/tensor_type.cpp @@ -274,12 +274,12 @@ TensorTypePtr TensorType::create(const at::Tensor& t) { } TensorTypePtr TensorType::create( - c10::optional scalar_type, - c10::optional device, + std::optional scalar_type, + std::optional device, const VaryingShape& sizes, const VaryingShape& strides, - c10::optional requires_grad, - c10::optional undefined, bool tensor_contiguity) { + std::optional requires_grad, + std::optional undefined, bool tensor_contiguity) { if(strides.concrete_sizes() && strides.concrete_sizes().has_value()){ // handles case where strides are set // NOLINTNEXTLINE(bugprone-unchecked-optional-access) @@ -304,22 +304,22 @@ TensorTypePtr TensorType::create( } TensorTypePtr TensorType::create( - c10::optional scalar_type, - c10::optional device, + std::optional scalar_type, + std::optional device, const SymbolicShape& sizes, const VaryingShape& strides, - c10::optional requires_grad, - c10::optional undefined) { + std::optional requires_grad, + std::optional undefined) { auto pt = TensorTypePtr(new TensorType( scalar_type, device, sizes, strides, requires_grad, undefined)); return pt; } TensorTypePtr TensorType::create( - c10::optional scalar_type, - c10::optional device, - c10::optional dim, - c10::optional requires_grad) { + std::optional scalar_type, + std::optional device, + std::optional dim, + std::optional requires_grad) { return 
TensorType::create( scalar_type, device, @@ -349,7 +349,7 @@ VaryingShape TensorType::sizes() const { fmap(*sizes_.sizes(), [](ShapeSymbol ss) { // we turn symbolic shapes into unknowns return ss.is_static() - ? c10::optional(ss.static_size()) + ? std::optional(ss.static_size()) : c10::nullopt; })); } @@ -371,7 +371,7 @@ TensorTypePtr TensorType::merge(const TensorType& other, bool merge_sizes) const } template -bool is_null_or_equal(c10::optional a, c10::IntArrayRef b) { +bool is_null_or_equal(std::optional a, c10::IntArrayRef b) { return !a.has_value() || a.value() == b; } @@ -417,7 +417,7 @@ VaryingShape TensorType::strides() const { if (!strides_.size().has_value()) { return VaryingShape(); } - std::vector> ss(*strides_.size()); + std::vector> ss(*strides_.size()); for (size_t i = 0; i < *strides_.size(); i++) { if (!strides_[i].has_value()) { continue; @@ -431,12 +431,12 @@ VaryingShape TensorType::strides() const { } TensorType::TensorType( - c10::optional scalar_type, - c10::optional device, + std::optional scalar_type, + std::optional device, SymbolicShape sizes, VaryingShape strides, - c10::optional requires_grad, - c10::optional undefined) + std::optional requires_grad, + std::optional undefined) : SharedType(TypeKind::TensorType), scalar_type_(scalar_type), device_(device), diff --git a/aten/src/ATen/core/type.cpp b/aten/src/ATen/core/type.cpp index f7d67ca84861a..572b15a118b36 100644 --- a/aten/src/ATen/core/type.cpp +++ b/aten/src/ATen/core/type.cpp @@ -364,7 +364,7 @@ SymBoolTypePtr SymBoolType::get() { return value; } -static c10::optional unifyTypesImpl(const TypePtr& t1, const TypePtr& t2, bool default_to_union=false, const TypePtr& type_hint=nullptr) { +static std::optional unifyTypesImpl(const TypePtr& t1, const TypePtr& t2, bool default_to_union=false, const TypePtr& type_hint=nullptr) { // check direct subtyping relation if (t1->isSubtypeOf(*t2)) { return t2; @@ -446,7 +446,7 @@ static c10::optional unifyTypesImpl(const TypePtr& t1, const TypePtr& t return c10::nullopt; } -c10::optional unifyTypes(const TypePtr& t1, const TypePtr& t2, bool default_to_union, const TypePtr& type_hint) { +std::optional unifyTypes(const TypePtr& t1, const TypePtr& t2, bool default_to_union, const TypePtr& type_hint) { auto unified = unifyTypesImpl(t1, t2, default_to_union, type_hint); if (default_to_union && !unified) { @@ -456,7 +456,7 @@ c10::optional unifyTypes(const TypePtr& t1, const TypePtr& t2, bool def return unified; } -c10::optional unifyTypeList( +std::optional unifyTypeList( at::ArrayRef elements, std::ostream& why_not, bool default_to_union, @@ -468,7 +468,7 @@ c10::optional unifyTypeList( TypePtr ret_type = elements.at(0); for (size_t i = 1; i < elements.size() && ret_type; ++i) { - c10::optional maybe_unified = unifyTypes(ret_type, elements.at(i), default_to_union, type_hint); + std::optional maybe_unified = unifyTypes(ret_type, elements.at(i), default_to_union, type_hint); if (!maybe_unified) { why_not << "Could not unify type list since element " << i << " of type " << elements.at(i)->repr_str() @@ -719,7 +719,7 @@ bool Type::is_module() const { } TupleTypePtr TupleType::createNamed( - const c10::optional& qualName, + const std::optional& qualName, const std::vector& field_names, const std::vector& field_types) { std::vector empty_defaults; @@ -727,7 +727,7 @@ TupleTypePtr TupleType::createNamed( } TupleTypePtr TupleType::createNamed( - const c10::optional& qualName, + const std::optional& qualName, const std::vector& field_names, const std::vector& field_types) { 
std::vector empty_defaults; @@ -735,7 +735,7 @@ TupleTypePtr TupleType::createNamed( } TupleTypePtr TupleType::createNamed( - const c10::optional& qualName, + const std::optional& qualName, const std::vector& field_names, const std::vector& field_types, std::vector& field_defaults) { @@ -743,7 +743,7 @@ TupleTypePtr TupleType::createNamed( } template -TupleTypePtr TupleType::createWithSpec(const c10::optional& qualName, +TupleTypePtr TupleType::createWithSpec(const std::optional& qualName, const std::vector& field_names, const std::vector& field_types, std::vector& field_defaults) { @@ -784,7 +784,7 @@ TupleTypePtr TupleType::createWithSpec(const c10::optional& field_types, qualName, std::move(schema))); // NOLINT(modernize-make-shared) } -c10::optional> TupleType::names() const { +std::optional> TupleType::names() const { if (!schema_) { return {}; } @@ -820,7 +820,7 @@ bool NumberType::isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const { TupleType::TupleType( std::vector elements, - c10::optional name, + std::optional name, std::shared_ptr schema) : NamedType(TypeKind::TupleType, std::move(name)), elements_(std::move(elements)), diff --git a/aten/src/ATen/core/union_type.cpp b/aten/src/ATen/core/union_type.cpp index 2acc4c497ba56..4039e2a4418f9 100644 --- a/aten/src/ATen/core/union_type.cpp +++ b/aten/src/ATen/core/union_type.cpp @@ -29,7 +29,7 @@ ListTypePtr ListType::ofOptionalTensors() { namespace { -c10::optional subtractTypeSetFrom(std::vector& to_subtract, ArrayRef from) { +std::optional subtractTypeSetFrom(std::vector& to_subtract, ArrayRef from) { std::vector types; // Given a TypePtr `lhs`, this function says whether or not `lhs` (or @@ -93,7 +93,7 @@ void filterDuplicateSubtypes(std::vector* types) { if (types->empty()) { return; } - auto get_supertype = [](const TypePtr& t1, const TypePtr& t2) -> c10::optional { + auto get_supertype = [](const TypePtr& t1, const TypePtr& t2) -> std::optional { // We don't want nested Optionals. 
Also, prematurely unifying to // `Optional` could prevent us from coalescing other types if ((t1->isSubtypeOf(*NoneType::get()) && !t2->isSubtypeOf(*NoneType::get())) @@ -114,7 +114,7 @@ void filterDuplicateSubtypes(std::vector* types) { size_t end_idx = types->size()-1; for (size_t i = types->size()-1; i > 0; --i) { for (size_t j = std::min(i-1, end_idx); ; --j) { - c10::optional unified; + std::optional unified; unified = get_supertype((*types)[i], (*types)[j]); if (unified) { (*types)[j] = *unified; @@ -272,11 +272,11 @@ UnionTypePtr UnionType::create(std::vector reference) { return union_type; } -c10::optional UnionType::subtractTypeSet(std::vector& to_subtract) const { +std::optional UnionType::subtractTypeSet(std::vector& to_subtract) const { return subtractTypeSetFrom(to_subtract, containedTypes()); } -c10::optional UnionType::toOptional() const { +std::optional UnionType::toOptional() const { if (!canHoldType(*NoneType::get())) { return c10::nullopt; } @@ -432,7 +432,7 @@ bool UnionType::canHoldType(const Type& type) const { bool OptionalType::equals(const Type& rhs) const { if (auto union_rhs = rhs.cast()) { auto optional_rhs = union_rhs->toOptional(); - // `**optional_rhs` = `*` to get value of `c10::optional`, + // `**optional_rhs` = `*` to get value of `std::optional`, // then `*` to dereference the pointer return optional_rhs && *this == **optional_rhs; } else if (auto optional_rhs = rhs.cast()) { diff --git a/aten/src/ATen/cuda/CachingHostAllocator.cpp b/aten/src/ATen/cuda/CachingHostAllocator.cpp index f4f22711d61a3..9ae49113dc8a2 100644 --- a/aten/src/ATen/cuda/CachingHostAllocator.cpp +++ b/aten/src/ATen/cuda/CachingHostAllocator.cpp @@ -105,7 +105,7 @@ struct CUDACachingHostAllocatorImpl } void record_stream( - c10::optional>& events, + std::optional>& events, CUDAStream stream) override { auto event = create_event_internal(stream.device_index()); event->record(stream); diff --git a/aten/src/ATen/cuda/EmptyTensor.cpp b/aten/src/ATen/cuda/EmptyTensor.cpp index a3cd55f4b2b7b..269b4a3ecfc11 100644 --- a/aten/src/ATen/cuda/EmptyTensor.cpp +++ b/aten/src/ATen/cuda/EmptyTensor.cpp @@ -8,8 +8,8 @@ namespace at::detail { TensorBase empty_cuda( IntArrayRef size, ScalarType dtype, - c10::optional device_opt, - c10::optional memory_format_opt) { + std::optional device_opt, + std::optional memory_format_opt) { at::globalContext().lazyInitCUDA(); const auto device = device_or_default(device_opt); TORCH_INTERNAL_ASSERT(device.is_cuda()); @@ -22,11 +22,11 @@ TensorBase empty_cuda( TensorBase empty_cuda( IntArrayRef size, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt, - c10::optional memory_format_opt) { + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt, + std::optional memory_format_opt) { TORCH_CHECK(!pin_memory_opt.has_value() || !*pin_memory_opt, "Only dense CPU tensors can be pinned"); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout_or_default(layout_opt) == Layout::Strided); @@ -49,7 +49,7 @@ TensorBase empty_strided_cuda( IntArrayRef size, IntArrayRef stride, ScalarType dtype, - c10::optional device_opt) { + std::optional device_opt) { at::globalContext().lazyInitCUDA(); const auto device = device_or_default(device_opt); TORCH_INTERNAL_ASSERT(device.is_cuda()); @@ -63,10 +63,10 @@ TensorBase empty_strided_cuda( TensorBase empty_strided_cuda( IntArrayRef size, IntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional 
device_opt, - c10::optional pin_memory_opt) { + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt) { TORCH_CHECK(!pin_memory_opt.has_value() || !*pin_memory_opt, "Only dense CPU tensors can be pinned"); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout_or_default(layout_opt) == Layout::Strided); diff --git a/aten/src/ATen/cuda/EmptyTensor.h b/aten/src/ATen/cuda/EmptyTensor.h index 18733f0beb30b..2fd88a94b75d2 100644 --- a/aten/src/ATen/cuda/EmptyTensor.h +++ b/aten/src/ATen/cuda/EmptyTensor.h @@ -6,16 +6,16 @@ namespace at::detail { TORCH_CUDA_CPP_API TensorBase empty_cuda( IntArrayRef size, ScalarType dtype, - c10::optional device_opt, - c10::optional memory_format_opt); + std::optional device_opt, + std::optional memory_format_opt); TORCH_CUDA_CPP_API TensorBase empty_cuda( IntArrayRef size, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt, - c10::optional memory_format_opt); + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt, + std::optional memory_format_opt); TORCH_CUDA_CPP_API TensorBase empty_cuda( IntArrayRef size, @@ -25,15 +25,15 @@ TORCH_CUDA_CPP_API TensorBase empty_strided_cuda( IntArrayRef size, IntArrayRef stride, ScalarType dtype, - c10::optional device_opt); + std::optional device_opt); TORCH_CUDA_CPP_API TensorBase empty_strided_cuda( IntArrayRef size, IntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt); + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt); TORCH_CUDA_CPP_API TensorBase empty_strided_cuda( IntArrayRef size, diff --git a/aten/src/ATen/cuda/PinnedMemoryAllocator.cpp b/aten/src/ATen/cuda/PinnedMemoryAllocator.cpp index 973027cd87f61..0c3e37825640d 100644 --- a/aten/src/ATen/cuda/PinnedMemoryAllocator.cpp +++ b/aten/src/ATen/cuda/PinnedMemoryAllocator.cpp @@ -8,13 +8,13 @@ namespace at::native { -bool is_pinned_cuda(const Tensor& self, c10::optional device) { +bool is_pinned_cuda(const Tensor& self, std::optional device) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!device.has_value() || device->is_cuda()); // TODO: unhook this return detail::getCUDAHooks().isPinnedPtr(self.storage().data()); } -Tensor _pin_memory_cuda(const Tensor& self, c10::optional device) { +Tensor _pin_memory_cuda(const Tensor& self, std::optional device) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!device.has_value() || device->is_cuda()); auto* allocator = at::cuda::getPinnedMemoryAllocator(); auto storage = Storage( diff --git a/aten/src/ATen/cudnn/AutocastRNN.cpp b/aten/src/ATen/cudnn/AutocastRNN.cpp index 083d435975c7c..2677e52df0929 100644 --- a/aten/src/ATen/cudnn/AutocastRNN.cpp +++ b/aten/src/ATen/cudnn/AutocastRNN.cpp @@ -22,9 +22,9 @@ std::tuple _cudnn_rnn_cast_reflatten(const Tensor & input, TensorList weight, int64_t weight_stride0, - const c10::optional& weight_buf_opt, + const std::optional& weight_buf_opt, const Tensor& hx, - const c10::optional& cx, + const std::optional& cx, int64_t mode, int64_t hidden_size, int64_t proj_size, @@ -34,7 +34,7 @@ _cudnn_rnn_cast_reflatten(const Tensor & input, bool train, bool bidirectional, IntArrayRef batch_sizes, - const c10::optional& dropout_state) { + const std::optional& dropout_state) { #if AT_CUDNN_ENABLED() c10::impl::ExcludeDispatchKeyGuard no_autocast(DispatchKey::Autocast); diff --git 
a/aten/src/ATen/functorch/BatchRulesBinaryOps.cpp b/aten/src/ATen/functorch/BatchRulesBinaryOps.cpp index 44ca2802bf3a2..e7a914c1e0f69 100644 --- a/aten/src/ATen/functorch/BatchRulesBinaryOps.cpp +++ b/aten/src/ATen/functorch/BatchRulesBinaryOps.cpp @@ -303,7 +303,7 @@ static std::tuple> log_sigmoid_backward_batch_rule( return std::make_tuple(at::log_sigmoid_backward(out_grad, out_self, out_buffer), 0); } -static Tensor binomial_wrapper(const Tensor& count, const Tensor& prob, c10::optional gen) { +static Tensor binomial_wrapper(const Tensor& count, const Tensor& prob, std::optional gen) { return at::binomial(count, prob.contiguous(), std::move(gen)); // Bug in PyTorch, prob shouldn't need to be contiguous } @@ -457,7 +457,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { using TensorScalarInplaceT = Tensor& (Tensor::*)(const Tensor&, const Scalar&) const; using ScalarScalarInplaceT = Tensor& (Tensor::*)(const Scalar&, const Scalar&) const; using TensorInplaceT = Tensor& (Tensor::*)(const Tensor&) const; - using TensorInplaceModeT = Tensor& (Tensor::*)(const Tensor&, c10::optional) const; + using TensorInplaceModeT = Tensor& (Tensor::*)(const Tensor&, std::optional) const; using ScalarInplaceT = Tensor& (Tensor::*)(const Scalar&) const; using CopyT = Tensor& (Tensor::*)(const Tensor&, bool) const; @@ -471,7 +471,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { VMAP_SUPPORT2(mul_, Tensor, SINGLE_ARG(binary_pointwise_inplace_batch_rule)); VMAP_SUPPORT2(mul_, Scalar, SINGLE_ARG(unary_inplace_batch_rule)); VMAP_SUPPORT2(div_, Tensor, SINGLE_ARG(binary_pointwise_inplace_batch_rule)); - VMAP_SUPPORT2(div_, Tensor_mode, SINGLE_ARG(binary_pointwise_inplace_batch_rule>)); + VMAP_SUPPORT2(div_, Tensor_mode, SINGLE_ARG(binary_pointwise_inplace_batch_rule>)); VMAP_SUPPORT2(div_, Scalar, SINGLE_ARG(unary_inplace_batch_rule)); VMAP_SUPPORT2(clamp_min_, Tensor, SINGLE_ARG(binary_pointwise_inplace_batch_rule)); VMAP_SUPPORT2(clamp_max_, Tensor, SINGLE_ARG(binary_pointwise_inplace_batch_rule)); diff --git a/aten/src/ATen/functorch/BatchRulesConvolution.cpp b/aten/src/ATen/functorch/BatchRulesConvolution.cpp index ca4eda19a36fb..dd24207e7e778 100644 --- a/aten/src/ATen/functorch/BatchRulesConvolution.cpp +++ b/aten/src/ATen/functorch/BatchRulesConvolution.cpp @@ -124,7 +124,7 @@ convolution_batch_rule(const Tensor& lhs, optional lhs_bdim, const Tens } static Tensor _convolution_decomp( - const Tensor& input_r, const Tensor& weight_r, const c10::optional& bias_r_opt, + const Tensor& input_r, const Tensor& weight_r, const std::optional& bias_r_opt, IntArrayRef stride_, IntArrayRef padding_, IntArrayRef dilation_, bool transposed_, IntArrayRef output_padding_, int64_t groups_, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) { diff --git a/aten/src/ATen/functorch/BatchRulesFactory.cpp b/aten/src/ATen/functorch/BatchRulesFactory.cpp index f317fee6af6c7..1edce4f52e271 100644 --- a/aten/src/ATen/functorch/BatchRulesFactory.cpp +++ b/aten/src/ATen/functorch/BatchRulesFactory.cpp @@ -107,11 +107,11 @@ static std::tuple> linspace_logspace_batch_rule_helper( const at::Tensor& start, optional start_bdim, const at::Tensor& end, optional end_bdim, int64_t steps, - c10::optional base, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) + std::optional base, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { auto batch_size = get_bdim_size2(start, start_bdim, end, end_bdim); auto start_ = 
ensure_has_bdim(start, start_bdim.has_value(), batch_size); @@ -145,10 +145,10 @@ static std::tuple> linspace_Tensor_Tensor_batch_rule( const at::Tensor& start, optional start_bdim, const at::Tensor& end, optional end_bdim, int64_t steps, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory){ + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory){ return linspace_logspace_batch_rule_helper(start, start_bdim, end, end_bdim, steps, c10::nullopt, dtype, layout, device, pin_memory); } @@ -156,10 +156,10 @@ static std::tuple> linspace_Tensor_Scalar_batch_rule( const at::Tensor& start, optional start_bdim, const at::Scalar& end, int64_t steps, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory){ + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory){ auto end_t = at::native::wrapped_scalar_tensor(end, start.device()); return linspace_logspace_batch_rule_helper(start, start_bdim, end_t, c10::nullopt, steps, c10::nullopt, dtype, layout, device, pin_memory); @@ -169,10 +169,10 @@ static std::tuple> linspace_Scalar_Tensor_batch_rule( const at::Scalar& start, const at::Tensor& end, optional end_bdim, int64_t steps, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory){ + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory){ auto start_t = at::native::wrapped_scalar_tensor(start, end.device()); return linspace_logspace_batch_rule_helper(start_t, c10::nullopt, end, end_bdim, steps, c10::nullopt, dtype, layout, device, pin_memory); @@ -183,10 +183,10 @@ static std::tuple> logspace_Tensor_Tensor_batch_rule( const at::Tensor& end, optional end_bdim, int64_t steps, double base, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory){ + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory){ return linspace_logspace_batch_rule_helper(start, start_bdim, end, end_bdim, steps, c10::make_optional(base), dtype, layout, device, pin_memory); } @@ -195,10 +195,10 @@ static std::tuple> logspace_Tensor_Scalar_batch_rule( const at::Scalar& end, int64_t steps, double base, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory){ + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory){ auto end_t = at::native::wrapped_scalar_tensor(end, start.device()); return linspace_logspace_batch_rule_helper(start, start_bdim, end_t, c10::nullopt, steps, c10::make_optional(base), dtype, layout, device, pin_memory); @@ -209,10 +209,10 @@ static std::tuple> logspace_Scalar_Tensor_batch_rule( const at::Tensor& end, optional end_bdim, int64_t steps, double base, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory){ + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory){ auto start_t = at::native::wrapped_scalar_tensor(start, end.device()); return linspace_logspace_batch_rule_helper(start_t, c10::nullopt, end, end_bdim, steps, c10::make_optional(base), dtype, layout, device, pin_memory); diff --git a/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp b/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp index 6a17adb4e268c..511a0a6d45450 100644 --- a/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp +++ 
b/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp @@ -157,9 +157,9 @@ void _linalg_check_errors_batch_rule(const Tensor& info, optional info_ at::_linalg_check_errors(info_, api_name, false); } -std::tuple> -householder_product_batch_rule(const Tensor &input, c10::optional input_bdim, - const Tensor &tau, c10::optional tau_bdim) +std::tuple> +householder_product_batch_rule(const Tensor &input, std::optional input_bdim, + const Tensor &tau, std::optional tau_bdim) { auto input_ = moveBatchDimToFront(input, input_bdim); auto tau_ = moveBatchDimToFront(tau, tau_bdim); @@ -330,8 +330,8 @@ oneOutput linalg_lu_solve_batch_rule( } oneOutput cholesky_solve_batch_rule( - const Tensor& self, c10::optional self_bdim, - const Tensor& A, c10::optional A_bdim, + const Tensor& self, std::optional self_bdim, + const Tensor& A, std::optional A_bdim, bool upper) { TORCH_CHECK(rankWithoutBatchDim(self, self_bdim) >= 2, "b should have at least 2 dimensions, but has ", self.dim(), " dimensions instead"); @@ -345,14 +345,14 @@ oneOutput cholesky_solve_batch_rule( } threeOutputs linalg_lu_factor_ex_batch_rule( - const Tensor& A, c10::optional A_bdim, bool pivot, bool check_errors) { + const Tensor& A, std::optional A_bdim, bool pivot, bool check_errors) { TORCH_CHECK(rankWithoutBatchDim(A, A_bdim) >= 2, "torch.lu_factor_ex: Expected tensor with 2 or more dimensions. Got size: ", A.sizes(), " instead"); const auto A_ = moveBatchDimToFront(A, A_bdim); const auto res = at::linalg_lu_factor_ex(A_, pivot, check_errors); return std::make_tuple(std::get<0>(res), 0, std::get<1>(res), 0, std::get<2>(res), 0); } -oneOutput matrix_exp_batch_rule(const Tensor& self, c10::optional self_bdim) { +oneOutput matrix_exp_batch_rule(const Tensor& self, std::optional self_bdim) { TORCH_CHECK(rankWithoutBatchDim(self, self_bdim) >= 2, "torch.matrix_exp: The input tensor A must have at least 2 dimensions."); const auto self_ = moveBatchDimToFront(self, self_bdim).contiguous(); // seems to be a bug return std::make_tuple(at::matrix_exp(self_), 0); @@ -400,8 +400,8 @@ fourOutputs solve_ex_batch_rule( return std::make_tuple(std::get<0>(res), 0, std::get<1>(res), 0, std::get<2>(res), 0, std::get<3>(res), 0); } -oneOutput cross_batch_rule(const Tensor& self, c10::optional self_bdim, - const Tensor& other, c10::optional other_bdim, const int64_t dim) { +oneOutput cross_batch_rule(const Tensor& self, std::optional self_bdim, + const Tensor& other, std::optional other_bdim, const int64_t dim) { // match cross dimension checks TORCH_CHECK(rankWithoutBatchDim(self, self_bdim) == rankWithoutBatchDim(other, other_bdim), "linalg.cross: inputs must have the same number of dimensions." 
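Throughout the batch rules in this diff, every tensor argument travels with an optional batch-dimension index, and the change is purely a respelling of that optional from c10::optional to std::optional. A minimal, self-contained sketch of the convention, using a hypothetical FakeTensor and rank_without_batch_dim helper in place of the real at::Tensor machinery, might look like this:

#include <cstdint>
#include <iostream>
#include <optional>
#include <string>

// Stand-in for a real tensor; only the logical rank matters for this sketch.
struct FakeTensor {
  std::string name;
  int64_t rank = 2;  // logical rank including any batch dim
};

// A batched argument carries an optional batch-dim index; std::nullopt means
// "not batched at the current vmap level".
static int64_t rank_without_batch_dim(const FakeTensor& t,
                                      std::optional<int64_t> bdim) {
  return bdim.has_value() ? t.rank - 1 : t.rank;
}

int main() {
  FakeTensor a{"a", 3};
  std::cout << rank_without_batch_dim(a, std::optional<int64_t>(0)) << '\n';  // 2
  std::cout << rank_without_batch_dim(a, std::nullopt) << '\n';               // 3
}

An empty optional plays the same role c10::nullopt did before the rename: the argument contributes no batch dimension, so rank checks like the TORCH_CHECKs above operate on the unbatched rank.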
@@ -418,16 +418,16 @@ oneOutput cross_batch_rule(const Tensor& self, c10::optional self_bdim, return std::make_tuple(linalg_cross(self_, other_, dim_), 0); } -c10::optional batch_dim_if_not_empty(const Tensor& t) { +std::optional batch_dim_if_not_empty(const Tensor& t) { if (t.dim() == 1 && t.size(0) == 0) { - return c10::optional(); + return std::optional(); } - return c10::optional(0); + return std::optional(0); } fourOutputs linalg_lstsq_batch_rule( - const Tensor& self, c10::optional self_bdim, const Tensor& b, c10::optional b_bdim, - c10::optional rcond, c10::optional driver) { + const Tensor& self, std::optional self_bdim, const Tensor& b, c10::optional b_bdim, + std::optional rcond, c10::optional driver) { TORCH_CHECK(rankWithoutBatchDim(self, self_bdim) >= 2, "torch.linalg.lstsq: input must have at least 2 dimensions."); TORCH_CHECK(rankWithoutBatchDim(b, b_bdim) >= 1, "torch.linalg.lstsq: other must have at least 1 dimension."); @@ -449,7 +449,7 @@ fourOutputs linalg_lstsq_batch_rule( } template -std::tuple> +std::tuple> atol_rtol_tensor_batch_rule( F Func, const Tensor& input, optional input_bdim, const optional& atol, const optional atol_bdim, @@ -478,11 +478,11 @@ atol_rtol_tensor_batch_rule( return std::make_tuple(Func(input_, atol_, rtol_, hermitian), 0); } -static std::tuple> +static std::tuple> pinv_batch_rule( - const Tensor& input, c10::optional input_bdim, const optional& atol, - const c10::optional atol_bdim, const optional& rtol, - const c10::optional rtol_bdim, bool hermitian) { + const Tensor& input, std::optional input_bdim, const optional& atol, + const std::optional atol_bdim, const optional& rtol, + const std::optional rtol_bdim, bool hermitian) { return atol_rtol_tensor_batch_rule(ATEN_FN2(linalg_pinv, atol_rtol_tensor), input, input_bdim, atol, atol_bdim, rtol, rtol_bdim, hermitian, "linalg.pinv"); } } diff --git a/aten/src/ATen/functorch/BatchRulesLoss.cpp b/aten/src/ATen/functorch/BatchRulesLoss.cpp index 22f3adff95a01..cd5ef41d4069f 100644 --- a/aten/src/ATen/functorch/BatchRulesLoss.cpp +++ b/aten/src/ATen/functorch/BatchRulesLoss.cpp @@ -123,7 +123,7 @@ static Tensor binary_cross_entropy_plumbing( static Tensor binary_cross_entropy_backward_plumbing( const Tensor& grad, const Tensor& input, const Tensor& target, - const c10::optional& weight_opt, int64_t reduction) { + const std::optional& weight_opt, int64_t reduction) { auto maybe_layer = maybeCurrentDynamicLayer(); vmap_check_escaped(maybe_layer, "binary_cross_entropy_backward_plumbing"); int64_t cur_level = maybe_layer->layerId(); diff --git a/aten/src/ATen/functorch/BatchRulesNorm.cpp b/aten/src/ATen/functorch/BatchRulesNorm.cpp index faf39d8e374a3..89a23fe0298d7 100644 --- a/aten/src/ATen/functorch/BatchRulesNorm.cpp +++ b/aten/src/ATen/functorch/BatchRulesNorm.cpp @@ -45,10 +45,10 @@ template std::tuple,Tensor,optional,Tensor,optional> batch_norm_batch_rule( const Tensor& input, optional input_bdim, - const c10::optional& weight_opt, optional weight_bdim, - const c10::optional& bias_opt, optional bias_bdim, - const c10::optional& running_mean_opt, optional running_mean_bdim, - const c10::optional& running_var_opt, optional running_var_bdim, + const std::optional& weight_opt, optional weight_bdim, + const std::optional& bias_opt, optional bias_bdim, + const std::optional& running_mean_opt, optional running_mean_bdim, + const std::optional& running_var_opt, optional running_var_bdim, bool training, double momentum, double eps) { c10::MaybeOwned weight_maybe_owned = 
at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; @@ -63,7 +63,7 @@ batch_norm_batch_rule( "were not batched.\nIf you are using a module and do not need eval mode, please set `track_running_stats` to be False.", "If you are using a prebuilt module and do not need eval mode, please see the functorch website for resources on ", "how to patch your module to work with vmap"); - c10::optional bdim_size; + std::optional bdim_size; Tensor result0; Tensor mean; Tensor rstd; @@ -80,8 +80,8 @@ batch_norm_batch_rule( input_ = ensure_has_bdim(input_, input_bdim.has_value(), bdim_size.value()); input_ = reshape_dim_into(0, /*channels dim*/1, input_); - c10::optional running_mean_; - c10::optional running_var_; + std::optional running_mean_; + std::optional running_var_; if (running_mean.defined()) { running_mean_ = moveBatchDimToFront(running_mean, running_mean_bdim); running_mean_ = ensure_has_bdim(*running_mean_, running_mean_bdim.has_value(), bdim_size.value()); @@ -127,8 +127,8 @@ template std::tuple> batch_norm_backward_no_weight_bias_batch_rule( const at::Tensor & grad_out, optional grad_out_bdim, const at::Tensor & input, optional input_bdim, - const c10::optional & running_mean_opt, optional running_mean_bdim, - const c10::optional & running_var_opt, optional running_var_bdim, + const std::optional & running_mean_opt, optional running_mean_bdim, + const std::optional & running_var_opt, optional running_var_bdim, const at::Tensor & mean, optional mean_bdim, const at::Tensor & rstd, optional rstd_bdim, bool training, double eps) { @@ -199,11 +199,11 @@ template std::tuple batch_norm_backward_plumbing( const at::Tensor & grad_out, const at::Tensor & input, - const c10::optional & weight_opt, - const c10::optional & running_mean_opt, - const c10::optional & running_var_opt, - const c10::optional & save_mean_opt, - const c10::optional & save_rstd_opt, + const std::optional & weight_opt, + const std::optional & running_mean_opt, + const std::optional & running_var_opt, + const std::optional & save_mean_opt, + const std::optional & save_rstd_opt, bool training, double eps, std::array output_mask) { @@ -284,8 +284,8 @@ std::tuple batch_norm_backward_plumbing( } static std::tuple native_group_norm_plumbing( - const Tensor & input, const c10::optional & weight_opt, - const c10::optional & bias_opt, int64_t N, int64_t C, + const Tensor & input, const std::optional & weight_opt, + const std::optional & bias_opt, int64_t N, int64_t C, int64_t HxW, int64_t group, double eps) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); @@ -372,7 +372,7 @@ static std::tuple> group_norm_backward_no_weight_bi static std::tuple native_group_norm_backward_plumbing( const Tensor & grad_out, const Tensor & input, const Tensor & mean, - const Tensor & rstd, const c10::optional & weight_opt, + const Tensor & rstd, const std::optional & weight_opt, int64_t N, int64_t C, int64_t HxW, int64_t group, std::array output_mask ) { // See [Note: hacky wrapper removal for optional tensor] @@ -488,8 +488,8 @@ static std::tuple,Tensor,optional,Tensor,optio native_layer_norm_batch_rule( const Tensor& input, optional input_bdim, c10::SymIntArrayRef normalized_shape, - const c10::optional& weight_opt, optional weight_bdim, - const c10::optional& bias_opt, optional bias_bdim, + const std::optional& weight_opt, optional weight_bdim, + const std::optional& bias_opt, optional bias_bdim, double eps) { auto input_ = 
moveBatchDimToFront(input, input_bdim); if (!weight_bdim && !bias_bdim) { @@ -573,8 +573,8 @@ static std::tuple native_layer_norm_backward_p at::IntArrayRef normalized_shape, const at::Tensor & mean, const at::Tensor & rstd, - const c10::optional & weight_opt, - const c10::optional & bias_opt, + const std::optional & weight_opt, + const std::optional & bias_opt, std::array output_mask) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); @@ -653,10 +653,10 @@ template struct NativeBatchNormBatchRuleHelper { static std::tuple,Tensor,optional,Tensor,optional> apply( const Tensor& input, optional input_bdim, - const c10::optional& weight_opt, optional weight_bdim, - const c10::optional& bias_opt, optional bias_bdim, - const c10::optional& running_mean_opt, optional running_mean_bdim, - const c10::optional& running_var_opt, optional running_var_bdim, + const std::optional& weight_opt, optional weight_bdim, + const std::optional& bias_opt, optional bias_bdim, + const std::optional& running_mean_opt, optional running_mean_bdim, + const std::optional& running_var_opt, optional running_var_bdim, bool training, double momentum, double eps) { return batch_norm_batch_rule( input, input_bdim, weight_opt, weight_bdim, bias_opt, bias_bdim, @@ -669,9 +669,9 @@ struct CudnnBatchNormBatchRuleHelper { static std::tuple,Tensor,optional,Tensor,optional,Tensor,optional> apply( const Tensor& input, optional input_bdim, const Tensor& weight_opt, optional weight_bdim, - const c10::optional& bias_opt, optional bias_bdim, - const c10::optional& running_mean_opt, optional running_mean_bdim, - const c10::optional& running_var_opt, optional running_var_bdim, + const std::optional& bias_opt, optional bias_bdim, + const std::optional& running_mean_opt, optional running_mean_bdim, + const std::optional& running_var_opt, optional running_var_bdim, bool training, double momentum, double eps) { auto reserve = at::empty({0}, input.options().dtype(kByte)); // in experiments, reserve was never set to anything other than empty by cuda auto res = batch_norm_batch_rule( @@ -686,9 +686,9 @@ struct MiopenBatchNormBatchRuleHelper { static std::tuple,Tensor,optional,Tensor,optional> apply( const Tensor& input, optional input_bdim, const Tensor& weight_opt, optional weight_bdim, - const c10::optional& bias_opt, optional bias_bdim, - const c10::optional& running_mean_opt, optional running_mean_bdim, - const c10::optional& running_var_opt, optional running_var_bdim, + const std::optional& bias_opt, optional bias_bdim, + const std::optional& running_mean_opt, optional running_mean_bdim, + const std::optional& running_var_opt, optional running_var_bdim, bool training, double momentum, double eps) { return batch_norm_batch_rule( input, input_bdim, weight_opt, weight_bdim, bias_opt, bias_bdim, @@ -716,11 +716,11 @@ struct NativeBatchNormBackwardBatchRuleHelper { static std::tuple apply( const at::Tensor & grad_out, const at::Tensor & input, - const c10::optional & weight_opt, - const c10::optional & running_mean_opt, - const c10::optional & running_var_opt, - const c10::optional & save_mean_opt, - const c10::optional & save_rstd_opt, + const std::optional & weight_opt, + const std::optional & running_mean_opt, + const std::optional & running_var_opt, + const std::optional & save_mean_opt, + const std::optional & save_rstd_opt, bool training, double eps, std::array output_mask) { @@ -748,10 +748,10 @@ struct CudnnBatchNormBackwardBatchRuleHelper { const 
at::Tensor & input, const at::Tensor & grad_out, const at::Tensor & weight, - const c10::optional & running_mean_opt, - const c10::optional & running_var_opt, - const c10::optional & save_mean_opt, - const c10::optional & save_rstd_opt, + const std::optional & running_mean_opt, + const std::optional & running_var_opt, + const std::optional & save_mean_opt, + const std::optional & save_rstd_opt, double eps, const at::Tensor & reserve) { @@ -777,10 +777,10 @@ struct MiopenBatchNormBackwardBatchRuleHelper { const at::Tensor & input, const at::Tensor & grad_out, const at::Tensor & weight, - const c10::optional & running_mean_opt, - const c10::optional & running_var_opt, - const c10::optional & save_mean_opt, - const c10::optional & save_rstd_opt, + const std::optional & running_mean_opt, + const std::optional & running_var_opt, + const std::optional & save_mean_opt, + const std::optional & save_rstd_opt, double eps) { auto maybe_layer = maybeCurrentDynamicLayer(); @@ -818,10 +818,10 @@ static std::tuple cudnn_batch_norm_backward_wr const at::Tensor & grad_out, const at::Tensor & input, const at::Tensor& weight_opt, - const c10::optional & running_mean_opt, - const c10::optional & running_var_opt, - const c10::optional & save_mean_opt, - const c10::optional & save_rstd_opt, + const std::optional & running_mean_opt, + const std::optional & running_var_opt, + const std::optional & save_mean_opt, + const std::optional & save_rstd_opt, bool training, double eps, std::array output_mask) { @@ -834,10 +834,10 @@ static std::tuple miopen_batch_norm_backward_w const at::Tensor & grad_out, const at::Tensor & input, const at::Tensor& weight_opt, - const c10::optional & running_mean_opt, - const c10::optional & running_var_opt, - const c10::optional & save_mean_opt, - const c10::optional & save_rstd_opt, + const std::optional & running_mean_opt, + const std::optional & running_var_opt, + const std::optional & save_mean_opt, + const std::optional & save_rstd_opt, bool training, double eps, std::array output_mask) { @@ -850,13 +850,13 @@ static std::tuple miopen_batch_norm_backward_w // work with dynamo anyway so we gain some buffer room to do wrong things here. The (reasonable) hope is that we will // make native_batch_norm composite implicit within a few weeks and we can fix this before vmap works with dynamo. static std::tuple _native_batch_norm_legit_batch( - const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, + const Tensor& self, const std::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, bool train, double momentum, double eps) { return at::native_batch_norm(self, weight_opt, bias_opt, running_mean, running_var, train, momentum, eps); } static std::tuple _native_batch_norm_legit_no_stats_batch( - const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, + const Tensor& self, const std::optional& weight_opt, const c10::optional& bias_opt, bool train, double momentum, double eps) { return at::native_batch_norm(self, weight_opt, bias_opt, Tensor(), Tensor(), train, momentum, eps); } diff --git a/aten/src/ATen/functorch/BatchRulesRandomness.cpp b/aten/src/ATen/functorch/BatchRulesRandomness.cpp index 79572f22ea3f6..fe2e790331fa0 100644 --- a/aten/src/ATen/functorch/BatchRulesRandomness.cpp +++ b/aten/src/ATen/functorch/BatchRulesRandomness.cpp @@ -58,7 +58,7 @@ Tensor& random_inplace_batching_rule(Tensor& self, ExtraArgs... 
extra_args) { } } -static Tensor& bernoulli_inplace_Tensor_batching_rule(Tensor& self, const Tensor& p_, c10::optional gen) { +static Tensor& bernoulli_inplace_Tensor_batching_rule(Tensor& self, const Tensor& p_, std::optional gen) { c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchVmapMode); auto maybe_layer = maybeCurrentDynamicLayer(); auto cur_level = maybe_layer->layerId(); @@ -173,7 +173,7 @@ Tensor tensor_like_random_batch_rule(const Tensor& self, ExtraArgs... extra_args return (randomness == RandomnessType::Same) ? res : makeBatched(res, 0, cur_level); } -static std::tuple native_dropout_batching_rule(const Tensor& tensor, double p, c10::optional train) { +static std::tuple native_dropout_batching_rule(const Tensor& tensor, double p, std::optional train) { c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchVmapMode); auto maybe_layer = maybeCurrentDynamicLayer(); const auto cur_level = maybe_layer->layerId(); @@ -213,7 +213,7 @@ static std::tuple native_dropout_batching_rule(const Tensor& tens return std::make_tuple(output, mask); } -static Tensor multinomial_batching_rule(const Tensor& self, const int64_t num_samples, const bool replacement, const c10::optional generator) { +static Tensor multinomial_batching_rule(const Tensor& self, const int64_t num_samples, const bool replacement, const std::optional generator) { c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchVmapMode); auto maybe_layer = maybeCurrentDynamicLayer(); const auto cur_level = maybe_layer->layerId(); diff --git a/aten/src/ATen/functorch/BatchRulesReduceOps.cpp b/aten/src/ATen/functorch/BatchRulesReduceOps.cpp index cb6d6ac519dd8..90371c0eb9ce8 100644 --- a/aten/src/ATen/functorch/BatchRulesReduceOps.cpp +++ b/aten/src/ATen/functorch/BatchRulesReduceOps.cpp @@ -169,7 +169,7 @@ void boxed_reduction_batch_rule(const c10::OperatorHandle& op, torch::jit::Stack new_dims.push_back(getPhysicalDim(self, self_bdim.has_value(), dim)); } bool is_scalar_case = logical_dim == 0 && dims.size() == 1 && is_allowed_dim_on_scalar_tensor(dims[0]); - c10::optional maybe_keepdim; + std::optional maybe_keepdim; if (is_scalar_case) { // NOTE: [boxed_reduction_batch_rule scalar tensor handling] // Reduction operations in PyTorch have an edge case where they allow @@ -321,9 +321,9 @@ static std::tuple> searchsorted_batch_rule( optional self_bdim, bool out_int32, bool right, - c10::optional side, - const c10::optional& sorter, - c10::optional sorter_bdim) { + std::optional side, + const std::optional& sorter, + std::optional sorter_bdim) { auto buckets_logical_rank = rankWithoutBatchDim(sorted_sequence, sorted_sequence_bdim); auto self_logical_rank = rankWithoutBatchDim(self, self_bdim); diff --git a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp index 0a1475497b03d..839e0ee405abb 100644 --- a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp +++ b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp @@ -375,7 +375,7 @@ namespace { // Code is mostly duplicated from // https://github.com/pytorch/pytorch/blob/fb0e27d38a8fdab4e1c14d6378c9e41cb30fd6a3 // /aten/src/ATen/native/TensorAdvancedIndexing.cpp#L379-L405 - VmapDimVector get_indexed_shape(Tensor self, const torch::List> &orig) + VmapDimVector get_indexed_shape(Tensor self, const torch::List> &orig) { at::native::checkIndexTensorTypes(orig); // first expand BoolTensor (masks) or ByteTensor (masks) into 1 or more LongTensors @@ -869,8 +869,8 @@ Tensor index_copy_decomp( // through a decomposition: 
slice_scatter's output needs to have the same // size, size, strides and storage_offset as the input. Tensor slice_scatter_decomp(const Tensor &self, const Tensor &src, - int64_t dim, c10::optional start, - c10::optional end, int64_t step) + int64_t dim, std::optional start, + std::optional end, int64_t step) { auto idx = at::arange(start.value_or(0), end.value_or(self.size(dim)), step, self.options().dtype(kLong)); idx = get_expanded_index(idx, self.sizes(), dim); @@ -889,8 +889,8 @@ Tensor select_scatter_decomp( } std::tuple> diagonal_scatter_batch_rule( - const Tensor &self, c10::optional self_bdim, - const Tensor &src, c10::optional src_bdim, + const Tensor &self, std::optional self_bdim, + const Tensor &src, std::optional src_bdim, int64_t offset, int64_t dim1, int64_t dim2) { auto self_ = moveBatchDimToFront(self, self_bdim); diff --git a/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp b/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp index f44000674db8a..d8213a1b9e0dd 100644 --- a/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp +++ b/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp @@ -63,7 +63,7 @@ std::tuple> to_other_batch_rule(const Tensor& self, optional self_bdim, const Tensor& other, optional other_bdim, bool non_blocking, - bool copy, c10::optional memory_format) { + bool copy, std::optional memory_format) { return std::make_tuple(self.to(other, non_blocking, copy, memory_format), self_bdim); } } diff --git a/aten/src/ATen/functorch/BatchRulesViews.cpp b/aten/src/ATen/functorch/BatchRulesViews.cpp index 81e9d5b9aa21c..18f5d4f38f3cc 100644 --- a/aten/src/ATen/functorch/BatchRulesViews.cpp +++ b/aten/src/ATen/functorch/BatchRulesViews.cpp @@ -149,7 +149,7 @@ std::tuple> flip_batch_rule(const Tensor& self, optiona const Tensor& resize__plumbing( const Tensor& self, IntArrayRef size, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { TORCH_CHECK( !optional_memory_format.has_value() || optional_memory_format == c10::MemoryFormat::Contiguous, @@ -217,7 +217,7 @@ std::tuple> squeeze_batch_rule(const Tensor& self, opt } auto result = self.view_symint(squeezed_sizes); - return std::make_tuple(std::move(result), c10::optional(new_batch_idx)); + return std::make_tuple(std::move(result), std::optional(new_batch_idx)); } std::tuple> squeeze_dims_batch_rule( @@ -335,8 +335,8 @@ std::tuple> slice_batch_rule( const Tensor& self, optional self_bdim, int64_t dim, - c10::optional start, - c10::optional end, + std::optional start, + std::optional end, c10::SymInt step) { auto self_ = moveBatchDimToFront(self, self_bdim); dim = getPhysicalDim(self, self_bdim.has_value(), dim); diff --git a/aten/src/ATen/functorch/DynamicLayer.cpp b/aten/src/ATen/functorch/DynamicLayer.cpp index 45976fa855f32..35f2439c982db 100644 --- a/aten/src/ATen/functorch/DynamicLayer.cpp +++ b/aten/src/ATen/functorch/DynamicLayer.cpp @@ -387,7 +387,7 @@ bool isInplaceOp(const FunctionSchema& schema) { return return_alias_info && return_alias_info->isWrite(); } -c10::optional findAliasedOutput(const FunctionSchema& schema, const int64_t immutable_input_idx) { +std::optional findAliasedOutput(const FunctionSchema& schema, const int64_t immutable_input_idx) { for (size_t res_idx = 0; res_idx != schema.returns().size(); ++res_idx) { if (schema.may_contain_alias(SchemaArgument(SchemaArgType::input, immutable_input_idx), SchemaArgument(SchemaArgType::output, res_idx))) { return res_idx; // for everything currently in native_functions, each input aliases at most one output (tensor list counts as one 
output) diff --git a/aten/src/ATen/functorch/DynamicLayer.h b/aten/src/ATen/functorch/DynamicLayer.h index 9311503f3538d..554e6678d09a1 100644 --- a/aten/src/ATen/functorch/DynamicLayer.h +++ b/aten/src/ATen/functorch/DynamicLayer.h @@ -71,7 +71,7 @@ TORCH_API int64_t initAndPushDynamicLayer( optional prev_fwd_grad_mode = nullopt, optional functionalize_add_back_views = nullopt); TORCH_API DynamicLayer popDynamicLayerAndDeleteMetadata(); -TORCH_API c10::optional maybeCurrentDynamicLayer(); +TORCH_API std::optional maybeCurrentDynamicLayer(); TORCH_API const std::vector& getDynamicLayerStack(); TORCH_API void setDynamicLayerStack(const std::vector& stack); TORCH_API void setDynamicLayerFrontBackKeysIncluded(bool included); @@ -95,7 +95,7 @@ TORCH_API const std::shared_ptr& getLifeHandleForLevel(int64_t level); TORCH_API bool isInplaceOp(const c10::FunctionSchema& schema); // Given the indices of unwrapped inputs and the schema, this returns the indices of any outputs that should remain unwrapped -TORCH_API c10::optional findAliasedOutput(const FunctionSchema& schema, const int64_t immutable_input); +TORCH_API std::optional findAliasedOutput(const FunctionSchema& schema, const int64_t immutable_input); TORCH_API Tensor unwrapIfDead(const Tensor& tensor); TORCH_API bool isDeadTensorWrapper(const Tensor& tensor); diff --git a/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp b/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp index b7a131766ec86..760035d8e46ec 100644 --- a/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp +++ b/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp @@ -536,7 +536,7 @@ Tensor cat_batching_rule(const ITensorListRef& tensors, int64_t dim) { // we'll just slice the tensor to get a Tensor of shape [0] to pass to at::cat. std::vector tensors_to_cat; tensors_to_cat.reserve(tensors.size()); - c10::optional bdim_size = c10::nullopt; + std::optional bdim_size = c10::nullopt; // find the bdim size. Might not exist if all BatchedTensors should be skipped // by cat's special case. @@ -573,7 +573,7 @@ Tensor cat_batching_rule(const ITensorListRef& tensors, int64_t dim) { } auto new_dim = bdim_size.has_value() ? dim + 1 : dim; - c10::optional new_bdim = bdim_size.has_value() ? c10::make_optional((int64_t)0) : nullopt; + std::optional new_bdim = bdim_size.has_value() ? 
c10::make_optional((int64_t)0) : nullopt; auto result = at::cat(tensors_to_cat, new_dim); return makeBatched(result, new_bdim, get_current_level()); } diff --git a/aten/src/ATen/functorch/PlumbingHelper.cpp b/aten/src/ATen/functorch/PlumbingHelper.cpp index 76982fd1b6480..e2a3a9582cf49 100644 --- a/aten/src/ATen/functorch/PlumbingHelper.cpp +++ b/aten/src/ATen/functorch/PlumbingHelper.cpp @@ -40,7 +40,7 @@ std::vector makeBatchedVector(const std::vector& tensors, option return res; } -std::tuple> unwrapTensorAtLevel(const Tensor& tensor, int64_t level) { +std::tuple> unwrapTensorAtLevel(const Tensor& tensor, int64_t level) { auto* batched = maybeGetBatchedImpl(tensor); if (!batched) { return std::make_tuple(tensor, nullopt); @@ -56,7 +56,7 @@ bool isBatchedAtLevel(const Tensor& tensor, int64_t level) { return std::get<1>(result).has_value(); } -bool isBatchedAtLevel(const c10::optional& maybe_tensor, int64_t level) { +bool isBatchedAtLevel(const std::optional& maybe_tensor, int64_t level) { if (!maybe_tensor.has_value()) { return false; } @@ -72,7 +72,7 @@ bool isBatchedAtLevel(ITensorListRef tensors, int64_t level) { return false; } -bool isBatchedAtLevel(const c10::List>& maybe_tensors, int64_t level) { +bool isBatchedAtLevel(const c10::List>& maybe_tensors, int64_t level) { for (const auto idx : c10::irange(0, maybe_tensors.size())) { const auto& maybe_tensor = maybe_tensors.get(idx); if (isBatchedAtLevel(maybe_tensor, level)) { diff --git a/aten/src/ATen/functorch/PlumbingHelper.h b/aten/src/ATen/functorch/PlumbingHelper.h index 552a618b144c8..c2c16c67bcd91 100644 --- a/aten/src/ATen/functorch/PlumbingHelper.h +++ b/aten/src/ATen/functorch/PlumbingHelper.h @@ -35,16 +35,16 @@ TORCH_API Tensor makeBatched(const Tensor& tensor, optional bdim, int64 // If `tensor` is not a BatchedTensor, or is a BatchedTensor but the level // doesn't match, then this returns (tensor, nullopt). // Otherwise, it returns (unwrap(tensor), bdim). -TORCH_API std::tuple> unwrapTensorAtLevel(const Tensor& tensor, int64_t level); +TORCH_API std::tuple> unwrapTensorAtLevel(const Tensor& tensor, int64_t level); // Creates a vector of BatchedTensor TORCH_API std::vector makeBatchedVector(const std::vector& tensors, optional bdim, int64_t level); // Returns True if ANY tensor in tensors is batched at level TORCH_API bool isBatchedAtLevel(ITensorListRef tensors, int64_t level); -TORCH_API bool isBatchedAtLevel(const c10::List>& maybe_tensors, int64_t level); +TORCH_API bool isBatchedAtLevel(const c10::List>& maybe_tensors, int64_t level); TORCH_API bool isBatchedAtLevel(const Tensor& tensor, int64_t level); -TORCH_API bool isBatchedAtLevel(const c10::optional& maybe_tensor, int64_t level); +TORCH_API bool isBatchedAtLevel(const std::optional& maybe_tensor, int64_t level); // Convenience helper. Returns true if any tensor is batched at level TORCH_API bool areAnyBatchedAtLevel(ArrayRef> maybe_tensors, int64_t level); diff --git a/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp b/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp index 355ac5965da51..ce3f20ef97efc 100644 --- a/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp +++ b/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp @@ -73,7 +73,7 @@ static bool can_perform_inplace(const Tensor& a, const Tensor& b) { // TODO: linear is pretty important for performance, but I'm not sure how to work // around the in-place. 
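The plumbing helpers being migrated here answer "is this batched at a given vmap level?" for a plain tensor, an optional tensor, or a list of optional tensors, returning false as soon as an optional is empty. A rough standalone sketch of that shape, with a hypothetical FakeBatched struct standing in for a batched tensor wrapper and std::vector replacing c10::List, could be:

#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

// Stand-in for a batched tensor wrapper; only the vmap level is modeled.
struct FakeBatched {
  int64_t level;
};

static bool is_batched_at_level(const std::optional<FakeBatched>& maybe,
                                int64_t level) {
  // Mirrors the early return on an empty optional before touching the value.
  if (!maybe.has_value()) {
    return false;
  }
  return maybe->level == level;
}

static bool any_batched_at_level(const std::vector<std::optional<FakeBatched>>& xs,
                                 int64_t level) {
  for (const auto& maybe : xs) {
    if (is_batched_at_level(maybe, level)) {
      return true;
    }
  }
  return false;
}

int main() {
  std::vector<std::optional<FakeBatched>> args{std::nullopt, FakeBatched{2}};
  std::cout << std::boolalpha << any_batched_at_level(args, 2) << '\n';  // true
  std::cout << std::boolalpha << any_batched_at_level(args, 3) << '\n';  // false
}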
-Tensor linear_hack(const Tensor& input, const Tensor& weight, const c10::optional& bias_opt) { +Tensor linear_hack(const Tensor& input, const Tensor& weight, const std::optional& bias_opt) { // See [Note: hacky wrapper removal for optional tensor] auto bias = bias_opt.has_value() ? c10::MaybeOwned::borrowed(*bias_opt) @@ -123,8 +123,8 @@ static inline at::Tensor apply_loss_reduction(const at::Tensor& unreduced, int64 Tensor binary_cross_entropy_with_logits_hack( const Tensor& input, const Tensor& target, - const c10::optional& weight_opt, - const c10::optional& pos_weight_opt, + const std::optional& weight_opt, + const std::optional& pos_weight_opt, int64_t reduction) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); diff --git a/aten/src/ATen/miopen/AutocastRNN.cpp b/aten/src/ATen/miopen/AutocastRNN.cpp index 271d80ea03cd4..a23eb4a1a19b8 100644 --- a/aten/src/ATen/miopen/AutocastRNN.cpp +++ b/aten/src/ATen/miopen/AutocastRNN.cpp @@ -14,7 +14,7 @@ miopen_rnn(const Tensor & input_r, TensorList weight, int64_t weight_stride0, const Tensor & hx, - const c10::optional& cx_opt, + const std::optional& cx_opt, int64_t fn_mode, int64_t fn_hidden_size, int64_t fn_num_layers, @@ -23,7 +23,7 @@ miopen_rnn(const Tensor & input_r, bool fn_train, bool fn_bidirectional, IntArrayRef fn_batch_sizes, - const c10::optional& fn_dropout_state_opt) { + const std::optional& fn_dropout_state_opt) { #if AT_ROCM_ENABLED() diff --git a/aten/src/ATen/mps/EmptyTensor.cpp b/aten/src/ATen/mps/EmptyTensor.cpp index f7918ac18993c..baa91eabb3898 100644 --- a/aten/src/ATen/mps/EmptyTensor.cpp +++ b/aten/src/ATen/mps/EmptyTensor.cpp @@ -20,11 +20,11 @@ namespace at::detail { TensorBase empty_mps( IntArrayRef size, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt, - c10::optional memory_format_opt) { + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt, + std::optional memory_format_opt) { #if defined(__APPLE__) #if __is_target_os(macOS) if (at::hasMPS()) { @@ -95,7 +95,7 @@ TensorBase empty_strided_mps( IntArrayRef size, IntArrayRef stride, ScalarType dtype, - c10::optional device_opt) { + std::optional device_opt) { #if defined(__APPLE__) #if __is_target_os(macOS) if (at::hasMPS()) { diff --git a/aten/src/ATen/mps/EmptyTensor.h b/aten/src/ATen/mps/EmptyTensor.h index 88a29547406cd..39b206cb3031d 100644 --- a/aten/src/ATen/mps/EmptyTensor.h +++ b/aten/src/ATen/mps/EmptyTensor.h @@ -7,11 +7,11 @@ namespace at::detail { C10_EXPORT TensorBase empty_mps( IntArrayRef size, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt, - c10::optional memory_format_opt); + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt, + std::optional memory_format_opt); C10_EXPORT TensorBase empty_mps( IntArrayRef size, const TensorOptions &options); @@ -19,7 +19,7 @@ C10_EXPORT TensorBase empty_strided_mps( IntArrayRef size, IntArrayRef stride, ScalarType dtype, - c10::optional device_opt); + std::optional device_opt); C10_EXPORT TensorBase empty_strided_mps( IntArrayRef size, diff --git a/aten/src/ATen/mps/MPSGuardImpl.h b/aten/src/ATen/mps/MPSGuardImpl.h index fe43fcf40fd34..1b57d2966767a 100644 --- a/aten/src/ATen/mps/MPSGuardImpl.h +++ b/aten/src/ATen/mps/MPSGuardImpl.h @@ -52,7 +52,7 @@ struct TORCH_API 
MPSGuardImpl final : public c10::impl::DeviceGuardImplInterface return Device(c10::DeviceType::MPS, 0); } - c10::optional uncheckedGetDevice() const noexcept { + std::optional uncheckedGetDevice() const noexcept { return Device(c10::DeviceType::MPS, 0); } @@ -112,12 +112,12 @@ struct TORCH_API MPSGuardImpl final : public c10::impl::DeviceGuardImplInterface struct OptionalMPSGuard { explicit OptionalMPSGuard() : guard_() {} - explicit OptionalMPSGuard(c10::optional device_opt) + explicit OptionalMPSGuard(std::optional device_opt) : guard_(device_opt) {} /// Set the current MPS device to the passed device index, if it is not /// nullopt - explicit OptionalMPSGuard(c10::optional device_index_opt) + explicit OptionalMPSGuard(std::optional device_index_opt) : guard_(device_index_opt) {} // Copy is not allowed @@ -147,14 +147,14 @@ struct OptionalMPSGuard { /// Returns the device that was set immediately prior to initialization of the /// guard, or nullopt if the guard is uninitialized. - c10::optional original_device() const { + std::optional original_device() const { return guard_.original_device(); } /// Returns the most recent device that was set using this device guard, /// either from construction, or via set_device, if the guard is initialized, /// or nullopt if the guard is uninitialized. - c10::optional current_device() const { + std::optional current_device() const { return guard_.current_device(); } diff --git a/aten/src/ATen/native/Activation.cpp b/aten/src/ATen/native/Activation.cpp index 533bc32216365..a0141f974923e 100644 --- a/aten/src/ATen/native/Activation.cpp +++ b/aten/src/ATen/native/Activation.cpp @@ -572,7 +572,7 @@ inline void _rrelu_with_noise_train( const Tensor& noise, const Scalar& lower_, const Scalar& upper_, - c10::optional generator) { + std::optional generator) { using opmath_t = at::opmath_type; opmath_t lower = lower_.to(); opmath_t upper = upper_.to(); @@ -603,8 +603,9 @@ Tensor& rrelu_with_noise_out_cpu(const Tensor& self, const Scalar& lower, const Scalar& upper, bool training, - c10::optional generator, + std::optional generator, Tensor& output) { + TORCH_CHECK(self.sym_sizes() == noise.sym_sizes(), "noise tensor shape must match self tensor shape. 
Got self.shape = ", self.sym_sizes(), " noise.shape = ", noise.sym_sizes()); if (training) { AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, self.scalar_type(), "rrelu_with_noise_out_cpu", [&] { _rrelu_with_noise_train(output, self.contiguous(), noise, lower, upper, generator); @@ -625,7 +626,7 @@ Tensor rrelu_with_noise_cpu( const Scalar& lower, const Scalar& upper, bool training, - c10::optional generator) { + std::optional generator) { auto output = at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT); return at::native::rrelu_with_noise_out_cpu( self, noise, lower, upper, training, std::move(generator), output); @@ -637,7 +638,7 @@ Tensor& rrelu_with_noise_cpu_( const Scalar& lower, const Scalar& upper, bool training, - c10::optional generator) { + std::optional generator) { return at::native::rrelu_with_noise_out_cpu( self, noise, lower, upper, training, std::move(generator), self); } @@ -660,12 +661,12 @@ Tensor rrelu_with_noise_backward( } } -Tensor rrelu(const Tensor & self, const Scalar& lower, const Scalar& upper, bool training, c10::optional generator) { +Tensor rrelu(const Tensor & self, const Scalar& lower, const Scalar& upper, bool training, std::optional generator) { TORCH_CHECK(lower.to() <= upper.to(), "Lower bound should be less than or equal to the upper bound") return at::rrelu_with_noise(self, at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT), lower, upper, training, std::move(generator)); } -Tensor & rrelu_(Tensor & self, const Scalar& lower, const Scalar& upper, bool training, c10::optional generator) { +Tensor & rrelu_(Tensor & self, const Scalar& lower, const Scalar& upper, bool training, std::optional generator) { TORCH_CHECK(lower.to() <= upper.to(), "Lower bound should be less than or equal to the upper bound") return at::rrelu_with_noise_(self, at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT), lower, upper, training, std::move(generator)); } diff --git a/aten/src/ATen/native/AveragePool2d.cpp b/aten/src/ATen/native/AveragePool2d.cpp index 854b4585db10a..368dc02c2832f 100644 --- a/aten/src/ATen/native/AveragePool2d.cpp +++ b/aten/src/ATen/native/AveragePool2d.cpp @@ -21,7 +21,7 @@ TORCH_PRECOMPUTE_META_FUNC(avg_pool2d) IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { // #20866, #22032: Guarantee this for the official C++ API? TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 2, "avg_pool2d: kernel_size must either be a single int, or a tuple of two ints"); @@ -101,7 +101,7 @@ TORCH_META_FUNC(avg_pool2d_backward) ( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override + std::optional divisor_override ) { // #20866, #22032: Guarantee this for the official C++ API? 
TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 2, @@ -159,7 +159,7 @@ TORCH_IMPL_FUNC(avg_pool2d_out_cpu) int64_t padW, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override, + std::optional divisor_override, const Tensor& output) { avg_pool2d_kernel( kCPU, @@ -183,7 +183,7 @@ TORCH_IMPL_FUNC(avg_pool2d_backward_out_cpu) ( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override, + std::optional divisor_override, const Tensor& gradInput ) { const int kH = safe_downcast(kernel_size[0]); diff --git a/aten/src/ATen/native/AveragePool3d.cpp b/aten/src/ATen/native/AveragePool3d.cpp index c2d7b44a5076c..701ad09bfd512 100644 --- a/aten/src/ATen/native/AveragePool3d.cpp +++ b/aten/src/ATen/native/AveragePool3d.cpp @@ -25,7 +25,7 @@ TORCH_META_FUNC(avg_pool3d) ( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override + std::optional divisor_override ) { // #20866, #22032: Guarantee this for the official C++ API? TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 3, @@ -94,7 +94,7 @@ TORCH_META_FUNC(avg_pool3d_backward) ( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override + std::optional divisor_override ) { // #20866, #22032: Guarantee this for the official C++ API? TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 3, @@ -174,7 +174,7 @@ static void avg_pool3d_out_frame( int padW, int padH, bool count_include_pad, - c10::optional divisor_override) + std::optional divisor_override) { at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) { for (const auto k : c10::irange(start, end)) { @@ -261,7 +261,7 @@ TORCH_IMPL_FUNC(avg_pool3d_out_cpu) ( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override, + std::optional divisor_override, const Tensor& output ) { const int kT = safe_downcast(kernel_size[0]); @@ -362,7 +362,7 @@ static void avg_pool3d_backward_out_frame( int padW, int padH, bool count_include_pad, - c10::optional divisor_override) + std::optional divisor_override) { at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) { for (const auto k : c10::irange(start, end)) { @@ -441,7 +441,7 @@ TORCH_IMPL_FUNC(avg_pool3d_backward_out_cpu) ( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override, + std::optional divisor_override, const Tensor& gradInput ) { const int kT = safe_downcast(kernel_size[0]); diff --git a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp index 40e6b34dc9725..ce4b4d15b7968 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp @@ -656,7 +656,7 @@ TORCH_META_FUNC(linalg_qr)(const Tensor& A, TORCH_META_FUNC(_linalg_svd)(const Tensor& A, bool full_matrices, bool compute_uv, - c10::optional driver) { + std::optional driver) { at::native::checkIsMatrix(A, "linalg.svd"); at::native::checkFloatingOrComplex(A, "linalg.svd"); @@ -3128,7 +3128,7 @@ DEFINE_DISPATCH(svd_stub); TORCH_IMPL_FUNC(_linalg_svd_out)(const Tensor& A, const bool full_matrices, const bool compute_uv, - c10::optional driver, + std::optional driver, const Tensor & U, const Tensor & S, const Tensor & Vh) { @@ -3177,7 +3177,7 @@ TORCH_IMPL_FUNC(_linalg_svd_out)(const Tensor& A, std::tuple linalg_svd_out(const Tensor& A, bool full_matrices, - c10::optional driver, + std::optional driver, Tensor & U, Tensor & S, Tensor & Vh) { @@ -3196,12 +3196,12 @@ 
linalg_svd_out(const Tensor& A, } std::tuple linalg_svd(const Tensor& A, bool full_matrices, - c10::optional driver) { + std::optional driver) { return at::_linalg_svd(A, full_matrices, /*compute_uv=*/true, driver); } // See note in linalg_svd for why this function does not have an _ex variant -Tensor& linalg_svdvals_out(const Tensor& A, c10::optional driver, Tensor & S) { +Tensor& linalg_svdvals_out(const Tensor& A, std::optional driver, Tensor & S) { // Dummies auto U = at::empty({0}, A.options()); auto Vh = at::empty({0}, A.options()); @@ -3209,7 +3209,7 @@ Tensor& linalg_svdvals_out(const Tensor& A, c10::optional driv return S; } -Tensor linalg_svdvals(const Tensor& A, c10::optional driver) { +Tensor linalg_svdvals(const Tensor& A, std::optional driver) { return std::get<1>(at::_linalg_svd(A, /*full_matrices=*/false, /*compute_uv=*/_may_require_fw_or_bw_grad(A), /*driver=*/driver)); @@ -3469,7 +3469,7 @@ static void linalg_lstsq_out_info( } } -static std::string get_default_lstsq_driver(c10::optional driver, const Tensor& input) { +static std::string get_default_lstsq_driver(std::optional driver, const Tensor& input) { // if `driver` is empty, we set driver_str to "gels" if working with CUDA tensors, // otherwise to "gelsy" driver. std::string driver_str; @@ -3505,8 +3505,8 @@ static std::string get_default_lstsq_driver(c10::optional driv std::tuple linalg_lstsq_out( const Tensor& input, const Tensor& other, - c10::optional rcond, - c10::optional driver, + std::optional rcond, + std::optional driver, Tensor& solution, Tensor& residuals, Tensor& rank, @@ -3668,8 +3668,8 @@ std::tuple linalg_lstsq_out( std::tuple linalg_lstsq( const Tensor& input, const Tensor& other, - c10::optional rcond, - c10::optional driver) { + std::optional rcond, + std::optional driver) { Tensor solution = at::empty({0}, input.options()); Tensor residuals = at::empty({0}, input.options().dtype(toRealValueType(input.scalar_type()))); Tensor rank = at::empty({0}, input.options().dtype(at::kLong)); @@ -4003,7 +4003,7 @@ Tensor linalg_solve_triangular( Tensor linalg_vander_symint( const Tensor& x, - c10::optional N) { + std::optional N) { auto t = x.scalar_type(); TORCH_CHECK(t == ScalarType::Float || t == ScalarType::Double || diff --git a/aten/src/ATen/native/BatchLinearAlgebra.h b/aten/src/ATen/native/BatchLinearAlgebra.h index efbe7ce1b9d1c..c8402640aa08a 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.h +++ b/aten/src/ATen/native/BatchLinearAlgebra.h @@ -304,7 +304,7 @@ using svd_fn = void (*)( const Tensor& /*A*/, const bool /*full_matrices*/, const bool /*compute_uv*/, - const c10::optional& /*driver*/, + const std::optional& /*driver*/, const Tensor& /*U*/, const Tensor& /*S*/, const Tensor& /*Vh*/, diff --git a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp index f29970afe2b44..79e7b8b049381 100644 --- a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp @@ -1087,7 +1087,7 @@ static void apply_svd(const Tensor& A, void svd_kernel(const Tensor& A, const bool full_matrices, const bool compute_uv, - const c10::optional& driver, + const std::optional& driver, const Tensor& U, const Tensor& S, const Tensor& Vh, diff --git a/aten/src/ATen/native/BinaryOps.cpp b/aten/src/ATen/native/BinaryOps.cpp index 78f57470a922d..19c70672fb93c 100644 --- a/aten/src/ATen/native/BinaryOps.cpp +++ b/aten/src/ATen/native/BinaryOps.cpp @@ -173,7 +173,7 @@ TORCH_META_FUNC2(div, Tensor) (const Tensor& self, const Tensor& 
other) { build_borrowing_binary_float_op(maybe_get_output(), self, other); } -TORCH_META_FUNC2(div, Tensor_mode) (const Tensor& self, const Tensor& other, c10::optional rounding_mode) { +TORCH_META_FUNC2(div, Tensor_mode) (const Tensor& self, const Tensor& other, std::optional rounding_mode) { if (!rounding_mode.has_value()) { build_borrowing_binary_float_op(maybe_get_output(), self, other); // NOLINTNEXTLINE(bugprone-branch-clone) @@ -303,7 +303,7 @@ TORCH_META_FUNC2(xlogy, Tensor) (const Tensor& self, const Tensor& other) { build_borrowing_binary_float_op(maybe_get_output(), self, other); } -TORCH_META_FUNC(logit_backward) (const Tensor& grad_output, const Tensor& input, c10::optional eps) { +TORCH_META_FUNC(logit_backward) (const Tensor& grad_output, const Tensor& input, std::optional eps) { build_borrowing_binary_op(maybe_get_output(), grad_output, input); } @@ -448,7 +448,7 @@ TORCH_IMPL_FUNC(div_out) (const Tensor& self, const Tensor& other, const Tensor& } TORCH_IMPL_FUNC(div_out_mode) ( - const Tensor& self, const Tensor& other, c10::optional rounding_mode, const Tensor& result + const Tensor& self, const Tensor& other, std::optional rounding_mode, const Tensor& result ) { if (!rounding_mode.has_value()) { div_true_stub(device_type(), *this); @@ -459,7 +459,7 @@ TORCH_IMPL_FUNC(div_out_mode) ( } } -TORCH_IMPL_FUNC(logit_backward_out) (const Tensor& grad_output, const Tensor& input, c10::optional eps, const Tensor& result) { +TORCH_IMPL_FUNC(logit_backward_out) (const Tensor& grad_output, const Tensor& input, std::optional eps, const Tensor& result) { logit_backward_stub(device_type(), *this, Scalar(eps ? eps.value() : -1.0)); } @@ -896,11 +896,11 @@ Tensor& div_(Tensor& self, const Scalar& other) { return self.div_(wrapped_scalar_tensor(other)); // redispatch! } -Tensor div(const Tensor& self, const Scalar& other, c10::optional rounding_mode) { +Tensor div(const Tensor& self, const Scalar& other, std::optional rounding_mode) { return self.div(wrapped_scalar_tensor(other), std::move(rounding_mode)); // redispatch! } -Tensor& div_(Tensor& self, const Scalar& other, c10::optional rounding_mode) { +Tensor& div_(Tensor& self, const Scalar& other, std::optional rounding_mode) { return self.div_(wrapped_scalar_tensor(other), std::move(rounding_mode)); // redispatch! 
} @@ -925,23 +925,23 @@ Tensor& divide_(Tensor& self, const Scalar& other) { return self.div_(other); } -Tensor& divide_out(const Tensor& self, const Tensor& other, c10::optional rounding_mode, Tensor& result) { +Tensor& divide_out(const Tensor& self, const Tensor& other, std::optional rounding_mode, Tensor& result) { return at::div_out(result, self, other, std::move(rounding_mode)); } -Tensor divide(const Tensor& self, const Tensor& other, c10::optional rounding_mode) { +Tensor divide(const Tensor& self, const Tensor& other, std::optional rounding_mode) { return self.div(other, std::move(rounding_mode)); } -Tensor& divide_(Tensor& self, const Tensor& other, c10::optional rounding_mode) { +Tensor& divide_(Tensor& self, const Tensor& other, std::optional rounding_mode) { return self.div_(other, std::move(rounding_mode)); } -Tensor divide(const Tensor& self, const Scalar& other, c10::optional rounding_mode) { +Tensor divide(const Tensor& self, const Scalar& other, std::optional rounding_mode) { return self.div(other, std::move(rounding_mode)); } -Tensor& divide_(Tensor& self, const Scalar& other, c10::optional rounding_mode) { +Tensor& divide_(Tensor& self, const Scalar& other, std::optional rounding_mode) { return self.div_(other, std::move(rounding_mode)); } diff --git a/aten/src/ATen/native/Bucketization.cpp b/aten/src/ATen/native/Bucketization.cpp index 736273a40cb09..98e37af91b316 100644 --- a/aten/src/ATen/native/Bucketization.cpp +++ b/aten/src/ATen/native/Bucketization.cpp @@ -146,8 +146,8 @@ Tensor& searchsorted_out_cpu( const Tensor& self, bool out_int32, bool right, - const c10::optional side_opt, - const c10::optional& sorter_opt, + const std::optional side_opt, + const std::optional& sorter_opt, Tensor& result) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned sorter_maybe_owned = at::borrow_from_optional_tensor(sorter_opt); @@ -193,8 +193,8 @@ Tensor& searchsorted_out_cpu( const Scalar& self, bool out_int32, bool right, - const c10::optional side_opt, - const c10::optional& sorter_opt, + const std::optional side_opt, + const std::optional& sorter_opt, Tensor& result) { const Tensor& scalar_tensor = searchsorted_scalar_tensor(self, sorted_sequence.device()); return searchsorted_out_cpu(sorted_sequence, scalar_tensor, out_int32, right, side_opt, sorter_opt, result); @@ -205,8 +205,8 @@ Tensor searchsorted_cpu( const Tensor& self, bool out_int32, bool right, - const c10::optional side_opt, - const c10::optional& sorter_opt) { + const std::optional side_opt, + const std::optional& sorter_opt) { ScalarType scalar_type = out_int32 ? 
ScalarType::Int : ScalarType::Long; c10::TensorOptions options = TensorOptions().device(self.options().device()).dtype(scalar_type); Tensor result = at::empty({0}, options, MemoryFormat::Contiguous); @@ -219,8 +219,8 @@ Tensor searchsorted_cpu( const Scalar& self, bool out_int32, bool right, - const c10::optional side_opt, - const c10::optional& sorter_opt) { + const std::optional side_opt, + const std::optional& sorter_opt) { const Tensor& scalar_tensor = searchsorted_scalar_tensor(self, sorted_sequence.device()); return searchsorted_cpu(sorted_sequence, scalar_tensor, out_int32, right, side_opt, sorter_opt); } diff --git a/aten/src/ATen/native/BucketizationUtils.h b/aten/src/ATen/native/BucketizationUtils.h index 59d459bd9c29e..90747c264b156 100644 --- a/aten/src/ATen/native/BucketizationUtils.h +++ b/aten/src/ATen/native/BucketizationUtils.h @@ -107,7 +107,7 @@ inline void searchsorted_pre_check( const Tensor& output, const bool out_int32, const bool right, - const c10::optional side_opt, + const std::optional side_opt, const Tensor& sorter) { if (side_opt) { const c10::string_view side = *side_opt; diff --git a/aten/src/ATen/native/CPUFallback.cpp b/aten/src/ATen/native/CPUFallback.cpp index 502c61e4d144c..1d0930cf3a5ea 100644 --- a/aten/src/ATen/native/CPUFallback.cpp +++ b/aten/src/ATen/native/CPUFallback.cpp @@ -48,7 +48,7 @@ static std::vector to_cpu(const at::TensorList& tensors) { return cpu_tensors; } -static c10::optional compute_target_device(std::vector& t_args, std::vector> tlist_args) { +static std::optional compute_target_device(std::vector& t_args, std::vector> tlist_args) { // Decide what device to move the output tensor(s) to. // The current convention is that we use the first tensor arg to pick the device // Barring that, we take the first tensor from a TensorList arg. @@ -89,7 +89,7 @@ void cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool std::vector> tensorlist_args; std::vector tensorlist_args_indices; - c10::optional tgt_device = c10::nullopt; + std::optional tgt_device = c10::nullopt; // save converted cpu tensor for TensorList std::vector tensorlist_cpu_args; diff --git a/aten/src/ATen/native/ComparisonUtils.cpp b/aten/src/ATen/native/ComparisonUtils.cpp index 5a1138d041b1c..57f00ec86137f 100644 --- a/aten/src/ATen/native/ComparisonUtils.cpp +++ b/aten/src/ATen/native/ComparisonUtils.cpp @@ -25,7 +25,7 @@ void _assert_match(const O& original, const C& compared, const std::string& name } } -void _assert_tensor_metadata(at::Tensor const& tensor, at::OptionalIntArrayRef sizes, at::OptionalIntArrayRef strides, c10::optional dtype) { +void _assert_tensor_metadata(at::Tensor const& tensor, at::OptionalIntArrayRef sizes, at::OptionalIntArrayRef strides, std::optional dtype) { _assert_match(tensor.sizes(), sizes, "sizes"); _assert_match(tensor.strides(), strides, "strides"); _assert_match(tensor.dtype(), dtype, "dtype"); diff --git a/aten/src/ATen/native/Constraints.cpp b/aten/src/ATen/native/Constraints.cpp index 8f3f8c11e696c..21a64537af283 100644 --- a/aten/src/ATen/native/Constraints.cpp +++ b/aten/src/ATen/native/Constraints.cpp @@ -24,8 +24,8 @@ namespace at::native { void sym_constrain_range( const Scalar& size, - c10::optional min, - c10::optional max) { + std::optional min, + std::optional max) { int64_t min_val = min.has_value() ? min.value() : std::numeric_limits::min(); int64_t max_val = max.has_value() ? 
max.value() : std::numeric_limits::max(); @@ -53,14 +53,14 @@ void sym_constrain_range( Tensor _functional_sym_constrain_range( const Scalar& size, - c10::optional min, - c10::optional max, + std::optional min, + std::optional max, const Tensor& dep_token) { sym_constrain_range(size, min, max); return dep_token.clone(); } -void sym_constrain_range_for_size(const Scalar& size, c10::optional min, c10::optional max) { +void sym_constrain_range_for_size(const Scalar& size, std::optional min, c10::optional max) { int64_t min_val = min.has_value() ? min.value() : 0; if (max.has_value() && max.value() <= 2) { TORCH_CHECK(false, "Max value to constrain_range_for_size must be greater than 2. got: ", max.value()); @@ -70,19 +70,19 @@ void sym_constrain_range_for_size(const Scalar& size, c10::optional min Tensor _functional_sym_constrain_range_for_size( const Scalar& size, - c10::optional min, - c10::optional max, + std::optional min, + std::optional max, const Tensor& dep_token) { sym_constrain_range_for_size(size, min, max); return dep_token.clone(); } Tensor _make_dep_token_cpu( - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt, - c10::optional memory_format_opt) { + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt, + std::optional memory_format_opt) { return at::empty( {}, dtype_opt, layout_opt, device_opt, pin_memory_opt, memory_format_opt); } diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h index 4b814f3e442cb..d504d088a8620 100644 --- a/aten/src/ATen/native/ConvUtils.h +++ b/aten/src/ATen/native/ConvUtils.h @@ -44,7 +44,7 @@ using mkldnn_convolution_backward_fn = std::tuple); DECLARE_DISPATCH(mkldnn_convolution_backward_fn, mkldnn_convolution_backward_stub); -using mkldnn_convolution_transpose_fn = Tensor(*)(const Tensor&, const Tensor&, const c10::optional&, +using mkldnn_convolution_transpose_fn = Tensor(*)(const Tensor&, const Tensor&, const std::optional&, IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef, int64_t); DECLARE_DISPATCH(mkldnn_convolution_transpose_fn, mkldnn_convolution_transpose_stub); using mkldnn_convolution_transpose_backward_fn = std::tuple(*)( @@ -117,7 +117,7 @@ enum class ConvBackend { // Overload for selecting the convolution backend from the full set of convolution inputs. // This overload is exposed to python for testing, etc. 
TORCH_API ConvBackend select_conv_backend( - const Tensor& input, const Tensor& weight, const c10::optional& bias_opt, + const Tensor& input, const Tensor& weight, const std::optional& bias_opt, SymIntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef dilation, bool transposed, SymIntArrayRef output_padding, c10::SymInt groups, const at::OptionalSymIntArrayRef bias_sizes_opt); @@ -360,7 +360,7 @@ static inline bool miopen_conv_use_channels_last(const at::Tensor& input, const bool can_use_miopen_channels_last_2d = false; // TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC once ROCm officially supports NHWC in MIOpen // See #64427 - static c10::optional PYTORCH_MIOPEN_SUGGEST_NHWC = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC"); + static std::optional PYTORCH_MIOPEN_SUGGEST_NHWC = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC"); auto input_memory_format = input.suggest_memory_format(); auto weight_memory_format = weight.suggest_memory_format(); diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index 717280a6cdcab..ecedc73579d66 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -368,7 +368,7 @@ struct ConvParams { } } - bool use_cpu_depthwise3x3_winograd(const at::Tensor& input, const at::Tensor& weight, const c10::optional& bias) const { + bool use_cpu_depthwise3x3_winograd(const at::Tensor& input, const at::Tensor& weight, const std::optional& bias) const { #if defined(__ARM_NEON__) // Currently only 3x3 depthwise convolutions on tensors of float are supported. return (input.ndimension() == 4) && @@ -878,7 +878,7 @@ at::Tensor complex_convolution( at::Tensor complex_convolution_mode( const at::Tensor& input, const at::Tensor& weight, - const c10::optional& bias_opt, + const std::optional& bias_opt, c10::SymIntArrayRef stride, c10::string_view padding, c10::SymIntArrayRef dilation, @@ -908,7 +908,7 @@ at::Tensor complex_convolution_mode( } // namespace at::Tensor conv1d_symint( - const Tensor& input_, const Tensor& weight, const c10::optional& bias_opt, + const Tensor& input_, const Tensor& weight, const std::optional& bias_opt, SymIntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef dilation, c10::SymInt groups) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); @@ -933,7 +933,7 @@ at::Tensor conv1d_symint( } at::Tensor conv2d_symint( - const Tensor& input_, const Tensor& weight, const c10::optional& bias_opt, + const Tensor& input_, const Tensor& weight, const std::optional& bias_opt, SymIntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef dilation, c10::SymInt groups) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); @@ -958,7 +958,7 @@ at::Tensor conv2d_symint( } at::Tensor conv3d_symint( - const Tensor& input_, const Tensor& weight, const c10::optional& bias_opt, + const Tensor& input_, const Tensor& weight, const std::optional& bias_opt, SymIntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef dilation, c10::SymInt groups) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); @@ -1049,7 +1049,7 @@ static Tensor convolution_same( } Tensor _convolution_mode_symint( - const Tensor& input, const Tensor& weight, const c10::optional& bias_opt, + const Tensor& input, const Tensor& weight, const std::optional& bias_opt, 
SymIntArrayRef stride, c10::string_view padding, SymIntArrayRef dilation, c10::SymInt groups) { // See [Note: hacky wrapper removal for optional tensor] @@ -1067,7 +1067,7 @@ Tensor _convolution_mode_symint( } at::Tensor conv1d_padding_symint( - const Tensor& input_, const Tensor& weight, const c10::optional& bias, + const Tensor& input_, const Tensor& weight, const std::optional& bias, c10::SymIntArrayRef stride, c10::string_view padding, c10::SymIntArrayRef dilation, c10::SymInt groups) { auto [input, is_batched] = batchify(input_, /*num_spatial_dims=*/ 1, "conv1d"); @@ -1081,7 +1081,7 @@ at::Tensor conv1d_padding_symint( } at::Tensor conv2d_padding_symint( - const Tensor& input_, const Tensor& weight, const c10::optional& bias, + const Tensor& input_, const Tensor& weight, const std::optional& bias, c10::SymIntArrayRef stride, c10::string_view padding, c10::SymIntArrayRef dilation, c10::SymInt groups) { auto [input, is_batched] = batchify(input_, /*num_spatial_dims=*/ 2, "conv2d"); @@ -1095,7 +1095,7 @@ at::Tensor conv2d_padding_symint( } at::Tensor conv3d_padding_symint( - const Tensor& input_, const Tensor& weight, const c10::optional& bias, + const Tensor& input_, const Tensor& weight, const std::optional& bias, c10::SymIntArrayRef stride, c10::string_view padding, c10::SymIntArrayRef dilation, c10::SymInt groups) { auto [input, is_batched] = batchify(input_, /*num_spatial_dims=*/ 3, "conv3d"); @@ -1109,7 +1109,7 @@ at::Tensor conv3d_padding_symint( } at::Tensor conv_transpose1d_symint( - const Tensor& input_, const Tensor& weight, const c10::optional& bias_opt, + const Tensor& input_, const Tensor& weight, const std::optional& bias_opt, SymIntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef output_padding, c10::SymInt groups, SymIntArrayRef dilation) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); @@ -1128,7 +1128,7 @@ at::Tensor conv_transpose1d_symint( } at::Tensor conv_transpose2d_symint( - const Tensor& input_, const Tensor& weight, const c10::optional& bias_opt, + const Tensor& input_, const Tensor& weight, const std::optional& bias_opt, SymIntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef output_padding, c10::SymInt groups, SymIntArrayRef dilation) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); @@ -1147,7 +1147,7 @@ at::Tensor conv_transpose2d_symint( } at::Tensor conv_transpose3d_symint( - const Tensor& input_, const Tensor& weight, const c10::optional& bias_opt, + const Tensor& input_, const Tensor& weight, const std::optional& bias_opt, SymIntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef output_padding, c10::SymInt groups, SymIntArrayRef dilation) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); @@ -1166,7 +1166,7 @@ at::Tensor conv_transpose3d_symint( } at::Tensor convolution( - const Tensor& input, const Tensor& weight, const c10::optional& bias_opt, + const Tensor& input, const Tensor& weight, const std::optional& bias_opt, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool transposed, IntArrayRef output_padding, int64_t groups) { // See [Note: hacky wrapper removal for optional tensor] @@ -1182,7 +1182,7 @@ at::Tensor convolution( } at::Tensor convolution_overrideable( - const Tensor& input, const Tensor& weight, const c10::optional& bias_opt, + const Tensor& input, 
const Tensor& weight, const std::optional& bias_opt, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool transposed, IntArrayRef output_padding, int64_t groups) { TORCH_CHECK_NOT_IMPLEMENTED(false, "convolution_overrideable not implemented. You are likely triggering this with tensor backend other than CPU/CUDA/MKLDNN, if this is intended, please use TORCH_LIBRARY_IMPL to override this function "); @@ -1197,7 +1197,7 @@ template ConvBackend _select_conv_backend( const Tensor& input, const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const at::OptionalArrayRef bias_sizes_opt, const bool need_backward, const ConvParams& params) { @@ -1304,7 +1304,7 @@ ConvBackend _select_conv_backend( // Selects a backend for convolution based on the inputs and params. ConvBackend select_conv_backend( - const Tensor& input_r, const Tensor& weight_r, const c10::optional& bias_opt, + const Tensor& input_r, const Tensor& weight_r, const std::optional& bias_opt, SymIntArrayRef stride_, SymIntArrayRef padding_, SymIntArrayRef dilation_, bool transposed_, SymIntArrayRef output_padding_, c10::SymInt groups_, const at::OptionalSymIntArrayRef bias_sizes_opt) { c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); @@ -1339,7 +1339,7 @@ ConvBackend select_conv_backend( weight = view4d(weight); } - auto bias_sizes = bias.defined() ? c10::optional(bias.sym_sizes()) : bias_sizes_opt; + auto bias_sizes = bias.defined() ? std::optional(bias.sym_sizes()) : bias_sizes_opt; bool need_backward = GradMode::is_enabled() && (input.requires_grad() || weight.requires_grad() || (bias.defined() && bias.requires_grad())); return _select_conv_backend(input, weight, bias, bias_sizes, need_backward, params); @@ -1461,7 +1461,7 @@ at::MemoryFormat _determine_backend_memory_format( } at::Tensor _convolution( - const Tensor& input_r, const Tensor& weight_r, const c10::optional& bias_r_opt, + const Tensor& input_r, const Tensor& weight_r, const std::optional& bias_r_opt, IntArrayRef stride_, IntArrayRef padding_, IntArrayRef dilation_, bool transposed_, IntArrayRef output_padding_, int64_t groups_, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) { @@ -1504,7 +1504,7 @@ at::Tensor _convolution( } // Select appropriate backend to use. - auto bias_sizes_opt = bias.defined() ? c10::optional(bias.sizes()) : c10::nullopt; + auto bias_sizes_opt = bias.defined() ? 
std::optional(bias.sizes()) : c10::nullopt; bool need_backward = GradMode::is_enabled() && (input.requires_grad() || weight.requires_grad() || (bias.defined() && bias.requires_grad())); ConvBackend backend = _select_conv_backend(input, weight, bias, c10::OptionalIntArrayRef(bias_sizes_opt), need_backward, params); @@ -1701,7 +1701,7 @@ at::Tensor _convolution( } at::Tensor _convolution( - const Tensor& input_r, const Tensor& weight_r, const c10::optional& bias_r_opt, + const Tensor& input_r, const Tensor& weight_r, const std::optional& bias_r_opt, IntArrayRef stride_, IntArrayRef padding_, IntArrayRef dilation_, bool transposed_, IntArrayRef output_padding_, int64_t groups_, bool benchmark, bool deterministic, bool cudnn_enabled) @@ -1730,7 +1730,7 @@ static Tensor subvariable(const Tensor& var, int dim, int groups, int g) { return result; } -std::tuple _convolution_double_backward( const c10::optional& ggI_opt, const c10::optional& ggW_r_opt, const c10::optional& ggb_opt, +std::tuple _convolution_double_backward( const std::optional& ggI_opt, const c10::optional& ggW_r_opt, const c10::optional& ggb_opt, const Tensor& gO_r, const Tensor& weight_r, const Tensor& input, IntArrayRef stride_, IntArrayRef padding_, IntArrayRef dilation_, bool transposed_, IntArrayRef output_padding_, int64_t groups_, diff --git a/aten/src/ATen/native/ConvolutionMM2d.cpp b/aten/src/ATen/native/ConvolutionMM2d.cpp index 6f8a3477c239c..686948584c728 100644 --- a/aten/src/ATen/native/ConvolutionMM2d.cpp +++ b/aten/src/ATen/native/ConvolutionMM2d.cpp @@ -538,7 +538,7 @@ static void slow_conv2d_backward_weight_out_cpu_template( Tensor& slow_conv2d_forward_out_cpu( const Tensor& self, const Tensor& weight_, - IntArrayRef kernel_size, const c10::optional& bias_opt, + IntArrayRef kernel_size, const std::optional& bias_opt, IntArrayRef stride, IntArrayRef padding, Tensor& output) { @@ -627,7 +627,7 @@ Tensor& slow_conv2d_forward_out_cpu( Tensor slow_conv2d_forward_cpu( const Tensor& self, const Tensor& weight, - IntArrayRef kernel_size, const c10::optional& bias_opt, + IntArrayRef kernel_size, const std::optional& bias_opt, IntArrayRef stride, IntArrayRef padding) { // See [Note: hacky wrapper removal for optional tensor] @@ -726,7 +726,7 @@ std::tuple slow_conv2d_backward_cpu( return std::make_tuple(grad_input, grad_weight, grad_bias); } -Tensor & thnn_conv2d_out(const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const c10::optional& bias_opt, IntArrayRef stride, IntArrayRef padding, Tensor & output) { +Tensor & thnn_conv2d_out(const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const std::optional& bias_opt, IntArrayRef stride, IntArrayRef padding, Tensor & output) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); const Tensor& bias = *bias_maybe_owned; @@ -734,7 +734,7 @@ Tensor & thnn_conv2d_out(const Tensor & self, const Tensor & weight, IntArrayRef return at::_slow_conv2d_forward_out(output, self, weight, kernel_size, bias, stride, padding); } -Tensor thnn_conv2d(const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const c10::optional& bias_opt, IntArrayRef stride, IntArrayRef padding) { +Tensor thnn_conv2d(const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const std::optional& bias_opt, IntArrayRef stride, IntArrayRef padding) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); 
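// Illustrative sketch, not part of the diff: the borrow/deref idiom behind the
// "[Note: hacky wrapper removal for optional tensor]" comments in these signatures.
// borrow_from_optional_tensor() borrows the Tensor without a refcount bump when the
// optional is populated and yields an undefined Tensor otherwise, so the body only
// needs bias.defined(). Assumes a build where c10::optional is std::optional (the
// point of this change); add_bias_if_present is a made-up helper.
#include <ATen/ATen.h>
#include <optional>

at::Tensor add_bias_if_present(const at::Tensor& x,
                               const std::optional<at::Tensor>& bias_opt) {
  c10::MaybeOwned<at::Tensor> bias_maybe_owned =
      at::borrow_from_optional_tensor(bias_opt);
  const at::Tensor& bias = *bias_maybe_owned;  // undefined Tensor when bias_opt is nullopt
  return bias.defined() ? x + bias : x;
}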
const Tensor& bias = *bias_maybe_owned; diff --git a/aten/src/ATen/native/ConvolutionMM3d.cpp b/aten/src/ATen/native/ConvolutionMM3d.cpp index 1d5e7a8333def..f361b3a819129 100644 --- a/aten/src/ATen/native/ConvolutionMM3d.cpp +++ b/aten/src/ATen/native/ConvolutionMM3d.cpp @@ -553,7 +553,7 @@ static void slow_conv3d_backward_parameters_out_cpu_template( Tensor& slow_conv3d_forward_out_cpu(const Tensor& self, const Tensor& weight, - IntArrayRef kernel_size, const c10::optional& bias_opt, + IntArrayRef kernel_size, const std::optional& bias_opt, IntArrayRef stride, IntArrayRef padding, Tensor& output) { @@ -668,7 +668,7 @@ Tensor& slow_conv3d_forward_out_cpu(const Tensor& self, Tensor slow_conv3d_forward_cpu( const Tensor& self, const Tensor& weight, - IntArrayRef kernel_size, const c10::optional& bias_opt, + IntArrayRef kernel_size, const std::optional& bias_opt, IntArrayRef stride, IntArrayRef padding) { // See [Note: hacky wrapper removal for optional tensor] @@ -771,7 +771,7 @@ std::tuple slow_conv3d_backward_cpu( Tensor& slow_conv3d_out(const Tensor& self, const Tensor& weight, - IntArrayRef kernel_size, const c10::optional& bias_opt, + IntArrayRef kernel_size, const std::optional& bias_opt, IntArrayRef stride, IntArrayRef padding, Tensor& output) { @@ -792,7 +792,7 @@ Tensor& slow_conv3d_out(const Tensor& self, Tensor slow_conv3d( const Tensor& self, const Tensor& weight, - IntArrayRef kernel_size, const c10::optional& bias_opt, + IntArrayRef kernel_size, const std::optional& bias_opt, IntArrayRef stride, IntArrayRef padding) { // See [Note: hacky wrapper removal for optional tensor] diff --git a/aten/src/ATen/native/Copy.cpp b/aten/src/ATen/native/Copy.cpp index e6aa8493905d3..c5f81e98906dd 100644 --- a/aten/src/ATen/native/Copy.cpp +++ b/aten/src/ATen/native/Copy.cpp @@ -325,7 +325,7 @@ Tensor copy(const Tensor& self, const Tensor& src, bool non_blocking) { // Instead, generate an empty tensor with the right sizes/strides, since we should be able to assume // that copy_() will fully overwrite all data with that of src if (self_storage->nbytes() == 0) { - r = at::empty_strided(self.sizes(), self.strides()); + r = at::empty_strided(self.sizes(), self.strides(), self.options()); } else { r = clone_preserve_strides(self); } diff --git a/aten/src/ATen/native/Correlation.cpp b/aten/src/ATen/native/Correlation.cpp index 95384684961a4..5482a8e0a597a 100644 --- a/aten/src/ATen/native/Correlation.cpp +++ b/aten/src/ATen/native/Correlation.cpp @@ -24,8 +24,8 @@ namespace at::native { Tensor cov( const Tensor& self, int64_t correction, - const c10::optional& fweights, - const c10::optional& aweights) { + const std::optional& fweights, + const std::optional& aweights) { constexpr int64_t OBSERVATIONS_DIM = 1; TORCH_CHECK( diff --git a/aten/src/ATen/native/Cross.cpp b/aten/src/ATen/native/Cross.cpp index 99f0760fcc0f4..7297aaed80d38 100644 --- a/aten/src/ATen/native/Cross.cpp +++ b/aten/src/ATen/native/Cross.cpp @@ -40,7 +40,7 @@ namespace at::native { DEFINE_DISPATCH(cross_stub); -static int64_t _default_cross_dim(const c10::optional &dimension, SymIntArrayRef sizes) { +static int64_t _default_cross_dim(const std::optional &dimension, SymIntArrayRef sizes) { // If dimension is not given, it defaults to the first dimension found with the size 3. // Note that this behaviour might be unexpected. 
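// Illustrative sketch, not part of the diff: what that implicit default means in
// practice. For shape {2, 3, 3} the legacy at::cross picks dim=1 (the first size-3
// dimension), which is why callers are nudged toward linalg_cross with an explicit
// dim. Assumes ATen headers; cross_dim_example is a made-up name.
#include <ATen/ATen.h>

void cross_dim_example() {
  auto a = at::randn({2, 3, 3});
  auto b = at::randn({2, 3, 3});
  auto legacy  = at::cross(a, b);                     // warns once; uses dim=1
  auto precise = at::linalg_cross(a, b, /*dim=*/-1);  // explicit and unambiguous
}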
// _default_cross_dim is called internally inside the cross implementation to calculate @@ -57,7 +57,7 @@ static int64_t _default_cross_dim(const c10::optional &dimension, SymIn TORCH_CHECK(false, "no dimension of size 3 in input"); } -Tensor cross(const Tensor & input, const Tensor & other, const c10::optional dimension) { +Tensor cross(const Tensor & input, const Tensor & other, const std::optional dimension) { if (!dimension) { TORCH_WARN_ONCE( "Using torch.cross without specifying the dim arg is deprecated.\n", @@ -69,7 +69,7 @@ Tensor cross(const Tensor & input, const Tensor & other, const c10::optional dimension, Tensor & out) { +Tensor & cross_out(const Tensor & input, const Tensor & other, const std::optional dimension, Tensor & out) { auto dim = _default_cross_dim(dimension, input.sym_sizes()); return at::linalg_cross_out(out, input, other, dim); } diff --git a/aten/src/ATen/native/Distance.cpp b/aten/src/ATen/native/Distance.cpp index 5af87802a1246..942461c7612c1 100644 --- a/aten/src/ATen/native/Distance.cpp +++ b/aten/src/ATen/native/Distance.cpp @@ -78,7 +78,7 @@ Tensor _euclidean_dist(const Tensor& x1, const Tensor& x2) { return result; } -static Tensor cdist_impl(const Tensor& x1, const Tensor& x2, const double p, c10::optional compute_mode) { +static Tensor cdist_impl(const Tensor& x1, const Tensor& x2, const double p, std::optional compute_mode) { TORCH_CHECK(at::isFloatingType(x1.scalar_type()), "cdist only supports floating-point dtypes, X1 got: ", x1.scalar_type()); auto device1 = x1.device().type(); TORCH_CHECK(at::isFloatingType(x2.scalar_type()), "cdist only supports floating-point dtypes, X2 got: ", x2.scalar_type()); @@ -147,7 +147,7 @@ static Tensor cdist_impl(const Tensor& x1, const Tensor& x2, const double p, c10 return result; } -Tensor cdist(const Tensor& x1, const Tensor& x2, const double p, c10::optional compute_mode) { +Tensor cdist(const Tensor& x1, const Tensor& x2, const double p, std::optional compute_mode) { TORCH_CHECK(x1.dim() >= 2, "cdist only supports at least 2D tensors, X1 got: ", x1.dim(), "D"); TORCH_CHECK(x2.dim() >= 2, "cdist only supports at least 2D tensors, X2 got: ", x2.dim(), "D"); TORCH_CHECK(x1.sym_size(-1) == x2.sym_size(-1), "X1 and X2 must have the same number of columns. X1: ", x1.sym_size(-1), " X2: ", x2.sym_size(-1)); @@ -175,7 +175,7 @@ Tensor cdist(const Tensor& x1, const Tensor& x2, const double p, c10::optional compute_mode) { +Tensor _cdist_forward(const Tensor& x1, const Tensor& x2, const double p, std::optional compute_mode) { TORCH_CHECK(x1.dim() >= 2, "cdist only supports at least 2D tensors, X1 got: ", x1.dim(), "D"); TORCH_CHECK(x2.dim() >= 2, "cdist only supports at least 2D tensors, X2 got: ", x2.dim(), "D"); TORCH_CHECK(x1.size(-1) == x2.size(-1), "X1 and X2 must have the same number of columns. 
X1: ", x1.size(-1), " X2: ", x2.size(-1)); diff --git a/aten/src/ATen/native/DistributionTemplates.h b/aten/src/ATen/native/DistributionTemplates.h index a5ed9526c270d..ba72f0df11a0a 100644 --- a/aten/src/ATen/native/DistributionTemplates.h +++ b/aten/src/ATen/native/DistributionTemplates.h @@ -81,7 +81,7 @@ int64_t update_to(int64_t to) { } template class random_kernel, typename RNG> -at::Tensor& random_impl(at::Tensor& self, c10::optional generator) { +at::Tensor& random_impl(at::Tensor& self, std::optional generator) { CHECK_EMPTY_AND_RETURN(self); auto iter = at::TensorIterator::borrowing_nullary_op(self); random_kernel()(iter, generator); @@ -132,7 +132,7 @@ static void check_from_to_in_range(int64_t from, int64_t to_inc, caffe2::TypeMet } template class random_from_to_kernel, typename RNG> -at::Tensor& random_from_to_impl(at::Tensor& self, int64_t from, c10::optional to_opt, c10::optional generator) { +at::Tensor& random_from_to_impl(at::Tensor& self, int64_t from, std::optional to_opt, c10::optional generator) { uint64_t range = 0; auto iter = at::TensorIterator::borrowing_nullary_op(self); if (to_opt.has_value()) { @@ -200,7 +200,7 @@ at::Tensor& random_from_to_impl(at::Tensor& self, int64_t from, c10::optional= 0.0, "normal expects std >= 0.0, but found std ", std); template class normal_kernel, typename RNG> -Tensor& normal_impl_(Tensor& self, double mean, double std, c10::optional gen) { +Tensor& normal_impl_(Tensor& self, double mean, double std, std::optional gen) { CHECK_NORMAL_STD(std); CHECK_EMPTY_AND_RETURN(self); @@ -216,7 +216,7 @@ Tensor& normal_impl_(Tensor& self, double mean, double std, c10::optional class normal_kernel, typename RNG> -Tensor& normal_out_impl(Tensor& output, const Tensor& mean, double std, c10::optional gen) { +Tensor& normal_out_impl(Tensor& output, const Tensor& mean, double std, std::optional gen) { CHECK_NORMAL_STD(std); auto std_tensor = at::empty_like(output, MemoryFormat::Contiguous); auto shape = at::infer_size(mean.sizes(), std_tensor.sizes()); @@ -227,7 +227,7 @@ Tensor& normal_out_impl(Tensor& output, const Tensor& mean, double std, c10::opt } template class normal_kernel, typename RNG> -Tensor& normal_out_impl(Tensor& output, double mean, const Tensor& std, c10::optional gen) { +Tensor& normal_out_impl(Tensor& output, double mean, const Tensor& std, std::optional gen) { CHECK_NORMAL_TENSOR_STD(std); auto mean_tensor = at::full({}, mean, output.options()); auto shape = at::infer_size(mean_tensor.sizes(), std.sizes()); @@ -242,7 +242,7 @@ Tensor& normal_out_impl(Tensor& output, double mean, const Tensor& std, c10::opt } template class normal_kernel, typename RNG> -Tensor& normal_out_impl(Tensor& output, const Tensor& mean, const Tensor& std, c10::optional gen) { +Tensor& normal_out_impl(Tensor& output, const Tensor& mean, const Tensor& std, std::optional gen) { CHECK_NORMAL_TENSOR_STD(std); auto shape = at::infer_size(mean.sizes(), std.sizes()); at::native::resize_output(output, shape); @@ -256,7 +256,7 @@ Tensor& normal_out_impl(Tensor& output, const Tensor& mean, const Tensor& std, c } template class normal_kernel, typename RNG> -Tensor normal_impl(const Tensor& mean, double std, c10::optional gen) { +Tensor normal_impl(const Tensor& mean, double std, std::optional gen) { CHECK_NORMAL_STD(std); Tensor ret = at::empty_like(mean, MemoryFormat::Contiguous); normal_out_impl(ret, mean, std, gen); @@ -264,7 +264,7 @@ Tensor normal_impl(const Tensor& mean, double std, c10::optional gen) } template class normal_kernel, typename RNG> -Tensor 
normal_impl(double mean, const Tensor& std, c10::optional gen) { +Tensor normal_impl(double mean, const Tensor& std, std::optional gen) { CHECK_NORMAL_TENSOR_STD(std); Tensor ret = at::empty_like(std, MemoryFormat::Contiguous); normal_out_impl(ret, mean, std, gen); @@ -272,7 +272,7 @@ Tensor normal_impl(double mean, const Tensor& std, c10::optional gen) } template class normal_kernel, typename RNG> -Tensor normal_impl(const Tensor& mean, const Tensor& std, c10::optional gen) { +Tensor normal_impl(const Tensor& mean, const Tensor& std, std::optional gen) { CHECK_NORMAL_TENSOR_STD(std); auto shape = at::infer_size(mean.sizes(), std.sizes()); Tensor ret = at::empty(shape, mean.options(), MemoryFormat::Contiguous); @@ -283,7 +283,7 @@ Tensor normal_impl(const Tensor& mean, const Tensor& std, c10::optional class uniform_kernel, typename RNG> -at::Tensor& uniform_impl_(at::Tensor& self, double from, double to, c10::optional generator) { +at::Tensor& uniform_impl_(at::Tensor& self, double from, double to, std::optional generator) { if (self.is_complex()) { CHECK_EMPTY_AND_RETURN(self); auto float_tensor = at::view_as_real(self); @@ -313,7 +313,7 @@ at::Tensor& uniform_impl_(at::Tensor& self, double from, double to, c10::optiona // ================================================== LogNormal ======================================================= template class log_normal_kernel, typename RNG> -at::Tensor& log_normal_impl_(at::Tensor& self, double mean, double std, c10::optional gen) { +at::Tensor& log_normal_impl_(at::Tensor& self, double mean, double std, std::optional gen) { TORCH_CHECK(std > 0.0, "log_normal_ expects std > 0.0, but found std=", std); CHECK_EMPTY_AND_RETURN(self); auto iter = TensorIterator::borrowing_nullary_op(self); @@ -324,7 +324,7 @@ at::Tensor& log_normal_impl_(at::Tensor& self, double mean, double std, c10::opt // =================================================== Geometric ====================================================== template class geometric_kernel, typename RNG> -Tensor& geometric_impl_(Tensor& self, double p, c10::optional gen) { +Tensor& geometric_impl_(Tensor& self, double p, std::optional gen) { TORCH_CHECK(0 < p && p < 1, "geometric_ expects p to be in (0, 1), but got p=", p); CHECK_EMPTY_AND_RETURN(self); auto iter = TensorIterator::borrowing_nullary_op(self); @@ -335,7 +335,7 @@ Tensor& geometric_impl_(Tensor& self, double p, c10::optional gen) { // ================================================== Exponential ===================================================== template class exponential_kernel, typename RNG> -Tensor& exponential_impl_(Tensor& self, double lambda, c10::optional gen) { +Tensor& exponential_impl_(Tensor& self, double lambda, std::optional gen) { TORCH_CHECK(lambda > 0.0, "exponential_ expects lambda > 0.0, but found lambda=", lambda); CHECK_EMPTY_AND_RETURN(self); auto iter = TensorIterator::borrowing_nullary_op(self); @@ -346,7 +346,7 @@ Tensor& exponential_impl_(Tensor& self, double lambda, c10::optional // ==================================================== Cauchy ======================================================== template class cauchy_kernel, typename RNG> -Tensor& cauchy_impl_(Tensor& self, double median, double sigma, c10::optional gen) { +Tensor& cauchy_impl_(Tensor& self, double median, double sigma, std::optional gen) { // TODO: instead of variable name 'sigma', use 'gamma' or 'scale' // the variance, squared sigma, is undefined for cauchy distribution TORCH_CHECK(sigma > 0.0, "cauchy_ expects sigma > 0.0, but found 
sigma=", sigma); @@ -360,7 +360,7 @@ Tensor& cauchy_impl_(Tensor& self, double median, double sigma, c10::optional class bernoulli_tensor_kernel, typename RNG> -Tensor& bernoulli_impl_(Tensor& self, const Tensor& p_, c10::optional gen) { +Tensor& bernoulli_impl_(Tensor& self, const Tensor& p_, std::optional gen) { CHECK_EMPTY_AND_RETURN(self); NoNamesGuard guard; at::assert_no_internal_overlap(self); @@ -369,7 +369,7 @@ Tensor& bernoulli_impl_(Tensor& self, const Tensor& p_, c10::optional } template class bernoulli_scalar_kernel, typename RNG> -Tensor& bernoulli_impl_(Tensor& self, double p, c10::optional gen) { +Tensor& bernoulli_impl_(Tensor& self, double p, std::optional gen) { TORCH_CHECK(0 <= p && p <= 1, "bernoulli_ expects p to be in [0, 1], but got p=", p); CHECK_EMPTY_AND_RETURN(self); at::assert_no_internal_overlap(self); @@ -378,7 +378,7 @@ Tensor& bernoulli_impl_(Tensor& self, double p, c10::optional gen) { } template class bernoulli_tensor_kernel, typename RNG> -Tensor& bernoulli_out_impl(Tensor& result, const Tensor& self, c10::optional gen) { +Tensor& bernoulli_out_impl(Tensor& result, const Tensor& self, std::optional gen) { // result.resize_as_(self) requires self to have same dtype as result, so we // use resize_ instead. // TODO: Fix resize_as_. See pytorch/pytorch#11665. diff --git a/aten/src/ATen/native/Distributions.cpp b/aten/src/ATen/native/Distributions.cpp index 4d4eb2efaf401..7ecb8ebb9ffc8 100644 --- a/aten/src/ATen/native/Distributions.cpp +++ b/aten/src/ATen/native/Distributions.cpp @@ -160,36 +160,36 @@ DEFINE_DISPATCH(random_full_64_bits_range_stub); template struct BernoulliStub { - void operator()(Tensor& self, const Tensor& p_, c10::optional gen) { + void operator()(Tensor& self, const Tensor& p_, std::optional gen) { bernoulli_tensor_stub(self.device().type(), self, p_, gen); } - void operator()(Tensor& self, double p, c10::optional gen) { + void operator()(Tensor& self, double p, std::optional gen) { bernoulli_scalar_stub(self.device().type(), self, p, gen); } }; -Tensor bernoulli(const Tensor& self, c10::optional gen) { +Tensor bernoulli(const Tensor& self, std::optional gen) { Tensor result = at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT); result.bernoulli_(self, std::move(gen)); return result; } -Tensor bernoulli(const Tensor& self, double p, c10::optional gen) { +Tensor bernoulli(const Tensor& self, double p, std::optional gen) { Tensor result = at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT); result.bernoulli_(p, std::move(gen)); return result; } -Tensor& bernoulli_out(const Tensor& self, c10::optional gen, Tensor& result) { +Tensor& bernoulli_out(const Tensor& self, std::optional gen, Tensor& result) { return at::native::templates::bernoulli_out_impl(result, self, std::move(gen)); } -Tensor& bernoulli_(Tensor& self, const Tensor& p_, c10::optional gen) { +Tensor& bernoulli_(Tensor& self, const Tensor& p_, std::optional gen) { return at::native::templates::bernoulli_impl_(self, p_, std::move(gen)); } -Tensor& bernoulli_(Tensor& self, double p, c10::optional gen) { +Tensor& bernoulli_(Tensor& self, double p, std::optional gen) { return at::native::templates::bernoulli_impl_(self, p, std::move(gen)); } @@ -197,12 +197,12 @@ Tensor& bernoulli_(Tensor& self, double p, c10::optional gen) { template struct LogNormalStub { - void operator()(TensorIteratorBase& iter, double mean, double std, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double mean, double std, std::optional gen) { log_normal_stub(iter.device_type(), iter, 
mean, std, gen); } }; -Tensor& log_normal_(Tensor& self, double mean, double std, c10::optional gen) { +Tensor& log_normal_(Tensor& self, double mean, double std, std::optional gen) { return at::native::templates::log_normal_impl_(self, mean, std, std::move(gen)); } @@ -210,12 +210,12 @@ Tensor& log_normal_(Tensor& self, double mean, double std, c10::optional struct CauchyStub { - void operator()(TensorIteratorBase& iter, double median, double sigma, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double median, double sigma, std::optional gen) { cauchy_stub(iter.device_type(), iter, median, sigma, gen); } }; -Tensor& cauchy_(Tensor& self, double median, double sigma, c10::optional gen) { +Tensor& cauchy_(Tensor& self, double median, double sigma, std::optional gen) { return at::native::templates::cauchy_impl_(self, median, sigma, std::move(gen)); } @@ -223,12 +223,12 @@ Tensor& cauchy_(Tensor& self, double median, double sigma, c10::optional struct ExponentialStub { - void operator()(TensorIteratorBase& iter, double lambda, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double lambda, std::optional gen) { exponential_stub(iter.device_type(), iter, lambda, gen); } }; -Tensor& exponential_(Tensor& self, double lambda, c10::optional gen) { +Tensor& exponential_(Tensor& self, double lambda, std::optional gen) { return at::native::templates::exponential_impl_(self, lambda, std::move(gen)); } @@ -236,12 +236,12 @@ Tensor& exponential_(Tensor& self, double lambda, c10::optional gen) template struct GeometricStub { - void operator()(TensorIteratorBase& iter, double p, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double p, std::optional gen) { geometric_stub(iter.device_type(), iter, p, gen); } }; -Tensor& geometric_(Tensor& self, double p, c10::optional gen) { +Tensor& geometric_(Tensor& self, double p, std::optional gen) { return at::native::templates::geometric_impl_(self, p, std::move(gen)); } @@ -249,7 +249,7 @@ Tensor& geometric_(Tensor& self, double p, c10::optional gen) { template struct UniformStub { - void operator()(TensorIteratorBase& iter, double from, double to, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double from, double to, std::optional gen) { uniform_stub(iter.device_type(), iter, from, to, gen); } }; @@ -257,15 +257,15 @@ struct UniformStub { template struct UniformMeta { // No-op! - void operator()(TensorIteratorBase& iter, double from, double to, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double from, double to, std::optional gen) { } }; -Tensor& uniform_(Tensor& self, double from, double to, c10::optional gen) { +Tensor& uniform_(Tensor& self, double from, double to, std::optional gen) { return at::native::templates::uniform_impl_(self, from, to, std::move(gen)); } -Tensor& uniform_meta_(Tensor& self, double from, double to, c10::optional gen) { +Tensor& uniform_meta_(Tensor& self, double from, double to, std::optional gen) { return at::native::templates::uniform_impl_(self, from, to, std::move(gen)); } @@ -273,7 +273,7 @@ Tensor& uniform_meta_(Tensor& self, double from, double to, c10::optional struct NormalStub { - void operator()(Tensor& self, double mean, double std, c10::optional gen) { + void operator()(Tensor& self, double mean, double std, std::optional gen) { normal_stub(self.device().type(), self, mean, std, gen); } }; @@ -281,76 +281,76 @@ struct NormalStub { template struct NormalMeta { // No-op! 
- void operator()(Tensor& self, double mean, double std, c10::optional gen) { + void operator()(Tensor& self, double mean, double std, std::optional gen) { } }; // inplace -Tensor& normal_(Tensor& self, double mean, double std, c10::optional gen) { +Tensor& normal_(Tensor& self, double mean, double std, std::optional gen) { return at::native::templates::normal_impl_(self, mean, std, std::move(gen)); } -Tensor& normal_meta_(Tensor& self, double mean, double std, c10::optional gen) { +Tensor& normal_meta_(Tensor& self, double mean, double std, std::optional gen) { return at::native::templates::normal_impl_(self, mean, std, std::move(gen)); } // out tensor float -Tensor& normal_out(const Tensor& mean, double std, c10::optional gen, Tensor& output) { +Tensor& normal_out(const Tensor& mean, double std, std::optional gen, Tensor& output) { return at::native::templates::normal_out_impl(output, mean, std, std::move(gen)); } -Tensor& normal_out_meta(const Tensor& mean, double std, c10::optional gen, Tensor& output) { +Tensor& normal_out_meta(const Tensor& mean, double std, std::optional gen, Tensor& output) { return at::native::templates::normal_out_impl(output, mean, std, std::move(gen)); } // out float tensor -Tensor& normal_out(double mean, const Tensor& std, c10::optional gen, Tensor& output) { +Tensor& normal_out(double mean, const Tensor& std, std::optional gen, Tensor& output) { return at::native::templates::normal_out_impl(output, mean, std, std::move(gen)); } -Tensor& normal_out_meta(double mean, const Tensor& std, c10::optional gen, Tensor& output) { +Tensor& normal_out_meta(double mean, const Tensor& std, std::optional gen, Tensor& output) { return at::native::templates::normal_out_impl(output, mean, std, std::move(gen)); } // out tensor tensor -Tensor& normal_out(const Tensor& mean, const Tensor& std, c10::optional gen, Tensor& output) { +Tensor& normal_out(const Tensor& mean, const Tensor& std, std::optional gen, Tensor& output) { return at::native::templates::normal_out_impl(output, mean, std, std::move(gen)); } -Tensor& normal_out_meta(const Tensor& mean, const Tensor& std, c10::optional gen, Tensor& output) { +Tensor& normal_out_meta(const Tensor& mean, const Tensor& std, std::optional gen, Tensor& output) { return at::native::templates::normal_out_impl(output, mean, std, std::move(gen)); } // functional tensor float -Tensor normal(const Tensor& mean, double std, c10::optional gen) { +Tensor normal(const Tensor& mean, double std, std::optional gen) { return at::native::templates::normal_impl(mean, std, std::move(gen)); } -Tensor normal_meta(const Tensor& mean, double std, c10::optional gen) { +Tensor normal_meta(const Tensor& mean, double std, std::optional gen) { return at::native::templates::normal_impl(mean, std, std::move(gen)); } // functional float tensor -Tensor normal(double mean, const Tensor& std, c10::optional gen) { +Tensor normal(double mean, const Tensor& std, std::optional gen) { return at::native::templates::normal_impl(mean, std, std::move(gen)); } -Tensor normal_meta(double mean, const Tensor& std, c10::optional gen) { +Tensor normal_meta(double mean, const Tensor& std, std::optional gen) { return at::native::templates::normal_impl(mean, std, std::move(gen)); } // functional tensor tensor -Tensor normal(const Tensor& mean, const Tensor& std, c10::optional gen) { +Tensor normal(const Tensor& mean, const Tensor& std, std::optional gen) { return at::native::templates::normal_impl(mean, std, std::move(gen)); } -Tensor normal_meta(const Tensor& mean, const Tensor& std, 
c10::optional gen) { +Tensor normal_meta(const Tensor& mean, const Tensor& std, std::optional gen) { return at::native::templates::normal_impl(mean, std, std::move(gen)); } // functional variant, only used by the functionalization pass. -Tensor normal_functional(const Tensor& self, double mean, double std, c10::optional generator) { +Tensor normal_functional(const Tensor& self, double mean, double std, std::optional generator) { return self.clone().normal_(mean, std, std::move(generator)); } @@ -358,44 +358,44 @@ Tensor normal_functional(const Tensor& self, double mean, double std, c10::optio template struct RandomStub { - void operator()(TensorIteratorBase& iter, c10::optional gen) { + void operator()(TensorIteratorBase& iter, std::optional gen) { random_stub(iter.device_type(), iter, gen); } }; -Tensor& random_(Tensor& self, c10::optional gen) { +Tensor& random_(Tensor& self, std::optional gen) { return at::native::templates::random_impl(self, std::move(gen)); } template struct RandomFromToStub { - void operator()(TensorIteratorBase& iter, uint64_t range, int64_t from, c10::optional gen) { + void operator()(TensorIteratorBase& iter, uint64_t range, int64_t from, std::optional gen) { random_from_to_stub(iter.device_type(), iter, range, from, gen); } - void operator()(TensorIteratorBase& iter, c10::optional gen) { + void operator()(TensorIteratorBase& iter, std::optional gen) { random_full_64_bits_range_stub(iter.device_type(), iter, gen); } }; -Tensor& random_(Tensor& self, int64_t from, optional to, c10::optional gen) { +Tensor& random_(Tensor& self, int64_t from, optional to, std::optional gen) { return at::native::templates::random_from_to_impl(self, from, to, std::move(gen)); } -Tensor& random_(Tensor& self, int64_t to, c10::optional gen) { +Tensor& random_(Tensor& self, int64_t to, std::optional gen) { return random_(self, 0, to, std::move(gen)); } -Tensor& random_meta_(Tensor& self, c10::optional gen) { +Tensor& random_meta_(Tensor& self, std::optional gen) { // No error checking yay return self; } -Tensor& random_meta_(Tensor& self, int64_t from, optional to, c10::optional gen) { +Tensor& random_meta_(Tensor& self, int64_t from, optional to, std::optional gen) { // No error checking yay return self; } -Tensor& random_meta_(Tensor& self, int64_t to, c10::optional gen) { +Tensor& random_meta_(Tensor& self, int64_t to, std::optional gen) { // No error checking yay return self; } @@ -437,7 +437,7 @@ Tensor _dirichlet_grad_cpu(const Tensor& x, const Tensor& alpha, const Tensor& t * This section is a counterpart to Distributions.cu */ -Tensor _s_binomial_cpu(const Tensor& count, const Tensor& prob, c10::optional gen) { +Tensor _s_binomial_cpu(const Tensor& count, const Tensor& prob, std::optional gen) { Tensor ret = at::zeros(count.sizes(), count.options()); auto iter = TensorIteratorConfig() .add_output(ret) @@ -462,7 +462,7 @@ Tensor _s_binomial_cpu(const Tensor& count, const Tensor& prob, c10::optional gen) { +Tensor _s_poisson_cpu(const Tensor& lambda, std::optional gen) { Tensor ret = at::zeros(lambda.sizes(), lambda.options()); auto iter = TensorIteratorConfig() .add_output(ret) @@ -479,7 +479,7 @@ Tensor _s_poisson_cpu(const Tensor& lambda, c10::optional gen) { return ret; } -Tensor _s_gamma_cpu(const Tensor& alpha, c10::optional gen) { +Tensor _s_gamma_cpu(const Tensor& alpha, std::optional gen) { Tensor ret = at::zeros(alpha.sizes(), alpha.options()); auto iter = TensorIteratorConfig() .add_output(ret) @@ -509,7 +509,7 @@ Tensor _s_gamma_cpu(const Tensor& alpha, c10::optional 
gen) { return ret; } -Tensor _s_dirichlet_cpu(const Tensor& alpha, c10::optional gen) { +Tensor _s_dirichlet_cpu(const Tensor& alpha, std::optional gen) { Tensor ret = at::zeros(alpha.sizes(), alpha.options()); AT_DISPATCH_FLOATING_TYPES(ret.scalar_type(), "dirichlet", [&] { Tensor gamma = at::zeros(alpha.sizes(), alpha.options().dtype(ScalarType::Double)); @@ -562,7 +562,7 @@ constexpr int64_t FLOAT32_MAX_CONSECUTIVE_INT = 1 << (FLT_MANT_DIG); Tensor& multinomial_out(const Tensor& self, int64_t n_sample, bool with_replacement, - c10::optional gen, + std::optional gen, Tensor& result) { TORCH_CHECK( result.device() == self.device(), @@ -647,7 +647,7 @@ Tensor multinomial( const Tensor& self, int64_t n_sample, bool with_replacement, - c10::optional gen) { + std::optional gen) { Tensor result = at::empty({0}, self.options().dtype(kLong)); native::multinomial_out(self, n_sample, with_replacement, std::move(gen), result); return result; diff --git a/aten/src/ATen/native/Dropout.cpp b/aten/src/ATen/native/Dropout.cpp index 7014ec65d1f5a..8a5d4a702a0ca 100644 --- a/aten/src/ATen/native/Dropout.cpp +++ b/aten/src/ATen/native/Dropout.cpp @@ -102,7 +102,7 @@ ALIAS_SPECIALIZATION(_feature_alpha_dropout, true, true ) } // anonymous namespace std::tuple -native_dropout_cpu(const Tensor& input, double p, c10::optional train) { +native_dropout_cpu(const Tensor& input, double p, std::optional train) { if (input.numel() == 0) { return std::make_tuple(input, at::empty_like(input, input.options())); } diff --git a/aten/src/ATen/native/Embedding.cpp b/aten/src/ATen/native/Embedding.cpp index 705b08ab39f06..b0c4644e579c2 100644 --- a/aten/src/ATen/native/Embedding.cpp +++ b/aten/src/ATen/native/Embedding.cpp @@ -88,7 +88,7 @@ Tensor embedding_sparse_backward( Tensor indices = indices_; Tensor grad = grad_; if (padding_idx != -1) { - c10::List> c({indices != padding_idx}); + c10::List> c({indices != padding_idx}); indices = indices.index(c); grad = grad.index(c); } diff --git a/aten/src/ATen/native/EmbeddingBag.cpp b/aten/src/ATen/native/EmbeddingBag.cpp index 8b6c90dae2375..216fad05dc07f 100644 --- a/aten/src/ATen/native/EmbeddingBag.cpp +++ b/aten/src/ATen/native/EmbeddingBag.cpp @@ -103,7 +103,7 @@ bool is_fast_path_index_select_scale(const Tensor& src, const Tensor& scale, Ten } template -bool is_fast_path(const Tensor& src, const c10::optional& scale, Tensor& output, index_t padding_idx) { +bool is_fast_path(const Tensor& src, const std::optional& scale, Tensor& output, index_t padding_idx) { return (scale.has_value() && scale.value().defined()) ? is_fast_path_index_select_scale(src, scale.value(), output, padding_idx) : is_fast_path_index_select(src, output, padding_idx); @@ -891,7 +891,7 @@ void check_arguments( const Tensor& indices, const Tensor& offsets, const int64_t mode, - const c10::optional& per_sample_weights, + const std::optional& per_sample_weights, bool include_last_offset) { auto indices_arg = TensorArg(indices, "indices", 1); checkScalarTypes("embedding_bag", indices_arg, {kLong, kInt}); @@ -985,7 +985,7 @@ void make_offset2bag_out( const Tensor& indices, const Tensor& offsets, const int64_t mode, - const c10::optional& per_sample_weights, + const std::optional& per_sample_weights, const int64_t padding_idx) { // To save compute, if we are going to go down the fast path case for the 'sum' // mode, we skip calculating offset2bag, since it is not going to be used. 
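// Illustrative sketch, not part of the diff: the convention behind the
// std::optional<Tensor> per_sample_weights arguments in the embedding_bag paths
// above. An argument may be absent (std::nullopt) or present but undefined, and
// both mean "no weights", which is why call sites test has_value() and defined()
// together. has_per_sample_weights is a made-up helper for illustration.
#include <ATen/ATen.h>
#include <optional>

bool has_per_sample_weights(const std::optional<at::Tensor>& per_sample_weights) {
  return per_sample_weights.has_value() && per_sample_weights->defined();
}

void usage() {
  bool none      = has_per_sample_weights(std::nullopt);   // false
  bool undefined = has_per_sample_weights(at::Tensor());   // false: present but undefined
  bool weighted  = has_per_sample_weights(at::ones({4}));  // true
}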
@@ -1040,7 +1040,7 @@ static Tensor make_offset2bag( const Tensor& indices, const Tensor& offsets, const int64_t mode, - const c10::optional& per_sample_weights, + const std::optional& per_sample_weights, const int64_t padding_idx) { Tensor offset2bag = at::empty({0}, offsets.options()); make_offset2bag_out(offset2bag, output, weight, indices, offsets, mode, per_sample_weights, padding_idx); @@ -1144,7 +1144,7 @@ void _embedding_bag_cpu_impl_out(Tensor& output, Tensor& offset2bag, Tensor& bag_size, Tensor* max_indices, const Tensor &weight, const Tensor &indices, const Tensor &offsets, const int64_t mode, - const c10::optional& per_sample_weights, + const std::optional& per_sample_weights, bool include_last_offset, int64_t padding_idx, _EmbeddingBagKernelCache* fbgemm_kernel_cache) { if (mode == MODE_MEAN || mode == MODE_SUM) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, weight.scalar_type(), "embedding_bag_no_grad_cpu_out", @@ -1241,8 +1241,8 @@ static std::tuple _embedding_bag_cpu_impl( std::tuple embedding_bag(const Tensor &weight, const Tensor &indices, const Tensor &offsets, const bool scale_grad_by_freq, - const int64_t mode, bool sparse, const c10::optional& per_sample_weights_opt, - bool include_last_offset, c10::optional padding_idx_opt) { + const int64_t mode, bool sparse, const std::optional& per_sample_weights_opt, + bool include_last_offset, std::optional padding_idx_opt) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned per_sample_weights_maybe_owned = at::borrow_from_optional_tensor(per_sample_weights_opt); const Tensor& per_sample_weights = *per_sample_weights_maybe_owned; @@ -1273,7 +1273,7 @@ embedding_bag(const Tensor &weight, const Tensor &indices, std::tuple embedding_bag(const Tensor &weight, const Tensor &indices, const Tensor &offsets, const bool scale_grad_by_freq, - const int64_t mode, bool sparse, const c10::optional& per_sample_weights_opt, + const int64_t mode, bool sparse, const std::optional& per_sample_weights_opt, bool include_last_offset) { return at::native::embedding_bag(weight, indices, offsets, scale_grad_by_freq, mode, sparse, per_sample_weights_opt, include_last_offset, c10::nullopt); @@ -1284,7 +1284,7 @@ embedding_bag(const Tensor &weight, const Tensor &indices, std::tuple _embedding_bag_forward_only_cpu(const Tensor &weight, const Tensor &indices, const Tensor &offsets, const bool scale_grad_by_freq, - const int64_t mode, bool sparse, const c10::optional& per_sample_weights_opt, bool include_last_offset, + const int64_t mode, bool sparse, const std::optional& per_sample_weights_opt, bool include_last_offset, int64_t padding_idx) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned per_sample_weights_maybe_owned = at::borrow_from_optional_tensor(per_sample_weights_opt); @@ -1307,7 +1307,7 @@ _embedding_bag_forward_only_cpu(const Tensor &weight, const Tensor &indices, std::tuple _embedding_bag_cpu(const Tensor &weight, const Tensor &indices, const Tensor &offsets, const bool scale_grad_by_freq, - const int64_t mode, bool sparse, const c10::optional& per_sample_weights_opt, bool include_last_offset, + const int64_t mode, bool sparse, const std::optional& per_sample_weights_opt, bool include_last_offset, int64_t padding_idx) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned per_sample_weights_maybe_owned = at::borrow_from_optional_tensor(per_sample_weights_opt); @@ -1337,9 +1337,9 @@ void _embedding_bag_cpu_out( const bool /* scale_grad_by_freq 
*/, const int64_t mode, const bool /* sparse */, - const c10::optional& per_sample_weights, + const std::optional& per_sample_weights, const bool include_last_offset, - const c10::optional& padding_idx, + const std::optional& padding_idx, _EmbeddingBagKernelCache* fbgemm_kernel_cache) { auto [indicesMaybeOwned, offsetsMaybeOwned] = promoteIndicesAndOffsets(indices_, offsets_); const auto& indices = *indicesMaybeOwned; @@ -1393,7 +1393,7 @@ Tensor _embedding_bag_backward(const Tensor &grad, const Tensor &indices_, const Tensor &max_indices_, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, - bool sparse, const c10::optional& per_sample_weights_opt, + bool sparse, const std::optional& per_sample_weights_opt, int64_t padding_idx) { return at::native::_embedding_bag_backward_symint( grad, indices_, offsets_, offset2bag, bag_size_, max_indices_, num_weights, scale_grad_by_freq, mode, sparse, per_sample_weights_opt, padding_idx); @@ -1408,7 +1408,7 @@ Tensor _embedding_bag_backward_symint(const Tensor &grad, const Tensor &indices_ const Tensor &max_indices_, c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, - bool sparse, const c10::optional& per_sample_weights_opt, + bool sparse, const std::optional& per_sample_weights_opt, int64_t padding_idx) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned per_sample_weights_maybe_owned = at::borrow_from_optional_tensor(per_sample_weights_opt); @@ -1610,7 +1610,7 @@ Tensor _embedding_bag_dense_backward_cpu(const Tensor &grad_, const Tensor &indi const Tensor &offset2bag__, const Tensor &bag_size_, const Tensor& max_indices_, int64_t num_weights, - bool scale_grad_by_freq, int64_t mode, const c10::optional& per_sample_weights__opt, + bool scale_grad_by_freq, int64_t mode, const std::optional& per_sample_weights__opt, int64_t padding_idx) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned per_sample_weights__maybe_owned = at::borrow_from_optional_tensor(per_sample_weights__opt); @@ -1765,7 +1765,7 @@ Tensor _embedding_bag_per_sample_weights_backward_cpu( Tensor _embedding_bag_sparse_backward_symint( const Tensor &grad_, const Tensor &indices, const Tensor &offsets, const Tensor &offset2bag, const Tensor &bag_size_, SymInt num_weights, - bool scale_grad_by_freq, int64_t mode, const c10::optional& per_sample_weights_opt, + bool scale_grad_by_freq, int64_t mode, const std::optional& per_sample_weights_opt, int64_t padding_idx) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned per_sample_weights_maybe_owned = at::borrow_from_optional_tensor(per_sample_weights_opt); diff --git a/aten/src/ATen/native/EmbeddingBag.h b/aten/src/ATen/native/EmbeddingBag.h index c2e61f280bf59..796127f0441ee 100644 --- a/aten/src/ATen/native/EmbeddingBag.h +++ b/aten/src/ATen/native/EmbeddingBag.h @@ -13,7 +13,7 @@ void check_arguments( const Tensor& indices, const Tensor& offsets, const int64_t mode, - const c10::optional& per_sample_weights, + const std::optional& per_sample_weights, bool include_last_offset); void make_bag_size_out( @@ -40,7 +40,7 @@ void make_offset2bag_out( const Tensor& indices, const Tensor& offsets, const int64_t mode, - const c10::optional& per_sample_weights, + const std::optional& per_sample_weights, const int64_t padding_idx = -1); #ifdef USE_FBGEMM @@ -64,7 +64,7 @@ struct _CallbackAndBlockSize { _CallbackAndBlockSize() = default; - explicit _CallbackAndBlockSize(c10::optional maybe_block_size) + explicit _CallbackAndBlockSize(std::optional maybe_block_size) : 
blockSize(maybe_block_size.value_or(-1)) , callback(maybe_block_size.has_value() ? generateCallback(maybe_block_size.value()) : nullptr) {} @@ -75,7 +75,7 @@ struct _EmbeddingBagKernelCacheImpl : private StorageMixins... { _EmbeddingBagKernelCacheImpl() = default; // use each of the mixins to store corresponding kernel and block size - explicit _EmbeddingBagKernelCacheImpl(c10::optional maybe_block_size) + explicit _EmbeddingBagKernelCacheImpl(std::optional maybe_block_size) : StorageMixins(maybe_block_size)... {} @@ -107,7 +107,7 @@ using _EmbeddingBagKernelCache = _EmbeddingBagKernelCacheImpl< _CallbackAndBlockSize>; #else struct _EmbeddingBagKernelCache { - explicit _EmbeddingBagKernelCache(c10::optional /* maybe_block_size */) {} + explicit _EmbeddingBagKernelCache(std::optional /* maybe_block_size */) {} }; #endif @@ -115,7 +115,7 @@ void _embedding_bag_cpu_impl_out(Tensor& output, Tensor& offset2bag, Tensor& bag_size, Tensor* max_indices, const Tensor &weight, const Tensor &indices, const Tensor &offsets, const int64_t mode = 0, - const c10::optional& per_sample_weights = c10::nullopt, + const std::optional& per_sample_weights = c10::nullopt, bool include_last_offset = false, int64_t padding_idx = -1, _EmbeddingBagKernelCache* fbgemm_kernel_cache = nullptr); @@ -131,9 +131,9 @@ void _embedding_bag_cpu_out( const bool scale_grad_by_freq, const int64_t mode, const bool sparse, - const c10::optional& per_sample_weights, + const std::optional& per_sample_weights, const bool include_last_offset, - const c10::optional& padding_idx, + const std::optional& padding_idx, _EmbeddingBagKernelCache* fbgemm_kernel_cache = nullptr); } // namespace at::native diff --git a/aten/src/ATen/native/ForeachUtils.h b/aten/src/ATen/native/ForeachUtils.h index f44ae1179de8f..0839dd9a1560c 100644 --- a/aten/src/ATen/native/ForeachUtils.h +++ b/aten/src/ATen/native/ForeachUtils.h @@ -258,7 +258,7 @@ inline bool can_use_fast_route( using DeviceDtypeKey = std::pair; using IndicesT = std::vector; using nested_optional_tensorvec_t = - std::vector>>; + std::vector>>; using TensorsAndIndicesT = std::pair; using FlatMap = std::unordered_map< DeviceDtypeKey, @@ -339,7 +339,7 @@ inline FlatMap _group_tensors_by_first_tensors_device_and_dtype( nested_optional_tensorvec_t nested_tensorvec; nested_tensorvec.reserve(num_lists); for (const auto& i : c10::irange(num_lists)) { - std::vector> tensors; + std::vector> tensors; if (!nested_tensorlist[i].empty()) { // NB: num_tensors is the max possible length for any of // the inner lists of tensor references. Reserving the max diff --git a/aten/src/ATen/native/FusedAdagrad.cpp b/aten/src/ATen/native/FusedAdagrad.cpp deleted file mode 100644 index 1c5f553e6854c..0000000000000 --- a/aten/src/ATen/native/FusedAdagrad.cpp +++ /dev/null @@ -1,59 +0,0 @@ -#define TORCH_ASSERT_ONLY_METHOD_OPERATORS -#include -#include -#include - -#ifndef AT_PER_OPERATOR_HEADERS -#include -#include -#else -#include -#include -#endif -namespace at { - -namespace native { - -void _fused_adagrad_kernel_cpu_( - at::TensorList params, - at::TensorList grads, - at::TensorList state_sums, - at::TensorList state_steps, - const double lr, - const double lr_decay, - const double weight_decay, - const double eps, - const bool maximize, - const c10::optional& grad_scale, - const c10::optional& found_inf) { - const float* grad_scale_ptr = - grad_scale.has_value() ? grad_scale->data_ptr() : nullptr; - const float* found_inf_ptr = - found_inf.has_value() ? 
found_inf->data_ptr() : nullptr; - if (found_inf_ptr && *found_inf_ptr == 1.0) { - return; - } - size_t n_tensors = params.size(); - TORCH_CHECK(grads.size() == n_tensors); - TORCH_CHECK(state_sums.size() == n_tensors); - TORCH_CHECK(state_steps.size() == n_tensors); - for (size_t i = 0; i < n_tensors; i++){ - fused_adagrad_stub( - kCPU, - params[i], - grads[i], - state_sums[i], - state_steps[i], - lr, - lr_decay, - weight_decay, - eps, - maximize, - grad_scale_ptr); - } -} - -DEFINE_DISPATCH(fused_adagrad_stub); - -} -} diff --git a/aten/src/ATen/native/FusedAdagrad.h b/aten/src/ATen/native/FusedAdagrad.h deleted file mode 100644 index 395cbdd43aa81..0000000000000 --- a/aten/src/ATen/native/FusedAdagrad.h +++ /dev/null @@ -1,23 +0,0 @@ -#include -#include - -namespace at { - -namespace native { - -using fused_adagrad_fn = void (*)( - const at::Tensor& param, - const at::Tensor& grad, - const at::Tensor& state_sum, - const at::Tensor& state_step, - const double lr, - const double lr_decay, - const double weight_decay, - const double eps, - const bool maximize, - const float* grad_scale_ptr); - -DECLARE_DISPATCH(fused_adagrad_fn, fused_adagrad_stub); - -} -} diff --git a/aten/src/ATen/native/FusedAdam.cpp b/aten/src/ATen/native/FusedAdam.cpp index b3be769b24f18..41ef04b02d548 100644 --- a/aten/src/ATen/native/FusedAdam.cpp +++ b/aten/src/ATen/native/FusedAdam.cpp @@ -30,8 +30,8 @@ void _fused_adam_kernel_cpu_( const double eps, const bool amsgrad, const bool maximize, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { const float* grad_scale_ptr = grad_scale.has_value() ? grad_scale->data_ptr() : nullptr; const float* found_inf_ptr = @@ -87,8 +87,8 @@ void _fused_adam_kernel_cpu_( const double eps, const bool amsgrad, const bool maximize, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { _fused_adam_kernel_cpu_(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, lr.item(), beta1, beta2, weight_decay, eps, amsgrad, maximize, grad_scale, found_inf); } @@ -106,8 +106,8 @@ void _fused_adamw_kernel_cpu_( const double eps, const bool amsgrad, const bool maximize, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { const float* grad_scale_ptr = grad_scale.has_value() ? grad_scale->data_ptr() : nullptr; const float* found_inf_ptr = @@ -163,8 +163,8 @@ void _fused_adamw_kernel_cpu_( const double eps, const bool amsgrad, const bool maximize, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { _fused_adamw_kernel_cpu_(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, lr.item(), beta1, beta2, weight_decay, eps, amsgrad, maximize, grad_scale, found_inf); } diff --git a/aten/src/ATen/native/FusedSGD.cpp b/aten/src/ATen/native/FusedSGD.cpp index 56e2e91759113..2fb1f5af9e02f 100644 --- a/aten/src/ATen/native/FusedSGD.cpp +++ b/aten/src/ATen/native/FusedSGD.cpp @@ -26,8 +26,8 @@ void _fused_sgd_kernel_cpu_( const bool nesterov, const bool maximize, const bool is_first_step, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { const float* grad_scale_ptr = grad_scale.has_value() ? 
grad_scale->data_ptr() : nullptr; const float* found_inf_ptr = @@ -71,8 +71,8 @@ void _fused_sgd_kernel_cpu_( const bool nesterov, const bool maximize, const bool is_first_step, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { _fused_sgd_kernel_cpu_( params, grads, momentum_buffer_list, weight_decay, momentum, lr.item(), dampening, nesterov, diff --git a/aten/src/ATen/native/Histogram.cpp b/aten/src/ATen/native/Histogram.cpp index d5258866f8a34..9954edef94607 100644 --- a/aten/src/ATen/native/Histogram.cpp +++ b/aten/src/ATen/native/Histogram.cpp @@ -71,7 +71,7 @@ namespace { /* Checks properties of input tensors input, bins, and weight. */ -void histogramdd_check_inputs(const Tensor& input, const TensorList& bins, const c10::optional& weight) { +void histogramdd_check_inputs(const Tensor& input, const TensorList& bins, const std::optional& weight) { TORCH_CHECK(input.dim() >= 2, "torch.histogramdd: input tensor should have at least 2 dimensions, but got ", input.dim()); @@ -158,7 +158,7 @@ void histogramdd_prepare_out(const Tensor& input, TensorList bins, * assumes that input has already been reshaped to (M, N). */ std::pair, std::vector> -select_outer_bin_edges(const Tensor& input, c10::optional> range) { +select_outer_bin_edges(const Tensor& input, std::optional> range) { TORCH_INTERNAL_ASSERT(input.dim() == 2, "expected input to have shape (M, N)"); const int64_t N = input.size(-1); @@ -244,7 +244,7 @@ static std::vector allocate_bin_edges_tensors(const Tensor& self) { /* Versions of histogramdd in which bins is a Tensor[] defining the sequences of bin edges. */ static Tensor& histogramdd_out(const Tensor& self, TensorList bins, - const c10::optional& weight, bool density, + const std::optional& weight, bool density, Tensor& hist, TensorList& bin_edges) { histogramdd_check_inputs(self, bins, weight); histogramdd_prepare_out(self, bins, hist, bin_edges); @@ -258,7 +258,7 @@ static Tensor& histogramdd_out(const Tensor& self, TensorList bins, } Tensor _histogramdd(const Tensor& self, TensorList bins, - const c10::optional& weight, bool density) { + const std::optional& weight, bool density) { Tensor hist = at::empty({0}, self.options(), MemoryFormat::Contiguous); std::vector bin_edges_out = allocate_bin_edges_tensors(self); TensorList bin_edges_out_tl(bin_edges_out); @@ -271,8 +271,8 @@ Tensor _histogramdd(const Tensor& self, TensorList bins, * defining the number of bins in each dimension. 
*/ static std::vector& histogramdd_bin_edges_out(const Tensor& self, IntArrayRef bin_ct, - c10::optional> range, - const c10::optional& weight, bool density, + std::optional> range, + const std::optional& weight, bool density, std::vector& bin_edges_out) { TensorList bin_edges_out_tl(bin_edges_out); @@ -296,15 +296,15 @@ static std::vector& histogramdd_bin_edges_out(const Tensor& self, IntArr } std::vector histogramdd_bin_edges(const Tensor& self, IntArrayRef bin_ct, - c10::optional> range, - const c10::optional& weight, bool density) { + std::optional> range, + const std::optional& weight, bool density) { std::vector bin_edges_out = allocate_bin_edges_tensors(self); return histogramdd_bin_edges_out(self, bin_ct, range, weight, density, bin_edges_out); } static Tensor& histogramdd_out(const Tensor& self, IntArrayRef bin_ct, - c10::optional> range, - const c10::optional& weight, bool density, + std::optional> range, + const std::optional& weight, bool density, Tensor& hist, TensorList& bin_edges) { std::vector bins = histogramdd_bin_edges(self, bin_ct, range, weight, density); @@ -320,8 +320,8 @@ static Tensor& histogramdd_out(const Tensor& self, IntArrayRef bin_ct, } Tensor _histogramdd(const Tensor& self, IntArrayRef bin_ct, - c10::optional> range, - const c10::optional& weight, bool density) { + std::optional> range, + const std::optional& weight, bool density) { Tensor hist = at::empty({0}, self.options(), MemoryFormat::Contiguous); std::vector bin_edges_out = allocate_bin_edges_tensors(self); TensorList bin_edges_out_tl(bin_edges_out); @@ -334,10 +334,10 @@ Tensor _histogramdd(const Tensor& self, IntArrayRef bin_ct, */ std::tuple histogram_out(const Tensor& self, const Tensor& bins, - const c10::optional& weight, bool density, + const std::optional& weight, bool density, Tensor& hist, Tensor& bin_edges) { Tensor reshaped_self = self.reshape({ self.numel(), 1 }); - c10::optional reshaped_weight = weight.has_value() + std::optional reshaped_weight = weight.has_value() ? weight.value().reshape({ weight.value().numel() }) : weight; TensorList bins_in = bins; TensorList bins_out = bin_edges; @@ -349,7 +349,7 @@ histogram_out(const Tensor& self, const Tensor& bins, std::tuple histogram(const Tensor& self, const Tensor& bins, - const c10::optional& weight, bool density) { + const std::optional& weight, bool density) { Tensor hist = at::empty({0}, self.options(), MemoryFormat::Contiguous); Tensor bin_edges = at::empty({0}, bins.options(), MemoryFormat::Contiguous); return histogram_out(self, bins, weight, density, hist, bin_edges); @@ -358,11 +358,11 @@ histogram(const Tensor& self, const Tensor& bins, /* Versions of histogram in which bins is an integer specifying the number of equal-width bins. */ std::tuple -histogram_out(const Tensor& self, int64_t bin_ct, c10::optional> range, - const c10::optional& weight, bool density, +histogram_out(const Tensor& self, int64_t bin_ct, std::optional> range, + const std::optional& weight, bool density, Tensor& hist, Tensor& bin_edges) { Tensor reshaped_self = self.reshape({ self.numel(), 1 }); - c10::optional reshaped_weight = weight.has_value() + std::optional reshaped_weight = weight.has_value() ? 
weight.value().reshape({ weight.value().numel() }) : weight; TensorList bins_in = bin_edges; TensorList bins_out = bin_edges; @@ -378,8 +378,8 @@ histogram_out(const Tensor& self, int64_t bin_ct, c10::optional -histogram(const Tensor& self, int64_t bin_ct, c10::optional> range, - const c10::optional& weight, bool density) { +histogram(const Tensor& self, int64_t bin_ct, std::optional> range, + const std::optional& weight, bool density) { Tensor hist = at::empty({0}, self.options(), MemoryFormat::Contiguous); Tensor bin_edges_out = at::empty({0}, self.options()); return histogram_out(self, bin_ct, range, weight, density, hist, bin_edges_out); @@ -403,7 +403,7 @@ Tensor& histogram_histc_out(const Tensor& self, int64_t bin_ct, histogramdd_check_inputs(reshaped, bins_in, {}); histogramdd_linear_stub(reshaped.device().type(), reshaped, - c10::optional(), false, hist, bin_edges, false); + std::optional(), false, hist, bin_edges, false); return hist; } @@ -414,16 +414,16 @@ Tensor histogram_histc(const Tensor& self, int64_t bin_ct, } std::tuple> histogramdd( - const Tensor &self, TensorList bins, c10::optional> /*range*/, - const c10::optional &weight, bool density) { + const Tensor &self, TensorList bins, std::optional> /*range*/, + const std::optional &weight, bool density) { auto hist = at::_histogramdd_from_bin_tensors(self, bins, weight, density); return std::tuple>{ std::move(hist), bins.vec()}; } std::tuple> histogramdd( - const Tensor &self, IntArrayRef bins, c10::optional> range, - const c10::optional &weight, bool density) { + const Tensor &self, IntArrayRef bins, std::optional> range, + const std::optional &weight, bool density) { auto bin_edges = at::_histogramdd_bin_edges(self, bins, range, weight, density); auto hist = at::_histogramdd_from_bin_cts(self, bins, range, weight, density); return std::tuple>{ @@ -431,8 +431,8 @@ std::tuple> histogramdd( } std::tuple> histogramdd( - const Tensor &self, int64_t bins, c10::optional> range, - const c10::optional &weight, bool density) { + const Tensor &self, int64_t bins, std::optional> range, + const std::optional &weight, bool density) { DimVector bins_v(self.size(-1), bins); return at::native::histogramdd(self, bins_v, range, weight, density); } diff --git a/aten/src/ATen/native/Histogram.h b/aten/src/ATen/native/Histogram.h index cd19fa4691ad0..fee7e06b87258 100644 --- a/aten/src/ATen/native/Histogram.h +++ b/aten/src/ATen/native/Histogram.h @@ -5,8 +5,8 @@ namespace at::native { -using histogramdd_fn = void(*)(const Tensor&, const c10::optional&, bool, Tensor&, const TensorList&); -using histogramdd_linear_fn = void(*)(const Tensor&, const c10::optional&, bool, Tensor&, const TensorList&, bool); +using histogramdd_fn = void(*)(const Tensor&, const std::optional&, bool, Tensor&, const TensorList&); +using histogramdd_linear_fn = void(*)(const Tensor&, const std::optional&, bool, Tensor&, const TensorList&, bool); using histogram_select_outer_bin_edges_fn = void(*)(const Tensor& input, const int64_t N, std::vector &leftmost_edges, std::vector &rightmost_edges); DECLARE_DISPATCH(histogramdd_fn, histogramdd_stub); diff --git a/aten/src/ATen/native/IndexingUtils.h b/aten/src/ATen/native/IndexingUtils.h index 72b39eb326a0c..fb382ccbc6f0a 100644 --- a/aten/src/ATen/native/IndexingUtils.h +++ b/aten/src/ATen/native/IndexingUtils.h @@ -65,8 +65,8 @@ static C10_UNUSED void checkIndexTensorTypes(IOptTensorListRef indices, bool all } } -inline torch::List> toListOfOptionalTensors(ArrayRef list) { - torch::List> result; +inline torch::List> 
toListOfOptionalTensors(ArrayRef list) { + torch::List> result; result.reserve(list.size()); for (const Tensor& a : list) { result.push_back(a); @@ -74,11 +74,11 @@ inline torch::List> toListOfOptionalTensors(ArrayRef> toListOfOptionalTensors(ArrayRef list) { - torch::List> result; +inline torch::List> toListOfOptionalTensors(ArrayRef list) { + torch::List> result; result.reserve(list.size()); for (const IValue& a : list) { - result.push_back(a.isTensor() ? c10::optional(a.toTensor()) : c10::optional()); + result.push_back(a.isTensor() ? std::optional(a.toTensor()) : c10::optional()); } return result; } diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index 9322776b03f5a..8a835410458ea 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -70,7 +70,7 @@ static inline Tensor _flatten_nd_linear(const Tensor& input, const Tensor& weigh } -Tensor linear(const Tensor& input, const Tensor& weight, const c10::optional& bias_opt) { +Tensor linear(const Tensor& input, const Tensor& weight, const std::optional& bias_opt) { // _matmul_impl checks this again later, but _flatten_nd_linear does not work on scalars inputs, // so let's try to catch this here already const auto input_dim = input.dim(); @@ -121,7 +121,7 @@ Tensor linear(const Tensor& input, const Tensor& weight, const c10::optional& bias_opt, Tensor& output) { +Tensor& linear_out(const Tensor& input, const Tensor& weight, const std::optional& bias_opt, Tensor& output) { TORCH_CHECK(!input.is_mkldnn(), "linear doesn't support out for MKLDNN tensors"); // See [Note: hacky wrapper removal for optional tensor] auto bias = bias_opt.has_value() @@ -707,7 +707,7 @@ Tensor _trilinear(const Tensor& i1_, const Tensor& i2_, const Tensor& i3_, return output; } -Tensor bilinear(const Tensor& input1, const Tensor& input2, const Tensor& weight, const c10::optional& bias_opt) { +Tensor bilinear(const Tensor& input1, const Tensor& input2, const Tensor& weight, const std::optional& bias_opt) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); const Tensor& bias = *bias_maybe_owned; diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 81f461f6c95b8..3389033ac9851 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -280,7 +280,7 @@ TORCH_META_FUNC(_linalg_slogdet)(const Tensor& A) { } template -void common_checks_baddbmm_bmm(Meta& meta, const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, bool is_bmm, const c10::optional& self_baddbmm = nullopt) { +void common_checks_baddbmm_bmm(Meta& meta, const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, bool is_bmm, const std::optional& self_baddbmm = nullopt) { TORCH_CHECK(batch1.dim() == 3, "batch1 must be a 3D tensor"); TORCH_CHECK(batch2.dim() == 3, "batch2 must be a 3D tensor"); @@ -635,7 +635,7 @@ namespace { Tensor linalg_matrix_power_impl( const Tensor& self, int64_t n, - c10::optional _out) { + std::optional _out) { NoTF32Guard disable_tf32; auto out = _out.value_or(Tensor()); @@ -929,7 +929,7 @@ Tensor matrix_chain_multiplication( } // Implements torch.linalg.multi_dot -Tensor multi_dot_impl(TensorList _tensors, c10::optional _out) { +Tensor multi_dot_impl(TensorList _tensors, std::optional _out) { const size_t n = _tensors.size(); TORCH_CHECK(n >= 2, "multi_dot(): expected at least 2 tensors but got ", n); diff 
--git a/aten/src/ATen/native/Loss.cpp b/aten/src/ATen/native/Loss.cpp index e21d9f6008e8e..a170e4a868aa7 100644 --- a/aten/src/ATen/native/Loss.cpp +++ b/aten/src/ATen/native/Loss.cpp @@ -250,7 +250,7 @@ Tensor kl_div(const Tensor& input, const Tensor& target, int64_t reduction, bool return apply_loss_reduction(output, reduction); } -Tensor binary_cross_entropy_cpu(const Tensor& input, const Tensor& target, const c10::optional& weight_opt, int64_t reduction) { +Tensor binary_cross_entropy_cpu(const Tensor& input, const Tensor& target, const std::optional& weight_opt, int64_t reduction) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; @@ -260,7 +260,7 @@ Tensor binary_cross_entropy_cpu(const Tensor& input, const Tensor& target, const input, target, weight, reduction, loss); } -Tensor& binary_cross_entropy_out_cpu(const Tensor& input, const Tensor& target, const c10::optional& weight_opt, int64_t reduction, Tensor& loss) { +Tensor& binary_cross_entropy_out_cpu(const Tensor& input, const Tensor& target, const std::optional& weight_opt, int64_t reduction, Tensor& loss) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; @@ -307,7 +307,7 @@ Tensor& binary_cross_entropy_out_cpu(const Tensor& input, const Tensor& target, return loss; } -Tensor binary_cross_entropy_backward_cpu(const Tensor& grad, const Tensor& input, const Tensor& target, const c10::optional& weight_opt, int64_t reduction) { +Tensor binary_cross_entropy_backward_cpu(const Tensor& grad, const Tensor& input, const Tensor& target, const std::optional& weight_opt, int64_t reduction) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; @@ -317,7 +317,7 @@ Tensor binary_cross_entropy_backward_cpu(const Tensor& grad, const Tensor& input grad, input, target, weight, reduction, grad_input); } -Tensor& binary_cross_entropy_backward_out_cpu(const Tensor& grad, const Tensor& input, const Tensor& target, const c10::optional& weight_opt, int64_t reduction, Tensor& grad_input) { +Tensor& binary_cross_entropy_backward_out_cpu(const Tensor& grad, const Tensor& input, const Tensor& target, const std::optional& weight_opt, int64_t reduction, Tensor& grad_input) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; @@ -359,7 +359,7 @@ Tensor& binary_cross_entropy_backward_out_cpu(const Tensor& grad, const Tensor& return grad_input; } -Tensor binary_cross_entropy_with_logits(const Tensor& input, const Tensor& target, const c10::optional& weight_opt, const c10::optional& pos_weight_opt, int64_t reduction) { +Tensor binary_cross_entropy_with_logits(const Tensor& input, const Tensor& target, const std::optional& weight_opt, const c10::optional& pos_weight_opt, int64_t reduction) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; diff --git a/aten/src/ATen/native/LossMulti.h b/aten/src/ATen/native/LossMulti.h index f21269620f253..27697815ad594 100644 --- a/aten/src/ATen/native/LossMulti.h +++ 
b/aten/src/ATen/native/LossMulti.h @@ -41,7 +41,7 @@ namespace { const int64_t& ndims, const Tensor& input, const Tensor& target, - const c10::optional& weight) { + const std::optional& weight) { TORCH_CHECK( (ndims == 2 && input.size(1) != 0) || (ndims == 1 && input.size(0) != 0) || ndims == 0, "Expected non-empty vector or matrix with optional 0-dim batch size, but got: ", diff --git a/aten/src/ATen/native/LossMultiMargin.cpp b/aten/src/ATen/native/LossMultiMargin.cpp index 5b2f5ae1863b7..e7620c7900c56 100644 --- a/aten/src/ATen/native/LossMultiMargin.cpp +++ b/aten/src/ATen/native/LossMultiMargin.cpp @@ -102,7 +102,7 @@ void multi_margin_loss_out_cpu_template( const Tensor& target, int p, const Scalar& margin, - const c10::optional& weight, + const std::optional& weight, int64_t reduction) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t nframe, dim; @@ -266,7 +266,7 @@ Tensor multi_margin_loss_cpu( const Tensor& target, const Scalar& p, const Scalar& margin, - const c10::optional& weight, + const std::optional& weight, int64_t reduction) { auto output = at::empty({0}, input.options()); multi_margin_loss_out_cpu_template( @@ -278,7 +278,7 @@ Tensor& multi_margin_loss_cpu_out(const Tensor& input, const Tensor& target, const Scalar& p, const Scalar& margin, - const c10::optional& weight, + const std::optional& weight, int64_t reduction, Tensor& output) { multi_margin_loss_out_cpu_template( @@ -291,7 +291,7 @@ Tensor multi_margin_loss_cpu_backward( const Tensor& input, const Tensor& target, const Scalar& p, - const Scalar& margin, const c10::optional& weight_opt, + const Scalar& margin, const std::optional& weight_opt, int64_t reduction) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); @@ -314,7 +314,7 @@ Tensor& multi_margin_loss_cpu_backward_out(const Tensor& grad_output, const Tensor& input, const Tensor& target, const Scalar& p, - const Scalar& margin, const c10::optional& weight_opt, + const Scalar& margin, const std::optional& weight_opt, int64_t reduction, Tensor& grad_input) { // See [Note: hacky wrapper removal for optional tensor] diff --git a/aten/src/ATen/native/LossNLL.cpp b/aten/src/ATen/native/LossNLL.cpp index 0e7de9c27252a..b7809ab21dd5d 100644 --- a/aten/src/ATen/native/LossNLL.cpp +++ b/aten/src/ATen/native/LossNLL.cpp @@ -624,7 +624,7 @@ static Tensor cross_entropy_loss_label_smoothing( Tensor cross_entropy_loss_symint( const Tensor& self, const Tensor& target, - const c10::optional& weight, + const std::optional& weight, int64_t reduction, c10::SymInt ignore_index, double label_smoothing) { @@ -658,7 +658,7 @@ Tensor cross_entropy_loss_symint( return ret; } -Tensor & nll_loss_out(const Tensor & self, const Tensor & target, const c10::optional& weight_opt, int64_t reduction, int64_t ignore_index, Tensor & output) { +Tensor & nll_loss_out(const Tensor & self, const Tensor & target, const std::optional& weight_opt, int64_t reduction, int64_t ignore_index, Tensor & output) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; @@ -667,7 +667,7 @@ Tensor & nll_loss_out(const Tensor & self, const Tensor & target, const c10::opt return std::get<0>(at::nll_loss_forward_out(output, total_weight, self, target, weight, reduction, ignore_index)); } -Tensor nll_loss_symint(const Tensor & self, const Tensor & target, const c10::optional& weight_opt, 
int64_t reduction, c10::SymInt ignore_index) { +Tensor nll_loss_symint(const Tensor & self, const Tensor & target, const std::optional& weight_opt, int64_t reduction, c10::SymInt ignore_index) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; @@ -676,7 +676,7 @@ Tensor nll_loss_symint(const Tensor & self, const Tensor & target, const c10::op } // Duplicate of above code for non-symbolic ints. Kept for BC purposes and to minimize breakages. -static Tensor nll_loss(const Tensor & self, const Tensor & target, const c10::optional& weight_opt, int64_t reduction, int64_t ignore_index) { +static Tensor nll_loss(const Tensor & self, const Tensor & target, const std::optional& weight_opt, int64_t reduction, int64_t ignore_index) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; @@ -687,7 +687,7 @@ static Tensor nll_loss(const Tensor & self, const Tensor & target, const c10::op Tensor nll_loss_nd_symint( const Tensor& self, const Tensor& target, - const c10::optional& weight, + const std::optional& weight, int64_t reduction, c10::SymInt ignore_index) { if (self.dim() < 1) { diff --git a/aten/src/ATen/native/LossNLL2d.cpp b/aten/src/ATen/native/LossNLL2d.cpp index 94c667dcb1b2b..6f27884b8f24b 100644 --- a/aten/src/ATen/native/LossNLL2d.cpp +++ b/aten/src/ATen/native/LossNLL2d.cpp @@ -405,7 +405,7 @@ void nll_loss2d_backward_out_cpu_template( } // namespace std::tuple nll_loss2d_forward_out_cpu(const Tensor& self, - const Tensor& target, const c10::optional& weight_opt, + const Tensor& target, const std::optional& weight_opt, int64_t reduction, int64_t ignore_index, Tensor& output, @@ -421,7 +421,7 @@ std::tuple nll_loss2d_forward_out_cpu(const Tensor& self, std::tuple nll_loss2d_forward_cpu( const Tensor& self, - const Tensor& target, const c10::optional& weight_opt, + const Tensor& target, const std::optional& weight_opt, int64_t reduction, int64_t ignore_index) { // See [Note: hacky wrapper removal for optional tensor] @@ -437,7 +437,7 @@ std::tuple nll_loss2d_forward_cpu( Tensor& nll_loss2d_backward_out_cpu(const Tensor& grad_output, const Tensor& self, - const Tensor& target, const c10::optional& weight_opt, + const Tensor& target, const std::optional& weight_opt, int64_t reduction, int64_t ignore_index, const Tensor& total_weight, @@ -461,7 +461,7 @@ Tensor& nll_loss2d_backward_out_cpu(const Tensor& grad_output, Tensor nll_loss2d_backward_cpu( const Tensor& grad_output, const Tensor& self, - const Tensor& target, const c10::optional& weight_opt, + const Tensor& target, const std::optional& weight_opt, int64_t reduction, int64_t ignore_index, const Tensor& total_weight) { @@ -482,7 +482,7 @@ Tensor nll_loss2d_backward_cpu( return grad_input; } -Tensor & nll_loss2d_out(const Tensor & self, const Tensor & target, const c10::optional& weight_opt, int64_t reduction, int64_t ignore_index, Tensor & output) { +Tensor & nll_loss2d_out(const Tensor & self, const Tensor & target, const std::optional& weight_opt, int64_t reduction, int64_t ignore_index, Tensor & output) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; @@ -491,7 +491,7 @@ Tensor & nll_loss2d_out(const Tensor & self, const Tensor & target, 
const c10::o return std::get<0>(at::nll_loss2d_forward_out(output, total_weight, self, target, weight, reduction, ignore_index)); } -Tensor nll_loss2d_symint(const Tensor & self, const Tensor & target, const c10::optional& weight_opt, int64_t reduction, c10::SymInt ignore_index) { +Tensor nll_loss2d_symint(const Tensor & self, const Tensor & target, const std::optional& weight_opt, int64_t reduction, c10::SymInt ignore_index) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; @@ -500,7 +500,7 @@ Tensor nll_loss2d_symint(const Tensor & self, const Tensor & target, const c10:: } // Duplicate of above code for non-symbolic ints. Kept for BC purposes and to minimize breakages. -static Tensor nll_loss2d(const Tensor & self, const Tensor & target, const c10::optional& weight_opt, int64_t reduction, int64_t ignore_index) { +static Tensor nll_loss2d(const Tensor & self, const Tensor & target, const std::optional& weight_opt, int64_t reduction, int64_t ignore_index) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; diff --git a/aten/src/ATen/native/MathBitsFallback.h b/aten/src/ATen/native/MathBitsFallback.h index 584d07aeca358..de2296634e045 100644 --- a/aten/src/ATen/native/MathBitsFallback.h +++ b/aten/src/ATen/native/MathBitsFallback.h @@ -56,7 +56,7 @@ struct MathOpFallback { const auto num_arguments = arguments.size(); const auto stack_start = stack->size() - num_arguments; - c10::optional is_write; + std::optional is_write; for (const auto i : c10::irange(num_arguments)) { // Three possible states: // 1. 
alias_info has no value --> out-of-place operation diff --git a/aten/src/ATen/native/Memory.cpp b/aten/src/ATen/native/Memory.cpp index 0d07054f72eda..fefe9ab5a8d2b 100644 --- a/aten/src/ATen/native/Memory.cpp +++ b/aten/src/ATen/native/Memory.cpp @@ -23,11 +23,11 @@ int64_t _debug_has_internal_overlap(const Tensor& self) { // pinned memory, always return false", but this makes life a little easier when // you haven't loaded the backend extension at all (which can happen, e.g., on a // CPU build of PyTorch and you try to check if something is CUDA pinned) -bool is_pinned_default(const Tensor& self, c10::optional device) { +bool is_pinned_default(const Tensor& self, std::optional device) { return false; } -Tensor pin_memory(const Tensor& self, c10::optional device) { +Tensor pin_memory(const Tensor& self, std::optional device) { // Kind of mad that I have to do two dynamic dispatches here, pretty // annoying if (self.is_pinned(device)) { diff --git a/aten/src/ATen/native/MetaTensor.cpp b/aten/src/ATen/native/MetaTensor.cpp index 972d13dc8fb51..518466df84ce4 100644 --- a/aten/src/ATen/native/MetaTensor.cpp +++ b/aten/src/ATen/native/MetaTensor.cpp @@ -13,11 +13,11 @@ namespace at::native { Tensor empty_meta_symint( SymIntArrayRef size, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt, - c10::optional memory_format_opt + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt, + std::optional memory_format_opt ) { auto opt_size = asIntArrayRefSlowOpt(size); @@ -32,10 +32,10 @@ Tensor empty_meta_symint( static Tensor empty_strided_meta( IntArrayRef size, IntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt ) { return empty_strided_meta_symint(c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), dtype_opt, layout_opt, device_opt, pin_memory_opt); } @@ -43,10 +43,10 @@ static Tensor empty_strided_meta( Tensor empty_strided_meta_symint( SymIntArrayRef size, SymIntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt ) { return at::detail::empty_strided_symint_meta( size, stride, dtype_opt, layout_opt, device_opt, pin_memory_opt); diff --git a/aten/src/ATen/native/NNPACK.cpp b/aten/src/ATen/native/NNPACK.cpp index e43bfdd627965..89b2f3ffc493b 100644 --- a/aten/src/ATen/native/NNPACK.cpp +++ b/aten/src/ATen/native/NNPACK.cpp @@ -22,7 +22,7 @@ namespace at::native { at::Tensor _nnpack_spatial_convolution( const Tensor& input, - const Tensor& weight, const c10::optional& bias_opt, + const Tensor& weight, const std::optional& bias_opt, const IntArrayRef padding, const IntArrayRef stride) { throw std::runtime_error( @@ -137,7 +137,7 @@ static thread_local Workspace workspace; Tensor _nnpack_spatial_convolution( const Tensor& input, - const Tensor& weight, const c10::optional& bias_opt, + const Tensor& weight, const std::optional& bias_opt, const IntArrayRef padding, const IntArrayRef stride) { // See [Note: hacky wrapper removal for optional tensor] diff --git a/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp b/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp index 
624e820c7ba66..f82354ace3b82 100644 --- a/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp +++ b/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp @@ -819,7 +819,7 @@ void slow_conv_transpose3d_acc_grad_parameters_cpu( Tensor& slow_conv_transpose3d_out_cpu(const Tensor& input, const Tensor& weight, - IntArrayRef kernel_size, const c10::optional& bias_opt, + IntArrayRef kernel_size, const std::optional& bias_opt, IntArrayRef stride, IntArrayRef padding, IntArrayRef output_padding, @@ -846,7 +846,7 @@ Tensor& slow_conv_transpose3d_out_cpu(const Tensor& input, Tensor slow_conv_transpose3d_cpu( const Tensor& input, const Tensor& weight, - IntArrayRef kernel_size, const c10::optional& bias_opt, + IntArrayRef kernel_size, const std::optional& bias_opt, IntArrayRef stride, IntArrayRef padding, IntArrayRef output_padding, diff --git a/aten/src/ATen/native/NaiveDilatedConvolution.cpp b/aten/src/ATen/native/NaiveDilatedConvolution.cpp index 571eb16fc50e0..acf040259b135 100644 --- a/aten/src/ATen/native/NaiveDilatedConvolution.cpp +++ b/aten/src/ATen/native/NaiveDilatedConvolution.cpp @@ -524,7 +524,7 @@ void slow_conv_dilated_all_cpu_template( Tensor slow_conv_dilated2d_cpu( const Tensor& input, const Tensor& weight, - IntArrayRef kernel_size, const c10::optional& bias_opt, + IntArrayRef kernel_size, const std::optional& bias_opt, IntArrayRef stride_size, IntArrayRef pad_size, IntArrayRef dilation_size) { @@ -579,7 +579,7 @@ Tensor slow_conv_dilated2d_cpu( Tensor slow_conv_dilated3d_cpu( const Tensor& input, const Tensor& weight, - IntArrayRef kernel_size, const c10::optional& bias_opt, + IntArrayRef kernel_size, const std::optional& bias_opt, IntArrayRef stride_size, IntArrayRef pad_size, IntArrayRef dilation_size) { diff --git a/aten/src/ATen/native/NamedTensor.cpp b/aten/src/ATen/native/NamedTensor.cpp index f0330481c31a9..709d63bae6368 100644 --- a/aten/src/ATen/native/NamedTensor.cpp +++ b/aten/src/ATen/native/NamedTensor.cpp @@ -387,13 +387,13 @@ Tensor scatter_add(const Tensor& self, Dimname dim, const Tensor& index, const T static Tensor& scatter_add_(Tensor& self, Dimname dim, const Tensor& index, const Tensor& source) { reportNYIDimnameOverload("scatter_add"); } -std::tuple sort_out(const Tensor& self, c10::optional stable, Dimname dim, bool keepdim, Tensor& values, Tensor& indices) { +std::tuple sort_out(const Tensor& self, std::optional stable, Dimname dim, bool keepdim, Tensor& values, Tensor& indices) { reportNYIDimnameOverload("sort"); } std::tuple sort_out(const Tensor& self, Dimname dim, bool keepdim, Tensor& values, Tensor& indices) { reportNYIDimnameOverload("sort"); } -std::tuple sort(const Tensor& self, c10::optional stable, Dimname dim, bool keepdim) { +std::tuple sort(const Tensor& self, std::optional stable, Dimname dim, bool keepdim) { reportNYIDimnameOverload("sort"); } std::tuple sort(const Tensor& self, Dimname dim, bool keepdim) { diff --git a/aten/src/ATen/native/NonSymbolicBC.h b/aten/src/ATen/native/NonSymbolicBC.h index 589822a4ee013..037156ac23b15 100644 --- a/aten/src/ATen/native/NonSymbolicBC.h +++ b/aten/src/ATen/native/NonSymbolicBC.h @@ -9,15 +9,15 @@ namespace at::native { // In those cases, we will duplicate the signature here with non-symbolic ints, and also duplicate the C++ implementation. 
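[Editor's aside, not part of the patch.] The NonSymbolicBC.h declarations that follow, like most hunks in this patch, apply one mechanical substitution: parameters spelled c10::optional become std::optional, while defaults such as c10::nullopt are left untouched. Since c10::optional is, at this stage of the migration, an alias for std::optional, the change is a spelling cleanup rather than a behavioral one. Below is a minimal, self-contained sketch of the calling convention involved; the function and parameter names (apply_update, grad_scale, found_inf) are invented for illustration, and double stands in for the Tensor/ScalarType template arguments that the real signatures carry.

#include <iostream>
#include <optional>

// Hypothetical stand-in for an ATen-style kernel taking optional inputs.
// The real code takes std::optional<Tensor> (formerly c10::optional<Tensor>);
// double is used here so the example compiles on its own.
double apply_update(double param,
                    const std::optional<double>& grad_scale = std::nullopt,
                    const std::optional<double>& found_inf = std::nullopt) {
  // Mirrors the grad_scale / found_inf handling in the fused-optimizer hunks
  // earlier in the patch: dereference an optional only after checking it.
  if (found_inf.has_value() && *found_inf == 1.0) {
    return param;  // skip the step entirely when inf/nan was detected
  }
  return param * grad_scale.value_or(1.0);
}

int main() {
  std::cout << apply_update(2.0) << "\n";            // 2  (no scaling)
  std::cout << apply_update(2.0, 0.5) << "\n";       // 1  (scaled)
  std::cout << apply_update(2.0, 0.5, 1.0) << "\n";  // 2  (inf found, no-op)
  return 0;
}

Because the alias makes c10::optional and std::optional interchangeable, call sites that still spell the old name keep compiling, which is what allows this rename to land file by file.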
TORCH_API at::Tensor reshape(const at::Tensor& self, at::IntArrayRef proposed_shape); TORCH_API at::Tensor narrow(const at::Tensor& self, int64_t dim, int64_t start, int64_t length); -TORCH_API at::Tensor _sparse_coo_tensor_unsafe(const at::Tensor & indices, const at::Tensor & values, at::IntArrayRef size, c10::optional dtype=c10::nullopt, c10::optional layout=c10::nullopt, c10::optional device=c10::nullopt, c10::optional pin_memory=c10::nullopt, c10::optional is_coalesced=c10::nullopt); -TORCH_API at::Tensor nll_loss(const at::Tensor & self, const at::Tensor & target, const c10::optional& weight_opt, int64_t reduction, int64_t ignore_index); -TORCH_API at::Tensor nll_loss2d(const at::Tensor & self, const at::Tensor & target, const c10::optional& weight_opt, int64_t reduction, int64_t ignore_index); +TORCH_API at::Tensor _sparse_coo_tensor_unsafe(const at::Tensor & indices, const at::Tensor & values, at::IntArrayRef size, std::optional dtype=c10::nullopt, c10::optional layout=c10::nullopt, c10::optional device=c10::nullopt, c10::optional pin_memory=c10::nullopt, c10::optional is_coalesced=c10::nullopt); +TORCH_API at::Tensor nll_loss(const at::Tensor & self, const at::Tensor & target, const std::optional& weight_opt, int64_t reduction, int64_t ignore_index); +TORCH_API at::Tensor nll_loss2d(const at::Tensor & self, const at::Tensor & target, const std::optional& weight_opt, int64_t reduction, int64_t ignore_index); // The below ops don't get a duplicated C++ implementation. // They are backward ops, which make them very unlikely to be called directly // by external code (at::native::trace_backward). // They get their own declaration for BC purposes however. -TORCH_API at::Tensor _embedding_bag_backward(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, bool sparse, const c10::optional & per_sample_weights, int64_t padding_idx=-1); -TORCH_API at::Tensor _embedding_bag_sparse_backward(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, const at::Tensor & bag_size, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const c10::optional & per_sample_weights, int64_t padding_idx=-1); +TORCH_API at::Tensor _embedding_bag_backward(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, bool sparse, const std::optional & per_sample_weights, int64_t padding_idx=-1); +TORCH_API at::Tensor _embedding_bag_sparse_backward(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, const at::Tensor & bag_size, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const std::optional & per_sample_weights, int64_t padding_idx=-1); TORCH_API at::Tensor value_selecting_reduction_backward(const at::Tensor & grad, int64_t dim, const at::Tensor & indices, at::IntArrayRef sizes, bool keepdim); TORCH_API at::Tensor trace_backward(const at::Tensor & grad, at::IntArrayRef sizes); TORCH_API at::Tensor index_select_backward(const at::Tensor & grad, at::IntArrayRef self_sizes, int64_t dim, const at::Tensor & index); diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index 
93d2ce11d934f..ce1b23c2bdf6f 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -519,6 +519,7 @@ BatchNormBackend _select_batch_norm_backend( && weight.defined() && bias.defined() && ((running_mean.defined() && running_var.defined()) || (!running_mean.defined() && !running_var.defined() && training)) + && (input.dim() >= 3) && detail::getCUDAHooks().compiledWithMIOpen() && cudnn_enabled && input.suggest_memory_format() != MemoryFormat::ChannelsLast @@ -537,7 +538,7 @@ BatchNormBackend _select_batch_norm_backend( // XXX: The indices of backends need to be kept synchronized between this function and its _backward. // TODO: remove cudnn_enabled arg std::tuple _batch_norm_impl_index( - const Tensor& input, const c10::optional& weight_opt /* optional */, const c10::optional& bias_opt /* optional */, const c10::optional& running_mean_opt /* optional */, const c10::optional& running_var_opt /* optional */, + const Tensor& input, const std::optional& weight_opt /* optional */, const c10::optional& bias_opt /* optional */, const c10::optional& running_mean_opt /* optional */, const c10::optional& running_var_opt /* optional */, bool training, double momentum, double eps, bool cudnn_enabled) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); @@ -619,7 +620,7 @@ std::tuple _batch_norm_impl_index( std::tuple _batch_norm_impl_index_backward( int64_t impl_index, - const Tensor& input, const Tensor& grad_output, const c10::optional& weight_opt /* optional */, const c10::optional& running_mean_opt /* optional */, const c10::optional& running_var_opt /* optional */, const c10::optional& save_mean_opt /* optional */, const c10::optional& save_var_transform_opt /* optional */, + const Tensor& input, const Tensor& grad_output, const std::optional& weight_opt /* optional */, const c10::optional& running_mean_opt /* optional */, const c10::optional& running_var_opt /* optional */, const c10::optional& save_mean_opt /* optional */, const c10::optional& save_var_transform_opt /* optional */, bool train, double epsilon, std::array output_mask, const Tensor &reservedSpace) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); @@ -665,8 +666,8 @@ std::tuple _batch_norm_impl_index_backward( // TODO: remove cudnn_enabled arg Tensor batch_norm( - const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, - const c10::optional& running_mean_opt, const c10::optional& running_var_opt, + const Tensor& input, const std::optional& weight_opt, const c10::optional& bias_opt, + const std::optional& running_mean_opt, const c10::optional& running_var_opt, bool training, double momentum, double eps, bool cudnn_enabled) { const Tensor& weight = c10::value_or_else(weight_opt, [] {return Tensor();}); const Tensor& bias = c10::value_or_else(bias_opt, [] {return Tensor();}); @@ -701,7 +702,7 @@ Tensor batch_norm( } Tensor instance_norm( - const Tensor& input, const c10::optional& weight_opt /* optional */, const c10::optional& bias_opt /* optional */, const c10::optional& running_mean_opt /* optional */, const c10::optional& running_var_opt /* optional */, + const Tensor& input, const std::optional& weight_opt /* optional */, const c10::optional& bias_opt /* optional */, const c10::optional& running_mean_opt /* optional */, const c10::optional& running_var_opt /* optional */, bool 
use_input_stats, double momentum, double eps, bool cudnn_enabled) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); @@ -739,7 +740,7 @@ Tensor instance_norm( } std::tuple batch_norm_update_stats_cpu( - const Tensor& self, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, double momentum) { + const Tensor& self, const std::optional& running_mean_opt, const c10::optional& running_var_opt, double momentum) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned running_mean_maybe_owned = at::borrow_from_optional_tensor(running_mean_opt); const Tensor& running_mean = *running_mean_maybe_owned; @@ -757,7 +758,7 @@ std::tuple batch_norm_update_stats_cpu( }); } -std::tuple batch_norm_cpu_out(const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, +std::tuple batch_norm_cpu_out(const Tensor& self, const std::optional& weight_opt, const c10::optional& bias_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, bool train, double momentum, double eps, Tensor& out, Tensor& save_mean, Tensor& save_var) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); @@ -800,7 +801,7 @@ std::tuple batch_norm_cpu_out(const Tensor& self, con return std::tuple(out, save_mean, save_var); } -std::tuple batch_norm_cpu(const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, +std::tuple batch_norm_cpu(const Tensor& self, const std::optional& weight_opt, const c10::optional& bias_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, bool train, double momentum, double eps) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); @@ -850,7 +851,7 @@ std::tuple batch_norm_cpu(const Tensor& self, const c10: } std::tuple _batch_norm_with_update_cpu( - const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, + const Tensor& input, const std::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, double momentum, double eps) { Tensor output, save_mean, save_var; std::tie(output, save_mean, save_var) = @@ -860,7 +861,7 @@ std::tuple _batch_norm_with_update_cpu( } std::tuple _batch_norm_with_update_cpu_out( - const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, + const Tensor& input, const std::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, double momentum, double eps, Tensor& out, Tensor& save_mean, Tensor& save_var, Tensor& reserve) { std::tie(out, save_mean, save_var) = @@ -870,8 +871,8 @@ std::tuple _batch_norm_with_update_cpu_out( std::tuple _batch_norm_no_update( - const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, - const c10::optional& running_mean_opt, const c10::optional& running_var_opt, + const Tensor& input, const std::optional& weight_opt, const c10::optional& bias_opt, + const std::optional& running_mean_opt, const c10::optional& running_var_opt, double momentum, double eps) { const Tensor& running_mean = c10::value_or_else(running_mean_opt, [] {return Tensor();}); const 
Tensor& running_var = c10::value_or_else(running_var_opt, [] {return Tensor();}); @@ -883,41 +884,41 @@ std::tuple _batch_norm_no_update( } std::tuple _batch_norm_legit_cpu( - const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, + const Tensor& self, const std::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, bool train, double momentum, double eps) { return batch_norm_cpu(self, weight_opt, bias_opt, running_mean, running_var, train, momentum, eps); } std::tuple _batch_norm_legit_no_stats_cpu( - const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, + const Tensor& self, const std::optional& weight_opt, const c10::optional& bias_opt, bool train, double momentum, double eps) { return batch_norm_cpu(self, weight_opt, bias_opt, Tensor(), Tensor(), train, momentum, eps); } std::tuple _batch_norm_legit_no_training( - const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, + const Tensor& self, const std::optional& weight_opt, const c10::optional& bias_opt, const Tensor& running_mean, const Tensor& running_var, double momentum, double eps) { return at::_native_batch_norm_legit(self, weight_opt, bias_opt, const_cast(running_mean), const_cast(running_var), /*train=*/false, momentum, eps); } -std::tuple _batch_norm_legit_cpu_out(const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, bool train, double momentum, double eps, Tensor& out, Tensor& save_mean, Tensor& save_var) { +std::tuple _batch_norm_legit_cpu_out(const Tensor& self, const std::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, bool train, double momentum, double eps, Tensor& out, Tensor& save_mean, Tensor& save_var) { return batch_norm_cpu_out(self, weight_opt, bias_opt, running_mean, running_var, train, momentum, eps, out, save_mean, save_var); } -std::tuple _batch_norm_legit_no_stats_cpu_out(const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, bool train, double momentum, double eps, Tensor& out, Tensor& save_mean, Tensor& save_var) { +std::tuple _batch_norm_legit_no_stats_cpu_out(const Tensor& self, const std::optional& weight_opt, const c10::optional& bias_opt, bool train, double momentum, double eps, Tensor& out, Tensor& save_mean, Tensor& save_var) { return batch_norm_cpu_out(self, weight_opt, bias_opt, Tensor(), Tensor(), train, momentum, eps, out, save_mean, save_var); } std::tuple _new_batch_norm_backward_cpu( const Tensor& grad_output, const Tensor& input, const Tensor& weight, - const c10::optional& running_mean_opt, const c10::optional& running_var_opt, - const c10::optional& save_mean_opt, const c10::optional& save_var_opt, + const std::optional& running_mean_opt, const c10::optional& running_var_opt, + const std::optional& save_mean_opt, const c10::optional& save_var_opt, bool update, double eps, std::array grad_input_mask, const Tensor& reserve) { return batch_norm_backward_cpu(grad_output, input, weight, running_mean_opt, running_var_opt, save_mean_opt, save_var_opt, update, eps, grad_input_mask); } -std::tuple batch_norm_backward_cpu(const Tensor& grad_out, const Tensor& self, const c10::optional& weight_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, const c10::optional& save_mean_opt, const c10::optional& save_invstd_opt, +std::tuple batch_norm_backward_cpu(const Tensor& grad_out, const Tensor& self, 
const std::optional& weight_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, const c10::optional& save_mean_opt, const c10::optional& save_invstd_opt, bool train, double eps, std::array grad_input_mask) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); diff --git a/aten/src/ATen/native/PadNd.cpp b/aten/src/ATen/native/PadNd.cpp index aecab68c2be0f..e7172fe5a2c12 100644 --- a/aten/src/ATen/native/PadNd.cpp +++ b/aten/src/ATen/native/PadNd.cpp @@ -188,7 +188,7 @@ Tensor _pad_circular_symint(const Tensor &self, c10::SymIntArrayRef padding) { return out; } -Tensor _pad_enum_symint(const Tensor &self, c10::SymIntArrayRef pad, int64_t mode_int, c10::optional value) { +Tensor _pad_enum_symint(const Tensor &self, c10::SymIntArrayRef pad, int64_t mode_int, std::optional value) { const auto input_dim = self.dim(); TORCH_CHECK(pad.size() % 2 == 0, "Padding length must be divisible by 2"); TORCH_CHECK(static_cast(pad.size()) <= input_dim * 2, @@ -228,7 +228,7 @@ Tensor _pad_enum_symint(const Tensor &self, c10::SymIntArrayRef pad, int64_t mod "Only 2D, 3D, 4D, 5D padding with non-constant padding are supported for now"); } -Tensor pad_symint(const Tensor &self, c10::SymIntArrayRef pad, c10::string_view mode, c10::optional value) { +Tensor pad_symint(const Tensor &self, c10::SymIntArrayRef pad, c10::string_view mode, std::optional value) { const auto mode_enum = [&] { if (mode == "reflect") { return at::padding_mode::reflect; diff --git a/aten/src/ATen/native/Pool.h b/aten/src/ATen/native/Pool.h index 07940729fda8c..df73299ea2308 100644 --- a/aten/src/ATen/native/Pool.h +++ b/aten/src/ATen/native/Pool.h @@ -19,9 +19,9 @@ DECLARE_DISPATCH(max_pool2d_backward_fn, max_pool2d_backward_kernel); // averge pooling has same signature for forward and backward using avg_pool2d_fn = void(*)(const Tensor& output, const Tensor& input, int64_t kW, int64_t kH, - int64_t dW, int64_t dH, int64_t padW, int64_t padH, bool count_include_pad, c10::optional divisor_override); + int64_t dW, int64_t dH, int64_t padW, int64_t padH, bool count_include_pad, std::optional divisor_override); using avg_pool2d_backward_fn = void(*)(const Tensor& output, const Tensor& input, int kW, int kH, - int dW, int dH, int padW, int padH, bool count_include_pad, c10::optional divisor_override); + int dW, int dH, int padW, int padH, bool count_include_pad, std::optional divisor_override); DECLARE_DISPATCH(avg_pool2d_fn, avg_pool2d_kernel); DECLARE_DISPATCH(avg_pool2d_backward_fn, avg_pool2d_backward_kernel); @@ -30,11 +30,11 @@ DECLARE_DISPATCH(avg_pool2d_backward_fn, avg_pool2d_backward_kernel); using avg_pool3d_fn = void(*)(const Tensor& output, const Tensor& input, int64_t kW, int64_t kH, int64_t kD, int64_t dW, int64_t dH, int64_t dD, int64_t padW, int64_t padH, int64_t padD, bool count_include_pad, - c10::optional divisor_override); + std::optional divisor_override); using avg_pool3d_backward_fn = void(*)(const Tensor& output, const Tensor& input, int kW, int kH, int kD, int dW, int dH, int dD, int padW, int padH, int padD, bool count_include_pad, - c10::optional divisor_override); + std::optional divisor_override); DECLARE_DISPATCH(avg_pool3d_fn, avg_pool3d_kernel); DECLARE_DISPATCH(avg_pool3d_backward_fn, avg_pool3d_backward_kernel); diff --git a/aten/src/ATen/native/RNN.cpp b/aten/src/ATen/native/RNN.cpp index 97ce09ac8e51d..fccd3420d3f67 100644 --- a/aten/src/ATen/native/RNN.cpp +++ b/aten/src/ATen/native/RNN.cpp @@ 
-1163,7 +1163,7 @@ bool _use_cudnn_rnn_flatten_weight() { // NB: This a (composite) wrapper for _thnn_fused_lstm_cell_backward_impl. // It duplicates the outputs of this function so the non-composite version doesn't have to. // The point is so that we avoid triggering TensorImpl use count asserts in debug mode -std::tuple _thnn_fused_lstm_cell_backward( const c10::optional& grad_hy_opt, const c10::optional& grad_cy_opt, +std::tuple _thnn_fused_lstm_cell_backward( const std::optional& grad_hy_opt, const c10::optional& grad_cy_opt, const Tensor& cx, const Tensor& cy, const Tensor& workspace, bool has_bias) { TORCH_INTERNAL_ASSERT(!GradMode::is_enabled()); @@ -1523,7 +1523,7 @@ std::tuple lstm( std::tuple lstm_cell( const Tensor& input, TensorList hx, - const Tensor& w_ih, const Tensor& w_hh, const c10::optional& b_ih_opt, const c10::optional& b_hh_opt) { + const Tensor& w_ih, const Tensor& w_hh, const std::optional& b_ih_opt, const c10::optional& b_hh_opt) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned b_ih_maybe_owned = at::borrow_from_optional_tensor(b_ih_opt); const Tensor& b_ih = *b_ih_maybe_owned; @@ -1539,9 +1539,9 @@ std::tuple lstm_cell( } std::tuple -_thnn_differentiable_lstm_cell_backward( const c10::optional& grad_hy_opt, const c10::optional& grad_cy_opt, +_thnn_differentiable_lstm_cell_backward( const std::optional& grad_hy_opt, const c10::optional& grad_cy_opt, const Tensor& input_gates, - const Tensor& hidden_gates, const c10::optional& input_bias_opt, const c10::optional& hidden_bias_opt, + const Tensor& hidden_gates, const std::optional& input_bias_opt, const c10::optional& hidden_bias_opt, const Tensor& cx, const Tensor& cy) { // See [Note: hacky wrapper removal for optional tensor] @@ -1597,7 +1597,7 @@ std::tuple _thnn_differentiable_gru_cell const Tensor& grad_hy, const Tensor& input_gates, const Tensor& hidden_gates, - const Tensor& hx, const c10::optional& input_bias_opt, const c10::optional& hidden_bias_opt){ + const Tensor& hx, const std::optional& input_bias_opt, const c10::optional& hidden_bias_opt){ // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned input_bias_maybe_owned = at::borrow_from_optional_tensor(input_bias_opt); const Tensor& input_bias = *input_bias_maybe_owned; @@ -1637,7 +1637,7 @@ std::tuple _thnn_differentiable_gru_cell Tensor gru_cell( const Tensor& input, const Tensor& hx, - const Tensor& w_ih, const Tensor& w_hh, const c10::optional& b_ih_opt, const c10::optional& b_hh_opt) { + const Tensor& w_ih, const Tensor& w_hh, const std::optional& b_ih_opt, const c10::optional& b_hh_opt) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned b_ih_maybe_owned = at::borrow_from_optional_tensor(b_ih_opt); const Tensor& b_ih = *b_ih_maybe_owned; @@ -1651,7 +1651,7 @@ Tensor gru_cell( Tensor rnn_tanh_cell( const Tensor& input, const Tensor& hx, - const Tensor& w_ih, const Tensor& w_hh, const c10::optional& b_ih_opt, const c10::optional& b_hh_opt) { + const Tensor& w_ih, const Tensor& w_hh, const std::optional& b_ih_opt, const c10::optional& b_hh_opt) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned b_ih_maybe_owned = at::borrow_from_optional_tensor(b_ih_opt); const Tensor& b_ih = *b_ih_maybe_owned; @@ -1665,7 +1665,7 @@ Tensor rnn_tanh_cell( Tensor rnn_relu_cell( const Tensor& input, const Tensor& hx, - const Tensor& w_ih, const Tensor& w_hh, const c10::optional& b_ih_opt, const c10::optional& b_hh_opt) { + const Tensor& w_ih, const Tensor& w_hh, const std::optional& 
b_ih_opt, const c10::optional& b_hh_opt) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned b_ih_maybe_owned = at::borrow_from_optional_tensor(b_ih_opt); const Tensor& b_ih = *b_ih_maybe_owned; @@ -1693,7 +1693,7 @@ static std::tuple quantized_lstm_input( bool train, bool bidirectional, bool batch_first, - c10::optional dtype, + std::optional dtype, bool use_dynamic) { auto hx = hx_.vec(); std::vector params; @@ -1747,7 +1747,7 @@ static std::tuple quantized_lstm_input_legacy( bool train, bool bidirectional, bool batch_first, - c10::optional dtype, + std::optional dtype, bool use_dynamic) { TORCH_CHECK( false, @@ -1766,7 +1766,7 @@ static std::tuple quantized_lstm_data( double dropout_p, bool train, bool bidirectional, - c10::optional dtype, + std::optional dtype, bool use_dynamic) { auto hx = hx_.vec(); std::vector params; @@ -1813,7 +1813,7 @@ static std::tuple quantized_lstm_data_legacy( double dropout_p, bool train, bool bidirectional, - c10::optional dtype, + std::optional dtype, bool use_dynamic) { TORCH_CHECK( false, diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp index d29b177c13960..96f6d6f90c87d 100644 --- a/aten/src/ATen/native/ReduceOps.cpp +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -148,7 +148,7 @@ static ScalarType infer_dtype_from_optional( } } -static IntArrayRef optional_to_arrayref(const c10::optional& opt) { +static IntArrayRef optional_to_arrayref(const std::optional& opt) { return opt.has_value() ? opt.value() : IntArrayRef{}; } @@ -217,7 +217,7 @@ TORCH_META_FUNC(any)(const Tensor& self) { static void check_argmax_argmin( const char* name, const Tensor& self, - const c10::optional& dim) { + const std::optional& dim) { if (dim.has_value()) { auto dim_ = maybe_wrap_dim(dim.value(), self.dim()); native::zero_numel_check_dims(self, dim_, name); @@ -229,13 +229,13 @@ static void check_argmax_argmin( } TORCH_META_FUNC(argmax) -(const Tensor& self, c10::optional dim, bool keepdim) { +(const Tensor& self, std::optional dim, bool keepdim) { check_argmax_argmin("argmax()", self, dim); resize_reduction(*this, self, optional_to_arrayref(dim), keepdim, kLong); } TORCH_META_FUNC(argmin) -(const Tensor& self, c10::optional dim, bool keepdim) { +(const Tensor& self, std::optional dim, bool keepdim) { check_argmax_argmin("argmin()", self, dim); resize_reduction(*this, self, optional_to_arrayref(dim), keepdim, kLong); } @@ -245,7 +245,7 @@ static void meta_func_cum_ops( const char* name, const Tensor& self, int64_t dim, - c10::optional dtype) { + std::optional dtype) { // Checking whether 'dim' is valid. 
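For readers of the patch, the change in these hunks is purely a spelling one: by this point in the tree c10::optional is an alias of std::optional, so swapping `c10::optional<T>` for `std::optional<T>` in reduction signatures (argmax/argmin, cumsum/cumprod, etc.) does not alter behaviour. The sketch below is not ATen code; the template arguments (`<int64_t>` here) are inferred, since this rendering of the diff dropped the angle-bracket contents. It only illustrates the optional-dim convention that helpers such as optional_to_arrayref and check_argmax_argmin rely on.

#include <cstdint>
#include <optional>
#include <vector>

// Minimal sketch (assumed names, not ATen): an absent dim means "reduce over
// every dimension"; a present dim selects exactly one, after validation.
std::vector<int64_t> dims_to_reduce(std::optional<int64_t> dim, int64_t ndim) {
  if (dim.has_value()) {
    return {*dim};  // reduce only the explicitly requested dimension
  }
  std::vector<int64_t> all(static_cast<size_t>(ndim));
  for (int64_t i = 0; i < ndim; ++i) {
    all[static_cast<size_t>(i)] = i;  // no dim given: reduce everything
  }
  return all;
}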
maybe_wrap_dim(dim, self.dim()); @@ -264,12 +264,12 @@ static void meta_func_cum_ops( } TORCH_META_FUNC(cumsum) -(const Tensor& self, int64_t dim, c10::optional dtype) { +(const Tensor& self, int64_t dim, std::optional dtype) { meta_func_cum_ops(*this, "cumsum", self, dim, dtype); } TORCH_META_FUNC(cumprod) -(const Tensor& self, int64_t dim, c10::optional dtype) { +(const Tensor& self, int64_t dim, std::optional dtype) { meta_func_cum_ops(*this, "cumprod", self, dim, dtype); } @@ -283,7 +283,7 @@ TORCH_META_FUNC2(prod, dim_int) (const Tensor& self, int64_t dim, bool keepdim, - c10::optional dtype) { + std::optional dtype) { auto out_dtype = infer_dtype_from_optional(self, dtype, maybe_get_output()); resize_reduction(*this, self, dim, keepdim, out_dtype); } @@ -315,7 +315,7 @@ TORCH_META_FUNC2(mean, dim) static ScalarType get_result_or_self_value_dtype( const Tensor& self, const Tensor& result, - const c10::optional& dtype) { + const std::optional& dtype) { if (result.defined()) { return result.scalar_type(); } else { @@ -350,7 +350,7 @@ TORCH_META_FUNC2(norm, ScalarOpt_dim_dtype) } TORCH_META_FUNC(aminmax) -(const Tensor& self, c10::optional dim_opt, bool keepdim) { +(const Tensor& self, std::optional dim_opt, bool keepdim) { DimVector shape; if (dim_opt.has_value()) { auto dim = maybe_wrap_dim(dim_opt.value(), self.ndimension()); @@ -407,7 +407,7 @@ DEFINE_DISPATCH(aminmax_allreduce_stub); TORCH_IMPL_FUNC(aminmax_out) (const Tensor& self, - c10::optional dim_opt, + std::optional dim_opt, bool keepdim, const Tensor& min, const Tensor& max) { @@ -491,7 +491,7 @@ void impl_func_cum_ops( TORCH_IMPL_FUNC(cumsum_out) (const Tensor& self, int64_t dim, - c10::optional dtype, + std::optional dtype, const Tensor& result) { impl_func_cum_ops(self, dim, result, cumsum_stub); } @@ -499,7 +499,7 @@ TORCH_IMPL_FUNC(cumsum_out) TORCH_IMPL_FUNC(cumprod_out) (const Tensor& self, int64_t dim, - c10::optional dtype, + std::optional dtype, const Tensor& result) { impl_func_cum_ops(self, dim, result, cumprod_stub); } @@ -869,7 +869,7 @@ Tensor cummaxmin_backward(const Tensor& grad, const Tensor& input, const Tensor& return result.scatter_add_(dim, indices, grad); } -static Tensor prepend_append_on_dim(const Tensor& self, const c10::optional& prepend, const c10::optional& append, int64_t dim) { +static Tensor prepend_append_on_dim(const Tensor& self, const std::optional& prepend, const c10::optional& append, int64_t dim) { // Helper for diff that handles prepending and appending when at least one is present TORCH_INTERNAL_ASSERT(prepend.has_value() || append.has_value(), "either prepend or append must be have value"); if (!prepend.has_value() && append.has_value()) { @@ -881,7 +881,7 @@ static Tensor prepend_append_on_dim(const Tensor& self, const c10::optional&other, int64_t dim) { +static inline void diff_check_compatible_shape(const Tensor& self, const std::optional&other, int64_t dim) { // Helper for diff that checks whether the shape of the tensor to prepend or append // is compatible with that of input if (other.has_value()) { @@ -902,7 +902,7 @@ static inline void diff_check_compatible_shape(const Tensor& self, const c10::op } } -static inline void diff_check(const Tensor& self, int64_t n, int64_t dim, const c10::optional&prepend, const c10::optional& append) { +static inline void diff_check(const Tensor& self, int64_t n, int64_t dim, const std::optional&prepend, const c10::optional& append) { // Helper for diff that checks whether its parameters are valid TORCH_CHECK( self.dim() >= 1, @@ -943,7 +943,7 
@@ static inline Tensor diff_helper(const Tensor& self, int64_t n, int64_t dim) { return result; } -Tensor diff(const Tensor& self, int64_t n, int64_t dim, const c10::optional& prepend, const c10::optional& append) { +Tensor diff(const Tensor& self, int64_t n, int64_t dim, const std::optional& prepend, const c10::optional& append) { diff_check(self, n, dim, prepend, append); if ((!prepend.has_value() && !append.has_value()) || n == 0) { return diff_helper(self, n, dim); @@ -987,7 +987,7 @@ static inline Tensor& diff_out_helper(const Tensor& self, int64_t n, int64_t dim return result; } -Tensor& diff_out(const Tensor& self, int64_t n, int64_t dim, const c10::optional& prepend, const c10::optional& append, Tensor& result) { +Tensor& diff_out(const Tensor& self, int64_t n, int64_t dim, const std::optional& prepend, const c10::optional& append, Tensor& result) { diff_check(self, n, dim, prepend, append); if ((!prepend.has_value() && !append.has_value()) || n == 0) { return diff_out_helper(self, n, dim, result); @@ -997,7 +997,7 @@ Tensor& diff_out(const Tensor& self, int64_t n, int64_t dim, const c10::optional } } -static void pre_check_gradient(const Tensor& self, c10::optional spacing_size, at::OptionalIntArrayRef dim, int64_t edge_order) { +static void pre_check_gradient(const Tensor& self, std::optional spacing_size, at::OptionalIntArrayRef dim, int64_t edge_order) { // Helper for gradient function to make sure input data satisfies prerequisites TORCH_CHECK(self.scalar_type() != ScalarType::Byte, "torch.gradient does not support uint8 input."); if (spacing_size.has_value() && !dim.has_value()) { @@ -1088,7 +1088,7 @@ static std::vector gradient_helper_float(const Tensor& self, ArrayRef gradient_dim_preprocess(const Tensor& self, c10::optional dim) { +static std::vector gradient_dim_preprocess(const Tensor& self, std::optional dim) { // if gradient dim is provided as an integer, then we need to compute gradient only on this direction. // Moreover, if it's not provided at all, then we are interested in gradient for all directions. // Finally, if dim is provided as vector of ints, then it is not expected to be called by this function. @@ -1103,16 +1103,16 @@ static std::vector gradient_dim_preprocess(const Tensor& self, c10::opt std::vector gradient(const Tensor& self, TensorList coordinates, IntArrayRef dim, int64_t edge_order) { pre_check_gradient(self, - c10::optional(coordinates.size()), + std::optional(coordinates.size()), at::OptionalIntArrayRef(dim), edge_order); return gradient_helper(self, coordinates, dim, edge_order); } -std::vector gradient(const Tensor& self, TensorList coordinates, c10::optional dim, int64_t edge_order) { +std::vector gradient(const Tensor& self, TensorList coordinates, std::optional dim, int64_t edge_order) { const auto processed_dim = gradient_dim_preprocess(self, dim); pre_check_gradient(self, - c10::optional(coordinates.size()), + std::optional(coordinates.size()), dim.has_value() ? 
at::OptionalIntArrayRef(processed_dim) : c10::nullopt, edge_order); return gradient_helper(self, coordinates, processed_dim, edge_order); @@ -1120,16 +1120,16 @@ std::vector gradient(const Tensor& self, TensorList coordinates, c10::op std::vector gradient(const Tensor& self, c10::ArrayRef spacing, IntArrayRef dim, int64_t edge_order) { pre_check_gradient(self, - c10::optional(spacing.size()), + std::optional(spacing.size()), at::OptionalIntArrayRef(dim), edge_order); return gradient_helper_float(self, spacing, dim, edge_order); } -std::vector gradient(const Tensor& self, ArrayRef spacing, c10::optional dim, int64_t edge_order) { +std::vector gradient(const Tensor& self, ArrayRef spacing, std::optional dim, int64_t edge_order) { const auto processed_dim = gradient_dim_preprocess(self, dim); pre_check_gradient(self, - c10::optional(spacing.size()), + std::optional(spacing.size()), dim.has_value() ? at::OptionalIntArrayRef(processed_dim) : c10::nullopt, edge_order); return gradient_helper_float(self, spacing, processed_dim, edge_order); @@ -1140,13 +1140,13 @@ std::vector gradient(const Tensor& self, const Scalar& unit_size, IntArr // be taken as unit size at every given dimension element of - dim. std::vector spacing(dim.size(), unit_size); pre_check_gradient(self, - c10::optional(spacing.size()), + std::optional(spacing.size()), at::OptionalIntArrayRef(dim), edge_order); return gradient_helper_float(self, spacing, dim, edge_order); } -std::vector gradient(const Tensor& self, const c10::optional& unit_size, c10::optional dim, int64_t edge_order) { +std::vector gradient(const Tensor& self, const std::optional& unit_size, c10::optional dim, int64_t edge_order) { const auto processed_dim = gradient_dim_preprocess(self, dim); // When unit_size not provided, it is always assumed to be equal to 1. // When dim has integer value it implies we are looking for gradient in the specific direction, however when @@ -1154,7 +1154,7 @@ std::vector gradient(const Tensor& self, const c10::optional& un std::vector spacing(dim.has_value() ? 1 : self.dim(), unit_size.has_value() ? unit_size.value() : 1.0) ; pre_check_gradient(self, - unit_size.has_value() ? c10::optional(spacing.size()) : c10::nullopt, + unit_size.has_value() ? std::optional(spacing.size()) : c10::nullopt, dim.has_value() ? 
at::OptionalIntArrayRef(processed_dim) : c10::nullopt, edge_order); return gradient_helper_float(self, spacing, processed_dim, edge_order); @@ -1163,7 +1163,7 @@ std::vector gradient(const Tensor& self, const c10::optional& un std::vector gradient(const Tensor& self, IntArrayRef dim, int64_t edge_order) { std::vector spacing(dim.size(), 1.0) ; pre_check_gradient(self, - c10::optional(spacing.size()), + std::optional(spacing.size()), at::OptionalIntArrayRef(dim), edge_order); return gradient_helper_float(self, spacing, dim, edge_order); @@ -1217,11 +1217,11 @@ TORCH_IMPL_FUNC(sum_out) } } -Tensor sum(const Tensor &self, c10::optional dtype) { +Tensor sum(const Tensor &self, std::optional dtype) { return at::sum(self, IntArrayRef{}, false, dtype); } -Tensor sum(const Tensor& self, DimnameList dim, bool keepdim, c10::optional dtype) { +Tensor sum(const Tensor& self, DimnameList dim, bool keepdim, std::optional dtype) { return at::sum(self, dimnames_to_positions(self, dim), keepdim, dtype); } @@ -1252,7 +1252,7 @@ Tensor& nansum_out(const Tensor& self, at::OptionalIntArrayRef dim, return result; } -Tensor nansum(const Tensor& self, at::OptionalIntArrayRef dim, bool keepdim, c10::optional opt_dtype) { +Tensor nansum(const Tensor& self, at::OptionalIntArrayRef dim, bool keepdim, std::optional opt_dtype) { ScalarType dtype = get_dtype_from_self(self, opt_dtype, true); Tensor result = create_reduction_result(self, dim, keepdim, dtype); return at::native::nansum_out(self, dim, keepdim, dtype, result); @@ -1306,7 +1306,7 @@ static void impl_func_prod( const Tensor& self, IntArrayRef dims, bool keepdim, - c10::optional dtype, + std::optional dtype, const Tensor& result) { auto iter = meta::make_reduction_from_out_ty(self, result, dims, keepdim, result.scalar_type()); if (iter.numel() == 0) { @@ -1320,12 +1320,12 @@ TORCH_IMPL_FUNC(prod_out) (const Tensor& self, int64_t dim, bool keepdim, - c10::optional dtype, + std::optional dtype, const Tensor& result) { impl_func_prod(self, dim, keepdim, dtype, result); } -Tensor prod(const Tensor &self, c10::optional opt_dtype) { +Tensor prod(const Tensor &self, std::optional opt_dtype) { auto dtype = get_dtype_from_self(self, opt_dtype, true); auto shape = meta::get_reduction_shape(self, {}, false); Tensor result = at::empty(shape, self.options().dtype(dtype)); @@ -1333,7 +1333,7 @@ Tensor prod(const Tensor &self, c10::optional opt_dtype) { return result; } -Tensor prod(const Tensor& self, Dimname dim, bool keepdim, c10::optional dtype) { +Tensor prod(const Tensor& self, Dimname dim, bool keepdim, std::optional dtype) { return at::prod(self, dimname_to_position(self, dim), keepdim, dtype); } @@ -1346,7 +1346,7 @@ TORCH_IMPL_FUNC(mean_out) (const Tensor& self, OptionalIntArrayRef opt_dim, bool keepdim, - c10::optional opt_dtype, + std::optional opt_dtype, const Tensor& result) { ScalarType dtype = result.scalar_type(); // TODO: the TensorIterator reduction implementation of mean @@ -1407,7 +1407,7 @@ Tensor mean(const Tensor& self, DimnameList dim, bool keepdim, optional opt_dtype, Tensor& result) { + bool keepdim, std::optional opt_dtype, Tensor& result) { return at::mean_out(result, self, dimnames_to_positions(self, dim), keepdim, opt_dtype); } @@ -1416,7 +1416,7 @@ Tensor& nanmean_out( const Tensor& self, at::OptionalIntArrayRef dim, bool keepdim, - c10::optional opt_dtype, + std::optional opt_dtype, Tensor& result) { TORCH_CHECK( self.is_floating_point() || self.is_complex(), @@ -1703,7 +1703,7 @@ TORCH_IMPL_FUNC(amax_out) (const Tensor& self, IntArrayRef 
dim, bool keepdim, co template void argmax_argmin_impl( const Tensor& self, - c10::optional dim, + std::optional dim, bool keepdim, const Tensor& result, Stub& stub) { @@ -1737,7 +1737,7 @@ void argmax_argmin_impl( TORCH_IMPL_FUNC(argmax_out) (const Tensor& self, - c10::optional dim, + std::optional dim, bool keepdim, const Tensor& result) { argmax_argmin_impl(self, dim, keepdim, result, argmax_stub); @@ -1745,7 +1745,7 @@ TORCH_IMPL_FUNC(argmax_out) TORCH_IMPL_FUNC(argmin_out) (const Tensor& self, - c10::optional dim, + std::optional dim, bool keepdim, const Tensor& result) { argmax_argmin_impl(self, dim, keepdim, result, argmin_stub); @@ -1812,7 +1812,7 @@ namespace { static Tensor& std_var_out( const char* fname, Tensor& result, const Tensor& self, - at::OptionalIntArrayRef dim, const c10::optional& correction_opt, + at::OptionalIntArrayRef dim, const std::optional& correction_opt, bool keepdim, bool take_sqrt) { TORCH_CHECK(self.device().is_cpu() || self.device().is_cuda(), "std and var only supports tensors on a CPU or CUDA device, but got: ", @@ -1884,7 +1884,7 @@ static Tensor& std_var_out( static std::tuple std_var_mean_out( const char* fname, Tensor& result1, Tensor& result2, const Tensor& self, - at::OptionalIntArrayRef dim, const c10::optional& correction_opt, + at::OptionalIntArrayRef dim, const std::optional& correction_opt, bool keepdim, bool take_sqrt) { AT_ASSERT(result1.defined() && result2.defined()); TORCH_CHECK(self.device().is_cpu() || self.is_cuda(), @@ -1995,7 +1995,7 @@ static TensorOptions options_to_value_type(TensorOptions opts) { std::tuple var_mean( const Tensor& self, at::OptionalIntArrayRef dim, - const c10::optional& correction, bool keepdim) { + const std::optional& correction, bool keepdim) { Tensor result1 = at::empty({0}, options_to_value_type(self.options())); Tensor result2 = at::empty({0}, self.options()); return std_var_mean_out( @@ -2004,7 +2004,7 @@ std::tuple var_mean( std::tuple std_mean( const Tensor& self, at::OptionalIntArrayRef dim, - const c10::optional& correction, bool keepdim) { + const std::optional& correction, bool keepdim) { Tensor result1 = at::empty({0}, options_to_value_type(self.options())); Tensor result2 = at::empty({0}, self.options()); return std_var_mean_out( @@ -2047,26 +2047,26 @@ Tensor& std_out(const Tensor& self, at::OptionalIntArrayRef opt_dim, bool unbias } Tensor std(const Tensor& self, at::OptionalIntArrayRef dim, - const c10::optional& correction, bool keepdim) { + const std::optional& correction, bool keepdim) { Tensor result = at::empty({0}, options_to_value_type(self.options())); return std_var_out("std", result, self, dim, correction, keepdim, true); } Tensor& std_out( const Tensor& self, at::OptionalIntArrayRef dim, - const c10::optional& correction, bool keepdim, Tensor& result) { + const std::optional& correction, bool keepdim, Tensor& result) { return std_var_out("std", result, self, dim, correction, keepdim, true); } Tensor& var_out( const Tensor& self, at::OptionalIntArrayRef dim, - const c10::optional& correction, bool keepdim, Tensor& result) { + const std::optional& correction, bool keepdim, Tensor& result) { return std_var_out("var", result, self, dim, correction, keepdim, false); } Tensor var( const Tensor& self, at::OptionalIntArrayRef dim, - const c10::optional& correction, bool keepdim) { + const std::optional& correction, bool keepdim) { Tensor result = at::empty({0}, options_to_value_type(self.options())); return std_var_out("var", result, self, dim, correction, keepdim, false); } @@ -2096,32 
+2096,32 @@ std::tuple std_mean(const Tensor& self, DimnameList dim, bool unb return at::std_mean(self, dimnames_to_positions(self, dim), unbiased, keepdim); } -Tensor std(const Tensor& self, DimnameList dim, const c10::optional& correction, bool keepdim) { +Tensor std(const Tensor& self, DimnameList dim, const std::optional& correction, bool keepdim) { return at::std(self, dimnames_to_positions(self, dim), correction, keepdim); } -Tensor& std_out(const Tensor& self, DimnameList dim, const c10::optional& correction, +Tensor& std_out(const Tensor& self, DimnameList dim, const std::optional& correction, bool keepdim, Tensor& result) { return at::std_out(result, self, dimnames_to_positions(self, dim), correction, keepdim); } -Tensor var(const Tensor& self, DimnameList dim, const c10::optional& correction, bool keepdim) { +Tensor var(const Tensor& self, DimnameList dim, const std::optional& correction, bool keepdim) { return at::var(self, dimnames_to_positions(self, dim), correction, keepdim); } -Tensor& var_out(const Tensor& self, DimnameList dim, const c10::optional& correction, +Tensor& var_out(const Tensor& self, DimnameList dim, const std::optional& correction, bool keepdim, Tensor& result) { return at::var_out( result, self, dimnames_to_positions(self, dim), correction, keepdim); } std::tuple var_mean(const Tensor& self, DimnameList dim, - const c10::optional& correction, bool keepdim) { + const std::optional& correction, bool keepdim) { return at::var_mean(self, dimnames_to_positions(self, dim), correction, keepdim); } std::tuple std_mean(const Tensor& self, DimnameList dim, - const c10::optional& correction, bool keepdim) { + const std::optional& correction, bool keepdim) { return at::std_mean(self, dimnames_to_positions(self, dim), correction, keepdim); } @@ -2167,22 +2167,22 @@ Tensor logcumsumexp(const Tensor& self, Dimname dim) { Tensor& logcumsumexp_out(const Tensor& self, Dimname dim, Tensor& result) { return at::logcumsumexp_out(result, self, dimname_to_position(self, dim)); } -Tensor cumsum(const Tensor& self, Dimname dim, c10::optional dtype) { +Tensor cumsum(const Tensor& self, Dimname dim, std::optional dtype) { return at::cumsum(self, dimname_to_position(self, dim), dtype); } -Tensor& cumsum_(Tensor& self, Dimname dim, c10::optional dtype) { +Tensor& cumsum_(Tensor& self, Dimname dim, std::optional dtype) { return at::cumsum_out(self, self, dimname_to_position(self, dim), dtype); } -Tensor& cumsum_out(const Tensor& self, Dimname dim, c10::optional dtype, Tensor& result) { +Tensor& cumsum_out(const Tensor& self, Dimname dim, std::optional dtype, Tensor& result) { return at::cumsum_out(result, self, dimname_to_position(self, dim), dtype); } -Tensor cumprod(const Tensor& self, Dimname dim, c10::optional dtype) { +Tensor cumprod(const Tensor& self, Dimname dim, std::optional dtype) { return at::cumprod(self, dimname_to_position(self, dim), dtype); } -Tensor& cumprod_(Tensor& self, Dimname dim, c10::optional dtype) { +Tensor& cumprod_(Tensor& self, Dimname dim, std::optional dtype) { return at::cumprod_out(self, self, dimname_to_position(self, dim), dtype); } -Tensor& cumprod_out(const Tensor& self, Dimname dim, c10::optional dtype, Tensor& result) { +Tensor& cumprod_out(const Tensor& self, Dimname dim, std::optional dtype, Tensor& result) { return at::cumprod_out(result, self, dimname_to_position(self, dim), dtype); } std::tuple cummax(const Tensor& self, Dimname dim) { @@ -2303,15 +2303,15 @@ Tensor value_selecting_reduction_backward_symint(const Tensor& grad, int64_t dim 
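The std/var family above now takes `const std::optional<Scalar>& correction` (template argument inferred; it is not visible in this rendering). A standalone sketch of the intended semantics, under the usual reading that a missing correction falls back to the unbiased default of 1 via value_or:

#include <optional>
#include <vector>

// Minimal sketch (assumed function, not ATen): variance with an optional
// correction term; absent correction behaves like correction = 1.
double variance_sketch(const std::vector<double>& x,
                       std::optional<double> correction) {
  const double c = correction.value_or(1.0);  // Bessel's correction by default
  double mean = 0.0;
  for (double v : x) mean += v;
  mean /= static_cast<double>(x.size());
  double ss = 0.0;
  for (double v : x) ss += (v - mean) * (v - mean);
  return ss / (static_cast<double>(x.size()) - c);  // divide by n - correction
}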
return inplace_scatter_if_not_tensor_subclass(grad, indices); } -Tensor sum_csr(const Tensor &self, c10::optional dtype) { +Tensor sum_csr(const Tensor &self, std::optional dtype) { return self.values().sum(dtype); } -Tensor sum_coo(const Tensor &self, c10::optional dtype) { +Tensor sum_coo(const Tensor &self, std::optional dtype) { return self._values().sum(dtype); } -Tensor sum_sparse_coo(const Tensor& self, at::OptionalIntArrayRef dim, bool keepdim, c10::optional dtype) { +Tensor sum_sparse_coo(const Tensor& self, at::OptionalIntArrayRef dim, bool keepdim, std::optional dtype) { Tensor result; if (dim.has_value()) { if (dtype.has_value()) { @@ -2341,7 +2341,7 @@ Tensor sum_sparse_compressed( const Tensor& self, at::OptionalIntArrayRef dim, bool keepdim, - c10::optional dtype) { + std::optional dtype) { // TODO: The signature of sum.dim_IntList and _sparse_csr_sum.dim_dtype is a little // bit different in the second parameters `dim`, which causes the conversion of `dim` // to call into `_sparse_csr_sum`. Align the signatures would be a better choice. diff --git a/aten/src/ATen/native/ReduceOps.h b/aten/src/ATen/native/ReduceOps.h index 604d6ae8a74ef..d834f17a6d774 100644 --- a/aten/src/ATen/native/ReduceOps.h +++ b/aten/src/ATen/native/ReduceOps.h @@ -33,7 +33,7 @@ using reduce_std_var_function = DECLARE_DISPATCH(reduce_std_var_function, std_var_stub); using reduce_norm_fn = - void (*)(Tensor&, const Tensor&, const c10::Scalar&, c10::optional); + void (*)(Tensor&, const Tensor&, const c10::Scalar&, std::optional); DECLARE_DISPATCH(reduce_norm_fn, norm_kernel); using reduce_fn_flag = void(*)(TensorIterator &, const c10::Scalar&); diff --git a/aten/src/ATen/native/ReduceOpsUtils.h b/aten/src/ATen/native/ReduceOpsUtils.h index 6989b00f6f3e6..505cf3bb3a778 100644 --- a/aten/src/ATen/native/ReduceOpsUtils.h +++ b/aten/src/ATen/native/ReduceOpsUtils.h @@ -77,7 +77,7 @@ inline bool _dimreduce_return_trivial_no_ident(Tensor &result, const Tensor &sel return false; } -inline c10::optional _allreduce_return_trivial( +inline std::optional _allreduce_return_trivial( const Tensor& self, const Scalar& ident) { // Return identity @@ -102,7 +102,7 @@ static inline void check_scalar_type_device_layout_equal(const Tensor& out, cons OPTION_TYPE_EQUALITY_CHECK(layout, out.options(), self.options()); } -static inline Tensor integer_upcast(const Tensor& self, c10::optional dtype) { +static inline Tensor integer_upcast(const Tensor& self, std::optional dtype) { ScalarType scalarType = self.scalar_type(); TORCH_CHECK(!isBarebonesUnsignedType(scalarType), "integer upcasting for uint16, uint32 and uint64 is not currently implemented"); ScalarType upcast_scalarType = dtype.value_or(at::isIntegralType(scalarType, /*includeBool=*/true) ? ScalarType::Long : scalarType); @@ -323,7 +323,7 @@ static C10_UNUSED void zero_numel_tensor_resize(Tensor& result, Tensor& result_i inline ScalarType get_dtype_from_self( const Tensor& self, - const c10::optional& dtype, + const std::optional& dtype, bool promote_integers) { if (dtype.has_value()) { return dtype.value(); @@ -335,7 +335,7 @@ inline ScalarType get_dtype_from_self( return src_type; } -inline ScalarType get_dtype_from_result(Tensor& result, c10::optional dtype) { +inline ScalarType get_dtype_from_result(Tensor& result, std::optional dtype) { TORCH_CHECK(result.defined(), "Cannot create a new tensor inside a reduction op. 
You likely tried to call an operator with an out argument but the out argument was an undefined tensor."); if (dtype.has_value()) { return dtype.value(); diff --git a/aten/src/ATen/native/Repeat.cpp b/aten/src/ATen/native/Repeat.cpp index dd87cead1f480..8bd253134b7a9 100644 --- a/aten/src/ATen/native/Repeat.cpp +++ b/aten/src/ATen/native/Repeat.cpp @@ -41,7 +41,7 @@ namespace at::native { Tensor repeat_interleave_cpu( const Tensor& repeat, - c10::optional output_size) { + std::optional output_size) { Tensor output; AT_DISPATCH_INDEX_TYPES(repeat.scalar_type(), "repeat_interleave_cpu", [&]() { output = repeat_interleave_common>( @@ -54,8 +54,8 @@ Tensor repeat_interleave_cpu( Tensor repeat_interleave_symint( const Tensor& self, const Tensor& repeats, - c10::optional dim, - c10::optional output_size) { + std::optional dim, + std::optional output_size) { Tensor input = self; // Store conj and neg bits @@ -101,8 +101,8 @@ Tensor repeat_interleave_symint( Tensor repeat_interleave_symint( const Tensor& self, c10::SymInt repeats, - c10::optional dim_opt, - c10::optional output_size) { + std::optional dim_opt, + std::optional output_size) { Tensor input = dim_opt ? self : self.flatten(); int64_t dim = c10::maybe_wrap_dim(dim_opt.value_or(0), self.dim()); TORCH_CHECK(repeats >= 0, "Repeats must be non-negative"); diff --git a/aten/src/ATen/native/Repeat.h b/aten/src/ATen/native/Repeat.h index e9a471d16f931..879a09bddd99b 100644 --- a/aten/src/ATen/native/Repeat.h +++ b/aten/src/ATen/native/Repeat.h @@ -17,7 +17,7 @@ template < void compute(const index_t*, const int64_t*, index_t*, int64_t, int64_t)> static inline Tensor repeat_interleave_common( const Tensor& repeats, - c10::optional output_size) { + std::optional output_size) { TORCH_CHECK( repeats.dim() == 1, "repeat_interleave only accept 1D vector as repeat"); TORCH_CHECK( diff --git a/aten/src/ATen/native/Resize.cpp b/aten/src/ATen/native/Resize.cpp index be88538ed7082..fd06627b70277 100644 --- a/aten/src/ATen/native/Resize.cpp +++ b/aten/src/ATen/native/Resize.cpp @@ -136,7 +136,7 @@ const Tensor& resize_as_sparse_(const Tensor& self, const Tensor& src); const Tensor& resize_as_( const Tensor& self, const Tensor& the_template, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { if (self.is_sparse() && the_template.is_sparse()) { TORCH_CHECK( !optional_memory_format.has_value(), @@ -243,7 +243,7 @@ template const Tensor& _resize_( const Tensor& self, ArrayRef size, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { auto* self_ = self.unsafeGetTensorImpl(); int64_t old_storage_nbytes = self_->unsafe_storage() ? 
self_->unsafe_storage().sym_nbytes().maybe_as_int().value_or(-1) : 0; // NOLINTNEXTLINE(bugprone-argument-comment) @@ -267,7 +267,7 @@ const Tensor& _resize_( const Tensor& resize_( const Tensor& self, IntArrayRef size, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { if (self.has_names()) { return resize_named_tensor_(self, size, optional_memory_format); } @@ -277,7 +277,7 @@ const Tensor& resize_( const Tensor& resize__symint( const Tensor& self, c10::SymIntArrayRef size, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { TORCH_INTERNAL_ASSERT(!self.has_names()) return _resize_(self, size, optional_memory_format); } diff --git a/aten/src/ATen/native/ResizeCommon.h b/aten/src/ATen/native/ResizeCommon.h index 02d1e95c42efe..cea2612a22127 100644 --- a/aten/src/ATen/native/ResizeCommon.h +++ b/aten/src/ATen/native/ResizeCommon.h @@ -32,7 +32,7 @@ inline T storage_size_for(ArrayRef size, ArrayRef stride) { inline const Tensor& resize_named_tensor_( const Tensor& self, IntArrayRef size, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { TORCH_INTERNAL_ASSERT(self.has_names()); TORCH_CHECK( self.sizes() == size, diff --git a/aten/src/ATen/native/ScatterGatherChecks.h b/aten/src/ATen/native/ScatterGatherChecks.h index 829959c347035..4aad28eb1f73a 100644 --- a/aten/src/ATen/native/ScatterGatherChecks.h +++ b/aten/src/ATen/native/ScatterGatherChecks.h @@ -15,7 +15,7 @@ static void scatter_gather_dtype_check( const std::string& method_name, const Tensor& self, const Tensor& index, - const c10::optional& src_opt = c10::nullopt + const std::optional& src_opt = c10::nullopt ) { if (index.numel() != 0) { TORCH_CHECK( @@ -66,7 +66,7 @@ static C10_UNUSED void gather_shape_check(const Tensor& self, int64_t dim, // 3. 
index.dim() == self.dim() == src.dim() static C10_UNUSED void scatter_shape_check( const Tensor& self, int64_t dim, const Tensor& index, - const c10::optional& src_opt = c10::nullopt + const std::optional& src_opt = c10::nullopt ) { if (index.numel() == 0) return; TORCH_CHECK( diff --git a/aten/src/ATen/native/SegmentReduce.cpp b/aten/src/ATen/native/SegmentReduce.cpp index 3c7b539ee4b6d..0ab01bbe8c0bd 100644 --- a/aten/src/ATen/native/SegmentReduce.cpp +++ b/aten/src/ATen/native/SegmentReduce.cpp @@ -33,7 +33,7 @@ void _segment_reduce_lengths_cpu_kernel1( const Tensor& data, const T* lengths_data, int64_t axis, - const c10::optional& initial, + const std::optional& initial, Tensor& output, int64_t segment_count, int64_t lengths_stride_axis) { @@ -132,7 +132,7 @@ Tensor _segment_reduce_lengths_cpu_kernel( const Tensor& data, const Tensor& lengths, int64_t axis, - const c10::optional& initial) { + const std::optional& initial) { // data and lengths should be contiguous from the call to .contiguous in segment_reduce_kernel TORCH_CHECK(data.is_contiguous(), "Expected data to be contiguous."); TORCH_CHECK(lengths.is_contiguous(), "Expected lengths to be contiguous."); @@ -158,7 +158,7 @@ Tensor _segment_reduce_offsets_cpu_kernel( const Tensor& data, const Tensor& offsets, int64_t axis, - const c10::optional& initial) { + const std::optional& initial) { // data and lengths should be contiguous from the call to .contiguous in segment_reduce_kernel TORCH_CHECK(data.is_contiguous(), "Expected data to be contiguous."); TORCH_CHECK(offsets.is_contiguous(), "Expected offsets to be contiguous."); @@ -187,7 +187,7 @@ void _segment_reduce_cpu_lengths_backward_kernel1( ReductionType reduction, const T* lengths_data, int64_t axis, - const c10::optional& initial, + const std::optional& initial, Tensor& grad_input, int64_t segment_count, int64_t lengths_stride_axis) { @@ -323,7 +323,7 @@ Tensor _segment_reduce_cpu_lengths_backward_kernel( ReductionType reduction, const Tensor& lengths_contig, int64_t axis, - const c10::optional& initial) { + const std::optional& initial) { axis = lengths_contig.dim() - 1; int64_t segment_count = lengths_contig.size(axis); int64_t lengths_stride_axis = lengths_contig.stride(axis); @@ -356,7 +356,7 @@ Tensor _segment_reduce_cpu_offsets_backward_kernel( ReductionType reduction, const Tensor& offsets_contig, int64_t axis, - const c10::optional& initial) { + const std::optional& initial) { axis = offsets_contig.dim() - 1; int64_t segment_count = offsets_contig.size(axis) - 1; int64_t offsets_stride_axis = offsets_contig.stride(axis); @@ -386,12 +386,12 @@ Tensor _segment_reduce_cpu_offsets_backward_kernel( Tensor segment_reduce_kernel( const Tensor& data, c10::string_view reduce, - const c10::optional& lengths, - const c10::optional& indices, - const c10::optional& offsets, + const std::optional& lengths, + const std::optional& indices, + const std::optional& offsets, int64_t axis, bool unsafe, - const c10::optional& initial) { + const std::optional& initial) { axis = maybe_wrap_dim(axis, data.ndimension()); TORCH_CHECK(data.numel() >= 0); @@ -484,13 +484,13 @@ Tensor _segment_reduce_backward_kernel( const Tensor& output, const Tensor& data, c10::string_view reduce, - const c10::optional& lengths, - const c10::optional& offsets, + const std::optional& lengths, + const std::optional& offsets, int64_t axis, - const c10::optional& initial) { + const std::optional& initial) { axis = maybe_wrap_dim(axis, data.ndimension()); // check that one of lengths or offsets is defined - // 
codegen for derivatives.yaml passes an undefined Tensor for None rather than a c10::optional + // codegen for derivatives.yaml passes an undefined Tensor for None rather than a std::optional // so checking .has_value() doesn't work unlike in the forward pass auto lengths_has_value = lengths.has_value() && lengths.value().defined(); auto offsets_has_value = offsets.has_value() && offsets.value().defined(); diff --git a/aten/src/ATen/native/SegmentReduce.h b/aten/src/ATen/native/SegmentReduce.h index 0f14aff64f887..44429d0594bfc 100644 --- a/aten/src/ATen/native/SegmentReduce.h +++ b/aten/src/ATen/native/SegmentReduce.h @@ -15,7 +15,7 @@ using segment_reduce_lengths_fn = Tensor (*)( const Tensor&, const Tensor&, int64_t, - const c10::optional&); + const std::optional&); DECLARE_DISPATCH(segment_reduce_lengths_fn, _segment_reduce_lengths_stub); using segment_reduce_offsets_fn = Tensor (*)( @@ -23,7 +23,7 @@ using segment_reduce_offsets_fn = Tensor (*)( const Tensor&, const Tensor&, int64_t, - const c10::optional&); + const std::optional&); DECLARE_DISPATCH(segment_reduce_offsets_fn, _segment_reduce_offsets_stub); using segment_reduce_lengths_backward_fn = Tensor (*)( @@ -33,7 +33,7 @@ using segment_reduce_lengths_backward_fn = Tensor (*)( ReductionType, const Tensor&, int64_t, - const c10::optional&); + const std::optional&); DECLARE_DISPATCH(segment_reduce_lengths_backward_fn, _segment_reduce_lengths_backward_stub); using segment_reduce_offsets_backward_fn = Tensor (*)( @@ -43,7 +43,7 @@ using segment_reduce_offsets_backward_fn = Tensor (*)( ReductionType, const Tensor&, int64_t, - const c10::optional&); + const std::optional&); DECLARE_DISPATCH(segment_reduce_offsets_backward_fn, _segment_reduce_offsets_backward_stub); } // namespace native diff --git a/aten/src/ATen/native/SoftMax.cpp b/aten/src/ATen/native/SoftMax.cpp index bd321a0a88e7a..3188479b931f3 100644 --- a/aten/src/ATen/native/SoftMax.cpp +++ b/aten/src/ATen/native/SoftMax.cpp @@ -155,7 +155,7 @@ void host_softmax( const Tensor& input, const int64_t dim, bool* mask = nullptr, - const c10::optional mask_type_ = {}) { + const std::optional mask_type_ = {}) { if (MaskedSoftMax) { TORCH_CHECK(mask_type_.has_value(), "Mask Type should be defined"); @@ -449,7 +449,7 @@ static Tensor softmax(const Tensor& input_, const int64_t dim_) { return result; } -Tensor softmax(const Tensor& input_, const int64_t dim_, c10::optional dtype) { +Tensor softmax(const Tensor& input_, const int64_t dim_, std::optional dtype) { auto result = [&]() { NoNamesGuard guard; if (input_.is_cuda() && input_.scalar_type() == ScalarType::Half && dtype == ScalarType::Float){ @@ -466,7 +466,7 @@ Tensor softmax(const Tensor& input_, const int64_t dim_, c10::optional dtype, + std::optional dtype, Tensor& output_) { Tensor output_temp; if (input_.is_cuda() && input_.scalar_type() == ScalarType::Half && @@ -501,7 +501,7 @@ Tensor& softmax_out( } // special_softmax, alias for softmax -Tensor special_softmax(const Tensor& input_, const int64_t dim_, c10::optional dtype) { +Tensor special_softmax(const Tensor& input_, const int64_t dim_, std::optional dtype) { return at::softmax(input_, dim_, dtype); } @@ -514,7 +514,7 @@ static Tensor log_softmax(const Tensor& input_, const int64_t dim_) { return result; } -Tensor log_softmax(const Tensor& input_, const int64_t dim_, c10::optional dtype) { +Tensor log_softmax(const Tensor& input_, const int64_t dim_, std::optional dtype) { auto result = [&]() { NoNamesGuard guard; if (input_.is_cuda() && input_.scalar_type() == 
ScalarType::Half && dtype == ScalarType::Float){ @@ -531,7 +531,7 @@ Tensor log_softmax(const Tensor& input_, const int64_t dim_, c10::optional dtype, + std::optional dtype, Tensor& output_) { Tensor output_temp; if (input_.is_cuda() && input_.scalar_type() == ScalarType::Half && @@ -565,7 +565,7 @@ Tensor& log_softmax_out( return output_; } -Tensor special_log_softmax(const Tensor& input, const int64_t dim, c10::optional dtype) { +Tensor special_log_softmax(const Tensor& input, const int64_t dim, std::optional dtype) { return at::log_softmax(input, dim, dtype); } @@ -587,7 +587,7 @@ Tensor log_softmax(const Tensor& self, Dimname dim, optional dtype) return at::log_softmax(self, dimname_to_position(self, dim), dtype); } -Tensor masked_softmax_cpu(const Tensor& input_, const Tensor& mask_, const c10::optional dim_, const c10::optional mask_type_) { +Tensor masked_softmax_cpu(const Tensor& input_, const Tensor& mask_, const std::optional dim_, const c10::optional mask_type_) { auto mask = mask_.contiguous(); auto mask_type = mask_type_; // Mask type might get transformed below @@ -652,7 +652,7 @@ Tensor masked_softmax_backward_cpu( const Tensor& grad_, const Tensor& output_, const Tensor& mask_, - const c10::optional dim_) { + const std::optional dim_) { TORCH_CHECK( grad_.sizes() == mask_.sizes(), "Mask shape should match grad shape"); TORCH_CHECK( diff --git a/aten/src/ATen/native/Sorting.cpp b/aten/src/ATen/native/Sorting.cpp index b31007408c7ae..f9980ffd7229d 100644 --- a/aten/src/ATen/native/Sorting.cpp +++ b/aten/src/ATen/native/Sorting.cpp @@ -71,7 +71,7 @@ TORCH_META_FUNC(topk) } TORCH_META_FUNC2(sort, stable) -(const Tensor& self, c10::optional stable, int64_t dim, bool descending) { +(const Tensor& self, std::optional stable, int64_t dim, bool descending) { maybe_wrap_dim(dim, self.dim()); // See issue: https://github.com/pytorch/pytorch/issues/65863 @@ -939,7 +939,7 @@ Tensor nanmedian_cpu(const Tensor& self) { TORCH_IMPL_FUNC(sort_stable_out) (const Tensor& self, - c10::optional stable, + std::optional stable, int64_t dim, bool descending, const Tensor& values, diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index 7ed068874e68a..5f9ff1b838220 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -114,7 +114,7 @@ Tensor promote_tensor_fft(const Tensor& t, bool require_complex=false) { // Convert NumPy compatible normalization mode string to enum values // NOTE: NumPy's normalization modes have direction-specific meanings. For example, // "forward" translates to `by_n` for a forward transform and `none` for backward. -fft_norm_mode norm_from_string(c10::optional norm, bool forward) { +fft_norm_mode norm_from_string(std::optional norm, bool forward) { if (!norm || *norm == "backward") { return forward ? 
fft_norm_mode::none : fft_norm_mode::by_n; } @@ -197,8 +197,8 @@ Tensor fft_c2c_maybe_out( // Complex to real FFT Tensor fft_c2r(c10::string_view function_name, - Tensor out, Tensor input, c10::optional n_opt, - int64_t unwrapped_dim, c10::optional norm_str, + Tensor out, Tensor input, std::optional n_opt, + int64_t unwrapped_dim, std::optional norm_str, bool forward) { TORCH_CHECK(!out.defined() || out.is_floating_point(), function_name, " expects a floating point output tensor, but got ", out.scalar_type()); @@ -221,8 +221,8 @@ Tensor fft_c2r(c10::string_view function_name, // Real to complex FFT Tensor fft_r2c(c10::string_view function_name, - Tensor out, Tensor input, c10::optional n_opt, - int64_t unwrapped_dim, c10::optional norm_str, + Tensor out, Tensor input, std::optional n_opt, + int64_t unwrapped_dim, std::optional norm_str, bool forward, bool onesided) { TORCH_CHECK(!input.is_complex(), function_name, " expects a real input tensor, but got ", input.scalar_type()); @@ -256,8 +256,8 @@ Tensor fft_r2c(c10::string_view function_name, // Complex to complex FFT Tensor fft_c2c(c10::string_view function_name, - Tensor out, Tensor input, c10::optional n_opt, - int64_t unwrapped_dim, c10::optional norm_str, + Tensor out, Tensor input, std::optional n_opt, + int64_t unwrapped_dim, std::optional norm_str, bool forward) { TORCH_CHECK(input.is_complex(), function_name, " expects a complex input tensor, but got ", input.scalar_type()); @@ -346,7 +346,7 @@ ShapeAndDims canonicalize_fft_shape_and_dim_args( Tensor fftn_c2c( c10::string_view function_name, Tensor out, const Tensor& input, SymIntArrayRef shape, - IntArrayRef dim, c10::optional norm_str, bool forward) { + IntArrayRef dim, std::optional norm_str, bool forward) { TORCH_CHECK(input.is_complex(), function_name, " expects a complex input tensor, but got", input.scalar_type()); Tensor x = resize_fft_input(input, dim, shape); const auto norm = static_cast(norm_from_string(norm_str, forward)); @@ -357,15 +357,15 @@ Tensor fftn_c2c( } // namespace (anonymous) // torch.fft.fft, analogous to NumPy's numpy.fft.fft -Tensor fft_fft_symint(const Tensor& self, c10::optional n, int64_t dim, - c10::optional norm) { +Tensor fft_fft_symint(const Tensor& self, std::optional n, int64_t dim, + std::optional norm) { return self.is_complex() ? fft_c2c("fft", {}, self, n, dim, norm, /*forward=*/true) : fft_r2c("fft", {}, self, n, dim, norm, /*forward=*/true, /*onesided=*/false); } -Tensor& fft_fft_symint_out(const Tensor& self, c10::optional n, - int64_t dim, c10::optional norm, Tensor& out) { +Tensor& fft_fft_symint_out(const Tensor& self, std::optional n, + int64_t dim, std::optional norm, Tensor& out) { if (self.is_complex()) { fft_c2c("fft", out, self, n, dim, norm, /*forward=*/true); } else { @@ -374,15 +374,15 @@ Tensor& fft_fft_symint_out(const Tensor& self, c10::optional n, return out; } -Tensor fft_ifft_symint(const Tensor& self, c10::optional n, int64_t dim, - c10::optional norm) { +Tensor fft_ifft_symint(const Tensor& self, std::optional n, int64_t dim, + std::optional norm) { return self.is_complex() ? 
fft_c2c("ifft", {}, self, n, dim, norm, /*forward=*/false) : fft_r2c("ifft", {}, self, n, dim, norm, /*forward=*/false, /*onesided=*/false); } -Tensor& fft_ifft_symint_out(const Tensor& self, c10::optional n, - int64_t dim, c10::optional norm, Tensor& out) { +Tensor& fft_ifft_symint_out(const Tensor& self, std::optional n, + int64_t dim, std::optional norm, Tensor& out) { if (self.is_complex()) { fft_c2c("ifft", out, self, n, dim, norm, /*forward=*/false); } else { @@ -391,53 +391,53 @@ Tensor& fft_ifft_symint_out(const Tensor& self, c10::optional n, return out; } -Tensor fft_rfft_symint(const Tensor& self, c10::optional n, int64_t dim, - c10::optional norm) { +Tensor fft_rfft_symint(const Tensor& self, std::optional n, int64_t dim, + std::optional norm) { return fft_r2c("rfft", {}, self, n, dim, norm, /*forward=*/true, /*onesided=*/true); } -Tensor& fft_rfft_symint_out(const Tensor& self, c10::optional n, - int64_t dim, c10::optional norm, Tensor& out) { +Tensor& fft_rfft_symint_out(const Tensor& self, std::optional n, + int64_t dim, std::optional norm, Tensor& out) { fft_r2c("rfft", out, self, n, dim, norm, /*forward=*/true, /*onesided=*/true); return out; } -Tensor fft_irfft_symint(const Tensor& self, c10::optional n, int64_t dim, - c10::optional norm) { +Tensor fft_irfft_symint(const Tensor& self, std::optional n, int64_t dim, + std::optional norm) { return fft_c2r("irfft", {}, self, n, dim, norm, /*forward=*/false); } -Tensor& fft_irfft_symint_out(const Tensor& self, c10::optional n, - int64_t dim, c10::optional norm, Tensor& out) { +Tensor& fft_irfft_symint_out(const Tensor& self, std::optional n, + int64_t dim, std::optional norm, Tensor& out) { fft_c2r("irfft", out, self, n, dim, norm, /*forward=*/false); return out; } -Tensor fft_hfft_symint(const Tensor& self, c10::optional n, int64_t dim, - c10::optional norm) { +Tensor fft_hfft_symint(const Tensor& self, std::optional n, int64_t dim, + std::optional norm) { return fft_c2r("hfft", {}, self, n, dim, norm, /*forward=*/true); } -Tensor& fft_hfft_symint_out(const Tensor& self, c10::optional n, - int64_t dim, c10::optional norm, Tensor& out) { +Tensor& fft_hfft_symint_out(const Tensor& self, std::optional n, + int64_t dim, std::optional norm, Tensor& out) { fft_c2r("hfft", out, self, n, dim, norm, /*forward=*/true); return out; } -Tensor fft_ihfft_symint(const Tensor& self, c10::optional n, int64_t dim, - c10::optional norm) { +Tensor fft_ihfft_symint(const Tensor& self, std::optional n, int64_t dim, + std::optional norm) { return fft_r2c("ihfft", {}, self, n, dim, norm, /*forward=*/false, /*onesided=*/true); } -Tensor& fft_ihfft_symint_out(const Tensor& self, c10::optional n, - int64_t dim, c10::optional norm, Tensor& out) { +Tensor& fft_ihfft_symint_out(const Tensor& self, std::optional n, + int64_t dim, std::optional norm, Tensor& out) { fft_r2c("ihfft", out, self, n, dim, norm, /*forward=*/false, /*onesided=*/true); return out; } Tensor fft_fftn_symint(const Tensor& self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, - c10::optional norm) { + std::optional norm) { auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); // TODO: For real input, perform rfftn then mirror with conjugate symmetry Tensor input = promote_tensor_fft(self, /*require_complex=*/true); @@ -447,7 +447,7 @@ Tensor fft_fftn_symint(const Tensor& self, at::OptionalSymIntArrayRef s, Tensor& fft_fftn_symint_out(const Tensor& self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, - c10::optional norm, Tensor& out) { + std::optional 
norm, Tensor& out) { auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); // TODO: For real input, perform rfftn then mirror with conjugate symmetry Tensor input = promote_tensor_fft(self, /*require_complex=*/true); @@ -457,7 +457,7 @@ Tensor& fft_fftn_symint_out(const Tensor& self, Tensor fft_ifftn_symint(const Tensor& self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, - c10::optional norm) { + std::optional norm) { auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); Tensor input = promote_tensor_fft(self, /*require_complex=*/true); return fftn_c2c("ifftn", {}, input, desc.shape, desc.dim, norm, /*forward=*/false); @@ -466,7 +466,7 @@ Tensor fft_ifftn_symint(const Tensor& self, at::OptionalSymIntArrayRef s, Tensor& fft_ifftn_symint_out(const Tensor& self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, - c10::optional norm, Tensor& out) { + std::optional norm, Tensor& out) { auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); Tensor input = promote_tensor_fft(self, /*require_complex=*/true); fftn_c2c("ifftn", out, input, desc.shape, desc.dim, norm, /*forward=*/false); @@ -476,7 +476,7 @@ Tensor& fft_ifftn_symint_out(const Tensor& self, static Tensor fft_rfftn_impl(Tensor out, const Tensor& self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, - const c10::optional& norm_str) { + const std::optional& norm_str) { TORCH_CHECK(!self.is_complex(), "rfftn expects a real-valued input tensor, but got ", self.scalar_type()); auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); TORCH_CHECK(!desc.shape.empty(), "rfftn must transform at least one axis"); @@ -489,14 +489,14 @@ static Tensor fft_rfftn_impl(Tensor out, const Tensor& self, Tensor fft_rfftn_symint(const Tensor& self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, - c10::optional norm_str) { + std::optional norm_str) { return fft_rfftn_impl({}, self, s, dim, norm_str); } Tensor& fft_rfftn_symint_out(const Tensor& self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, - c10::optional norm_str, Tensor& out) { + std::optional norm_str, Tensor& out) { fft_rfftn_impl(out, self, s, dim, norm_str); return out; } @@ -528,7 +528,7 @@ static ShapeAndDims canonicalize_fft_c2r_shape_and_dim_args( static Tensor fft_irfftn_impl(Tensor out, const Tensor& self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, - const c10::optional& norm_str) { + const std::optional& norm_str) { SymInt last_dim_size = 0; auto desc = canonicalize_fft_c2r_shape_and_dim_args( "irfftn", self, s, dim, last_dim_size); @@ -542,14 +542,14 @@ static Tensor fft_irfftn_impl(Tensor out, const Tensor& self, Tensor fft_irfftn_symint(const Tensor& self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, - c10::optional norm_str) { + std::optional norm_str) { return fft_irfftn_impl({}, self, s, dim, norm_str); } Tensor& fft_irfftn_symint_out(const Tensor& self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, - c10::optional norm_str, Tensor& out) { + std::optional norm_str, Tensor& out) { fft_irfftn_impl(out, self, s, dim, norm_str); return out; } @@ -558,7 +558,7 @@ static Tensor fft_hfftn_impl( const Tensor& self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, - c10::optional norm_str, + std::optional norm_str, const Tensor& out) { constexpr c10::string_view fname = "hfftn"; SymInt last_dim_size = 0; @@ -586,14 +586,14 @@ Tensor fft_hfftn_symint( const Tensor& self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, - c10::optional norm) { 
+ std::optional norm) { return fft_hfftn_impl(self, s, dim, norm, {}); } const Tensor& fft_hfftn_symint_out( const Tensor& self, at::OptionalSymIntArrayRef s, - at::OptionalIntArrayRef dim, c10::optional norm, + at::OptionalIntArrayRef dim, std::optional norm, const Tensor& out) { fft_hfftn_impl(self, s, dim, norm, out); return out; @@ -603,7 +603,7 @@ static Tensor fft_ihfftn_impl( const Tensor& self, const at::OptionalSymIntArrayRef& s, const at::OptionalIntArrayRef& dim, - const c10::optional& norm_str, + const std::optional& norm_str, const Tensor& out) { constexpr c10::string_view fname = "ihfftn"; auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); @@ -628,7 +628,7 @@ Tensor fft_ihfftn_symint( const Tensor& self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, - c10::optional norm) { + std::optional norm) { return fft_ihfftn_impl(self, s, dim, norm, {}); } @@ -636,71 +636,71 @@ const Tensor& fft_ihfftn_symint_out( const Tensor& self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, - c10::optional norm, + std::optional norm, const Tensor& out) { fft_ihfftn_impl(self, s, dim, norm, out); return out; } Tensor fft_fft2_symint(const Tensor& self, at::OptionalSymIntArrayRef s, - IntArrayRef dim, c10::optional norm) { + IntArrayRef dim, std::optional norm) { return native::fft_fftn_symint(self, s, dim, std::move(norm)); } Tensor& fft_fft2_symint_out(const Tensor& self, at::OptionalSymIntArrayRef s, - IntArrayRef dim, c10::optional norm, Tensor& out) { + IntArrayRef dim, std::optional norm, Tensor& out) { return native::fft_fftn_symint_out(self, s, dim, std::move(norm), out); } Tensor fft_ifft2_symint(const Tensor& self, at::OptionalSymIntArrayRef s, - IntArrayRef dim, c10::optional norm) { + IntArrayRef dim, std::optional norm) { return native::fft_ifftn_symint(self, s, dim, std::move(norm)); } Tensor& fft_ifft2_symint_out(const Tensor& self, at::OptionalSymIntArrayRef s, - IntArrayRef dim, c10::optional norm, Tensor& out) { + IntArrayRef dim, std::optional norm, Tensor& out) { return native::fft_ifftn_symint_out(self, s, dim, std::move(norm), out); } Tensor fft_rfft2_symint(const Tensor& self, at::OptionalSymIntArrayRef s, - IntArrayRef dim, c10::optional norm) { + IntArrayRef dim, std::optional norm) { return native::fft_rfftn_symint(self, s, dim, std::move(norm)); } Tensor& fft_rfft2_symint_out(const Tensor& self, at::OptionalSymIntArrayRef s, - IntArrayRef dim, c10::optional norm, Tensor& out) { + IntArrayRef dim, std::optional norm, Tensor& out) { return native::fft_rfftn_symint_out(self, s, dim, std::move(norm), out); } Tensor fft_irfft2_symint(const Tensor& self, at::OptionalSymIntArrayRef s, - IntArrayRef dim, c10::optional norm) { + IntArrayRef dim, std::optional norm) { return native::fft_irfftn_symint(self, s, dim, std::move(norm)); } Tensor& fft_irfft2_symint_out(const Tensor& self, at::OptionalSymIntArrayRef s, - IntArrayRef dim, c10::optional norm, Tensor& out) { + IntArrayRef dim, std::optional norm, Tensor& out) { return native::fft_irfftn_symint_out(self, s, dim, std::move(norm), out); } const Tensor& fft_hfft2_symint_out( const Tensor& self, at::OptionalSymIntArrayRef s, IntArrayRef dim, - c10::optional norm, const Tensor& out) { + std::optional norm, const Tensor& out) { return native::fft_hfftn_symint_out(self, s, dim, std::move(norm), out); } Tensor fft_hfft2_symint(const Tensor& self, at::OptionalSymIntArrayRef s, - IntArrayRef dim, c10::optional norm) { + IntArrayRef dim, std::optional norm) { return native::fft_hfftn_symint(self, s, 
dim, std::move(norm)); } const Tensor& fft_ihfft2_symint_out( const Tensor& self, at::OptionalSymIntArrayRef s, IntArrayRef dim, - c10::optional norm, const Tensor& out) { + std::optional norm, const Tensor& out) { return native::fft_ihfftn_symint_out(self, s, dim, std::move(norm), out); } Tensor fft_ihfft2_symint(const Tensor& self, at::OptionalSymIntArrayRef s, - IntArrayRef dim, c10::optional norm) { + IntArrayRef dim, std::optional norm) { return native::fft_ihfftn_symint(self, s, dim, std::move(norm)); } @@ -716,10 +716,10 @@ Tensor& fft_fftfreq_out(int64_t n, double d, Tensor& out) { } Tensor fft_fftfreq(int64_t n, double d, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -737,10 +737,10 @@ Tensor& fft_rfftfreq_out(int64_t n, double d, Tensor& out) { } Tensor fft_rfftfreq(int64_t n, double d, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -824,7 +824,7 @@ static Stream& write_opt(Stream& SS, const optional& value) { * signals and complex windows. */ Tensor stft(const Tensor& self, const int64_t n_fft, const optional hop_lengthOpt, - const optional win_lengthOpt, const c10::optional& window_opt, + const optional win_lengthOpt, const std::optional& window_opt, const bool center, c10::string_view mode, const bool normalized, const optional onesidedOpt, const optional return_complexOpt) { // See [Note: hacky wrapper removal for optional tensor] @@ -980,7 +980,7 @@ Tensor stft(const Tensor& self, const int64_t n_fft, const optional hop Tensor stft( const Tensor& self, const int64_t n_fft, const optional hop_lengthOpt, - const optional win_lengthOpt, const c10::optional& window_opt, + const optional win_lengthOpt, const std::optional& window_opt, const bool normalized, const optional onesidedOpt, const optional return_complexOpt) { return at::stft( @@ -1011,8 +1011,8 @@ static Tensor as_complex(const Tensor& self) { * signals and complex windows. 
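The SpectralOps hunks above thread a `std::optional` norm string (previously c10::optional) through every fft/ifft/rfft variant; the dispatch itself is in the norm_from_string hunk earlier in this file. A self-contained sketch of that dispatch, with placeholder names since the real helper returns an ATen-internal fft_norm_mode enum and raises through TORCH_CHECK rather than an exception:

#include <optional>
#include <stdexcept>
#include <string>

enum class NormSketch { none, by_root_n, by_n };

// Sketch only: an absent norm string behaves like "backward"; "forward" and
// "backward" swap meaning depending on transform direction, "ortho" always
// scales by 1/sqrt(n).
NormSketch norm_from_string_sketch(std::optional<std::string> norm, bool forward) {
  if (!norm || *norm == "backward") {
    return forward ? NormSketch::none : NormSketch::by_n;
  }
  if (*norm == "forward") {
    return forward ? NormSketch::by_n : NormSketch::none;
  }
  if (*norm == "ortho") {
    return NormSketch::by_root_n;
  }
  throw std::invalid_argument("invalid norm value: " + *norm);
}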
*/ Tensor istft(const Tensor& self, const int64_t n_fft, const optional hop_lengthOpt, - const optional win_lengthOpt, const c10::optional& window_opt, - const bool center, const bool normalized, const c10::optional onesidedOpt, + const optional win_lengthOpt, const std::optional& window_opt, + const bool center, const bool normalized, const std::optional onesidedOpt, const optional lengthOpt, const bool return_complex) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned window_maybe_owned = at::borrow_from_optional_tensor(window_opt); diff --git a/aten/src/ATen/native/SummaryOps.cpp b/aten/src/ATen/native/SummaryOps.cpp index 4c158f81a47e9..1866f4353b535 100644 --- a/aten/src/ATen/native/SummaryOps.cpp +++ b/aten/src/ATen/native/SummaryOps.cpp @@ -68,7 +68,7 @@ Tensor _bincount_cpu_template( } // namespace Tensor -_bincount_cpu(const Tensor& self, const c10::optional& weights_opt, int64_t minlength) { +_bincount_cpu(const Tensor& self, const std::optional& weights_opt, int64_t minlength) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weights_maybe_owned = at::borrow_from_optional_tensor(weights_opt); const Tensor& weights = *weights_maybe_owned; diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/TensorAdvancedIndexing.cpp index f1e385d8eeac8..395af8e5ef139 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp +++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp @@ -190,8 +190,8 @@ void scatter_meta_impl( const Tensor& self, int64_t dim, const Tensor& index, - const c10::optional& src = nullopt, - const c10::optional reduce = nullopt) { + const std::optional& src = nullopt, + const std::optional reduce = nullopt) { int64_t wrapped_dim = at::maybe_wrap_dim(dim, self.dim()); at::native::scatter_gather_dtype_check("scatter", self, index, src); at::native::scatter_shape_check(self, wrapped_dim, index, src); @@ -629,7 +629,7 @@ TORCH_IMPL_FUNC(index_out) index_stub(device_type(), *this, sizes, strides); } -Tensor quantized_index(const Tensor & self, const torch::List>& indices) { +Tensor quantized_index(const Tensor & self, const torch::List>& indices) { TORCH_INTERNAL_ASSERT( self.qscheme() == c10::kPerTensorAffine || self.qscheme() == c10::kPerTensorSymmetric, @@ -643,7 +643,7 @@ Tensor quantized_index(const Tensor & self, const torch::List>& indices) { +Tensor _unsafe_index(const Tensor& self, const torch::List>& indices) { // Disallow boolean indexing since it leads to dynamic output shapes for (auto i : c10::irange(indices.size())) { auto index = indices.get(i); @@ -702,15 +702,15 @@ Tensor put(const Tensor & self, const Tensor& index, const Tensor & source, cons return self.clone(at::MemoryFormat::Preserve).put_(index, source, accumulate); } -Tensor index_put(const Tensor & self, const torch::List>& indices, const Tensor & value, bool accumulate) { +Tensor index_put(const Tensor & self, const torch::List>& indices, const Tensor & value, bool accumulate) { return self.clone(at::MemoryFormat::Preserve).index_put_(indices, value, accumulate); } -Tensor _unsafe_index_put(const Tensor& self, const torch::List>& indices, const Tensor& value, bool accumulate) { +Tensor _unsafe_index_put(const Tensor& self, const torch::List>& indices, const Tensor& value, bool accumulate) { return at::index_put(self, indices, value, accumulate); } -Tensor & _index_put_impl_(Tensor & self, const torch::List>& indices, const Tensor & value, const bool accumulate, const bool unsafe) { +Tensor & _index_put_impl_(Tensor & 
self, const torch::List>& indices, const Tensor & value, const bool accumulate, const bool unsafe) { TORCH_CHECK_INDEX(indices.size() <= (size_t)self.dim(), "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); if (at::has_internal_overlap(self) == MemOverlap::Yes) { TORCH_WARN( @@ -730,7 +730,7 @@ Tensor & _index_put_impl_(Tensor & self, const torch::List } at::assert_no_overlap(self, value); // NOLINTNEXTLINE(performance-implicit-conversion-in-loop) - for (const c10::optional& index: indices) { + for (const std::optional& index: indices) { if (index.has_value()) { at::assert_no_overlap(self, *index); } @@ -788,7 +788,7 @@ Tensor take(const Tensor& self, const Tensor& index) { return out; } -Tensor & index_put_(Tensor & self, const torch::List>& indices, const Tensor & value, const bool accumulate) { +Tensor & index_put_(Tensor & self, const torch::List>& indices, const Tensor & value, const bool accumulate) { return at::_index_put_impl_(self, indices, value, accumulate, /*unsafe=*/false); } @@ -798,7 +798,7 @@ TORCH_IMPL_FUNC(index_copy_out) // See Note [Enabling Deterministic Operations] if (result.is_cuda() && globalContext().deterministicAlgorithms()){ - torch::List> indices; + torch::List> indices; indices.reserve(dim + 1); for (const auto i: c10::irange(dim)) { (void)i; @@ -1624,7 +1624,7 @@ static void _scatter_via_index_put( const Tensor& mut_out, bool accumulate) { if (self.dim() == 1) { - torch::List> indices; + torch::List> indices; indices.reserve(1); indices.push_back(index); mut_out.index_put_(indices, src, accumulate); @@ -1698,7 +1698,7 @@ static void _scatter_via_index_put( src.strides() ).flatten(); - torch::List> indices; + torch::List> indices; indices.reserve(1); indices.push_back(index_flat); @@ -1719,7 +1719,7 @@ void scatter_impl( const Tensor& out, ReduceStub& reduce_stub, FillStub& fill_stub, - const c10::optional reduce = nullopt, + const std::optional reduce = nullopt, bool reduce_includes_self = true) { dim = at::maybe_wrap_dim(dim, self.dim()); @@ -2123,7 +2123,7 @@ static inline void checkDevice(CheckedFrom c, at::ArrayRef tensors, Devi } // anonymous namespace -Tensor take_along_dim(const Tensor& self, const Tensor& indices, c10::optional opt_dim) { +Tensor take_along_dim(const Tensor& self, const Tensor& indices, std::optional opt_dim) { checkDevice("torch.take_along_dim():", {self, indices}, self.device()); if (opt_dim.has_value()) { auto [self_broadcasted, indices_broadcasted, dim] = @@ -2135,7 +2135,7 @@ Tensor take_along_dim(const Tensor& self, const Tensor& indices, c10::optional opt_dim, Tensor& result) { +Tensor& take_along_dim_out(const Tensor& self, const Tensor& indices, std::optional opt_dim, Tensor& result) { checkDevice("torch.take_along_dim():", {self, indices, result}, self.device()); if (opt_dim.has_value()) { auto [self_broadcasted, indices_broadcasted, dim] = @@ -2241,7 +2241,7 @@ Tensor count_nonzero_cpu(const Tensor& self, IntArrayRef dims){ } -Tensor count_nonzero(const Tensor& self, c10::optional dim) { +Tensor count_nonzero(const Tensor& self, std::optional dim) { if (dim) { return at::count_nonzero(self, IntArrayRef{*dim}); } diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.h b/aten/src/ATen/native/TensorAdvancedIndexing.h index c1464092a8e28..7b02b4201ffaa 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.h +++ b/aten/src/ATen/native/TensorAdvancedIndexing.h @@ -13,8 +13,8 @@ struct TensorIterator; namespace at::native { -using index_put_with_sort_fn = void(*)(Tensor &, const 
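The index_put / _unsafe_index_put hunks above operate on a torch::List of optional Tensors, where an empty slot means "take the whole dimension". At the public C++ API level that corresponds, roughly, to torch::indexing; the sketch below is a hedged user-level illustration, not the internal dispatch path.

```cpp
#include <torch/torch.h>
#include <iostream>

int main() {
  using namespace torch::indexing;
  auto t   = torch::zeros({3, 4});
  auto col = torch::tensor({0, 2});   // int64 index tensor
  // Roughly t[:, [0, 2]] = 1.0 in Python: Slice() plays the role of the
  // "missing" optional entry, `col` that of a defined index tensor.
  t.index_put_({Slice(), col}, 1.0);
  std::cout << t << "\n";
}
```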
c10::List> &, const Tensor &, bool accumulate, bool unsafe); -using index_put_with_sort_quantized_fn = void(*)(Tensor& self, const c10::List>& indices, const Tensor& value, double scale, int zero_point, bool unsafe); +using index_put_with_sort_fn = void(*)(Tensor &, const c10::List> &, const Tensor &, bool accumulate, bool unsafe); +using index_put_with_sort_quantized_fn = void(*)(Tensor& self, const c10::List>& indices, const Tensor& value, double scale, int zero_point, bool unsafe); using gather_fn = void (*)(const Tensor & result, const Tensor & self, int64_t dim, const Tensor & index); using scatter_fn = void(*)(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src); using scatter_fill_fn = void(*)(const Tensor& self, int64_t dim, const Tensor& index, const Scalar& src); @@ -36,7 +36,7 @@ DECLARE_DISPATCH(scatter_reduce_fn, scatter_reduce_stub); DECLARE_DISPATCH(scatter_scalar_reduce_fn, scatter_scalar_reduce_stub); DECLARE_DISPATCH(scatter_reduce_two_fn, scatter_reduce_two_stub); -TORCH_API Tensor& index_out(Tensor& result, const Tensor & self, const c10::List>& indices); +TORCH_API Tensor& index_out(Tensor& result, const Tensor & self, const c10::List>& indices); using scatter_add_expanded_index_fn = void(*)(const Tensor&, const Tensor&, const Tensor&); using scatter_reduce_expanded_index_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const ReductionType& reduce, bool); diff --git a/aten/src/ATen/native/TensorAdvancedIndexingUtils.h b/aten/src/ATen/native/TensorAdvancedIndexingUtils.h index 7b9d1446a087b..e46be1f878f72 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexingUtils.h +++ b/aten/src/ATen/native/TensorAdvancedIndexingUtils.h @@ -21,7 +21,7 @@ static std::string shapes_as_str(TensorList tensors) { } } // anonymous namespace -static std::tuple canDispatchToMaskedFill(const Tensor& self, const torch::List>& indices, +static std::tuple canDispatchToMaskedFill(const Tensor& self, const torch::List>& indices, const Tensor& value){ if (!(value.numel() ==1 && value.device().is_cpu())){ return std::make_tuple(false,Tensor()); @@ -29,7 +29,7 @@ const Tensor& value){ int64_t num_ind = 0; Tensor mask; auto self_device = self.device(); - for (const c10::optional& i: indices) { + for (const std::optional& i: indices) { if (!i.has_value() || !(*i).defined()){ num_ind++; } else { diff --git a/aten/src/ATen/native/TensorCompare.cpp b/aten/src/ATen/native/TensorCompare.cpp index e9599b4898fcd..cbb79dfabc7eb 100644 --- a/aten/src/ATen/native/TensorCompare.cpp +++ b/aten/src/ATen/native/TensorCompare.cpp @@ -491,7 +491,7 @@ static void isin_sorting( if (assume_unique) { out.copy_(mask.slice(0, 0, elements.numel()).view_as(out)); } else { - out.copy_(at::index(mask, {c10::optional(unique_order)})); + out.copy_(at::index(mask, {std::optional(unique_order)})); } } @@ -746,27 +746,27 @@ TORCH_IMPL_FUNC(clamp_min_Tensor_out) } // Implements the "clip" alias for clamp -Tensor& clip_out(const Tensor& self, const c10::optional& min, const c10::optional& max, Tensor& result) { +Tensor& clip_out(const Tensor& self, const std::optional& min, const c10::optional& max, Tensor& result) { return at::clamp_outf(self, min, max, result); } -Tensor& clip_out(const Tensor& self, const c10::optional& min, const c10::optional& max, Tensor& result) { +Tensor& clip_out(const Tensor& self, const std::optional& min, const c10::optional& max, Tensor& result) { return at::clamp_outf(self, min, max, result); } -Tensor clip(const Tensor& self, const c10::optional& min, const c10::optional& 
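The clip hunks here only change the type of the optional bounds; behaviourally clip stays a thin alias that forwards to clamp, with min and max each independently optional. A short usage sketch of the public C++ API:

```cpp
#include <torch/torch.h>
#include <iostream>

int main() {
  auto t = torch::linspace(-2.0, 2.0, 5);
  auto a = torch::clamp(t, -1.0, 1.0);   // both bounds given
  auto b = torch::clip(t, -1.0, 1.0);    // alias of clamp, same result
  auto c = torch::clamp_max(t, 0.5);     // upper bound only
  std::cout << a << "\n" << b << "\n" << c << "\n";
}
```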
max) { +Tensor clip(const Tensor& self, const std::optional& min, const c10::optional& max) { return at::clamp(self, min, max); } -Tensor clip(const Tensor& self, const c10::optional& min, const c10::optional& max) { +Tensor clip(const Tensor& self, const std::optional& min, const c10::optional& max) { return at::clamp(self, min, max); } -Tensor& clip_(Tensor& self, const c10::optional& min, const c10::optional& max) { +Tensor& clip_(Tensor& self, const std::optional& min, const c10::optional& max) { return at::clamp_(self, min, max); } -Tensor& clip_(Tensor& self, const c10::optional& min, const c10::optional& max) { +Tensor& clip_(Tensor& self, const std::optional& min, const c10::optional& max) { return at::clamp_(self, min, max); } diff --git a/aten/src/ATen/native/TensorConversions.cpp b/aten/src/ATen/native/TensorConversions.cpp index c70da8334a5e9..dfb0fe4eb0a05 100644 --- a/aten/src/ATen/native/TensorConversions.cpp +++ b/aten/src/ATen/native/TensorConversions.cpp @@ -229,12 +229,12 @@ static inline optional ensure_has_index(optional device) { Tensor _to_copy( const Tensor& self, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, bool non_blocking, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { TORCH_CHECK(!layout.has_value() || self.layout() == layout.value(), "to(options) doesn't support converting to a different layout, " "but got self.layout being ", self.layout(), @@ -387,7 +387,7 @@ Tensor _to_copy( } template -static inline bool is_null_or_equal_to(const c10::optional& test, const T& value) { +static inline bool is_null_or_equal_to(const std::optional& test, const T& value) { if (!test.has_value()) { return true; } @@ -399,11 +399,11 @@ static inline bool is_null_or_equal_to(const c10::optional& test, const T& va // well. 
bool to_will_alias( const Tensor& self, - c10::optional dtype, - c10::optional layout, - c10::optional device, + std::optional dtype, + std::optional layout, + std::optional device, bool copy, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { auto memory_format = optional_memory_format.value_or(MemoryFormat::Preserve); return is_null_or_equal_to(dtype, self.dtype().toScalarType()) && @@ -416,13 +416,13 @@ bool to_will_alias( static inline Tensor to_impl( const Tensor& self, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, bool non_blocking, bool copy, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { // fast path if (to_will_alias(self, dtype, layout, device, copy, optional_memory_format)) { @@ -471,13 +471,13 @@ Tensor _autocast_to_full_precision(const Tensor& self, bool cuda_enabled, bool c Tensor to( const Tensor& self, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, bool non_blocking, bool copy, - c10::optional optional_memory_format + std::optional optional_memory_format ) { return to_impl( self, @@ -490,7 +490,7 @@ Tensor to( optional_memory_format); } -Tensor to(const Tensor& self, Device device, ScalarType dtype, bool non_blocking, bool copy, c10::optional optional_memory_format) { +Tensor to(const Tensor& self, Device device, ScalarType dtype, bool non_blocking, bool copy, std::optional optional_memory_format) { return to_impl( self, dtype, @@ -502,7 +502,7 @@ Tensor to(const Tensor& self, Device device, ScalarType dtype, bool non_blocking optional_memory_format); } -Tensor to(const Tensor& self, ScalarType dtype, bool non_blocking, bool copy, c10::optional optional_memory_format) { +Tensor to(const Tensor& self, ScalarType dtype, bool non_blocking, bool copy, std::optional optional_memory_format) { return to_impl( self, dtype, @@ -514,7 +514,7 @@ Tensor to(const Tensor& self, ScalarType dtype, bool non_blocking, bool copy, c1 optional_memory_format); } -Tensor to(const Tensor& self, const Tensor& other, bool non_blocking, bool copy, c10::optional optional_memory_format) { +Tensor to(const Tensor& self, const Tensor& other, bool non_blocking, bool copy, std::optional optional_memory_format) { auto options = other.options(); return to_impl( self, @@ -538,7 +538,7 @@ std::vector _to_cpu(TensorList tensors) { return cpu_tensors; } -Tensor to_dense_backward(const Tensor& grad, const Tensor& input_, c10::optional masked_grad_) { +Tensor to_dense_backward(const Tensor& grad, const Tensor& input_, std::optional masked_grad_) { /* For historical reasons, to_dense backward implements masked semantics for sparse tensors, that is, gradients with respect to @@ -598,7 +598,7 @@ Tensor to_mkldnn_backward(const Tensor& grad, const Tensor& input_) { return grad.to_dense(input_.scalar_type()); } -Tensor to_dense(const Tensor& tensor, c10::optional dtype, c10::optional masked_grad) { +Tensor to_dense(const Tensor& tensor, std::optional dtype, c10::optional masked_grad) { if (tensor.layout() == c10::kSparse) { return tensor._to_dense(dtype, masked_grad); } @@ -621,7 +621,7 @@ Tensor to_dense(const Tensor& tensor, c10::optional dtype, c10: return tensor; } -Tensor sparse_to_dense(const Tensor& self, c10::optional dtype, c10::optional 
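The _to_copy / to_will_alias / to_impl hunks above encode a fast path: when dtype, layout, device and memory format already match and copy is false, to() hands back self instead of materializing a new tensor. A hedged sketch of the behaviour as observed from the public API:

```cpp
#include <torch/torch.h>
#include <iostream>

int main() {
  auto t    = torch::arange(4, torch::kFloat32);
  auto same = t.to(torch::kFloat32);      // nothing to convert: may alias t
  auto copy = t.to(torch::kFloat32, /*non_blocking=*/false, /*copy=*/true);
  std::cout << std::boolalpha
            << (same.data_ptr() == t.data_ptr()) << "\n"   // true: aliased
            << (copy.data_ptr() == t.data_ptr()) << "\n";  // false: fresh storage
}
```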
masked) { +Tensor sparse_to_dense(const Tensor& self, std::optional dtype, c10::optional masked) { TORCH_CHECK( !dtype.has_value(), "dtype argument is not supported by sparse_to_dense"); Tensor dst = at::zeros(self.sizes(), self.options().layout(kStrided)); @@ -630,8 +630,8 @@ Tensor sparse_to_dense(const Tensor& self, c10::optional dtype, c10: Tensor sparse_compressed_to_dense( const Tensor& self, - c10::optional dtype, - c10::optional masked_grad) { + std::optional dtype, + std::optional masked_grad) { TORCH_CHECK( !dtype.has_value(), "dtype argument is not supported by sparse_csr_to_dense"); @@ -954,7 +954,7 @@ void _to_sparse_check_arguments(const std::string& funcname, const Tensor& self, } static inline -void _to_sparse_check_arguments(const std::string& funcname, const Tensor& self, c10::optional layout, OptionalIntArrayRef blocksize, c10::optional dense_dim_opt) { +void _to_sparse_check_arguments(const std::string& funcname, const Tensor& self, std::optional layout, OptionalIntArrayRef blocksize, c10::optional dense_dim_opt) { auto layout_from = self.layout(); auto layout_to = layout.value_or(kSparse); @@ -1036,7 +1036,7 @@ void _to_sparse_check_arguments(const std::string& funcname, const Tensor& self, } template -static Tensor dense_to_sparse_compressed(const Tensor& self, const Tensor& self_mask, IntArrayRef blocksize, c10::optional dense_dim_opt) { +static Tensor dense_to_sparse_compressed(const Tensor& self, const Tensor& self_mask, IntArrayRef blocksize, std::optional dense_dim_opt) { static_assert(target_layout == Layout::SparseCsr || target_layout == Layout::SparseCsc || target_layout == Layout::SparseBsr || target_layout == Layout::SparseBsc, "invalid layout template parameter for dense_to_sparse_compressed"); @@ -1109,7 +1109,7 @@ static Tensor dense_to_sparse_compressed(const Tensor& self, const Tensor& self_ self.options().layout(target_layout)); } -Tensor dense_to_sparse_with_mask(const Tensor& self, const Tensor& mask, c10::optional layout, OptionalIntArrayRef blocksize, c10::optional dense_dim_opt) { +Tensor dense_to_sparse_with_mask(const Tensor& self, const Tensor& mask, std::optional layout, OptionalIntArrayRef blocksize, c10::optional dense_dim_opt) { auto layout_to = layout.value_or(kSparse); TORCH_INTERNAL_ASSERT(self.layout() != layout_to, "dense_to_sparse: unexpected same input and output layout"); TORCH_INTERNAL_ASSERT(self.layout() == mask.layout(), @@ -1137,35 +1137,35 @@ Tensor dense_to_sparse_with_mask(const Tensor& self, const Tensor& mask, c10::op return Tensor{}; } -Tensor dense_to_sparse_csr(const Tensor& self, c10::optional dense_dim_opt) { +Tensor dense_to_sparse_csr(const Tensor& self, std::optional dense_dim_opt) { auto layout_to = kSparseCsr; _to_sparse_check_arguments("dense_to_sparse_csr", self, layout_to, {}, dense_dim_opt); return dense_to_sparse_compressed(self, self != 0, {}, dense_dim_opt); } -Tensor dense_to_sparse_csc(const Tensor& self, c10::optional dense_dim_opt) { +Tensor dense_to_sparse_csc(const Tensor& self, std::optional dense_dim_opt) { auto layout_to = kSparseCsc; _to_sparse_check_arguments("dense_to_sparse_csc", self, layout_to, {}, dense_dim_opt); return dense_to_sparse_compressed(self, self != 0, {}, dense_dim_opt); } -Tensor dense_to_sparse_bsr(const Tensor& self, IntArrayRef blocksize, c10::optional dense_dim_opt) { +Tensor dense_to_sparse_bsr(const Tensor& self, IntArrayRef blocksize, std::optional dense_dim_opt) { auto layout_to = kSparseBsr; _to_sparse_check_arguments("dense_to_sparse_bsr", self, layout_to, blocksize, 
dense_dim_opt); return dense_to_sparse_compressed(self, self != 0, blocksize, dense_dim_opt); } -Tensor dense_to_sparse_bsc(const Tensor& self, IntArrayRef blocksize, c10::optional dense_dim_opt) { +Tensor dense_to_sparse_bsc(const Tensor& self, IntArrayRef blocksize, std::optional dense_dim_opt) { auto layout_to = kSparseBsc; _to_sparse_check_arguments("dense_to_sparse_bsc", self, layout_to, blocksize, dense_dim_opt); return dense_to_sparse_compressed(self, self != 0, blocksize, dense_dim_opt); } -Tensor dense_to_sparse(const Tensor& self, c10::optional layout, OptionalIntArrayRef blocksize, c10::optional dense_dim_opt) { +Tensor dense_to_sparse(const Tensor& self, std::optional layout, OptionalIntArrayRef blocksize, c10::optional dense_dim_opt) { auto layout_to = layout.value_or(kSparse); TORCH_INTERNAL_ASSERT(self.layout() != layout_to, "dense_to_sparse: unexpected same input and output layout"); _to_sparse_check_arguments("dense_to_sparse", self, layout, blocksize, dense_dim_opt); @@ -1234,7 +1234,7 @@ Tensor dense_to_sparse(const Tensor& self, int64_t sparse_dim) { static Tensor sparse_compressed_to_flipped( const Tensor& self, - c10::optional blocksize, + std::optional blocksize, const std::string& name) { const auto layout = self.layout(); // NOTE: errors on non-compressed sparse layouts. @@ -1435,7 +1435,7 @@ static Tensor sparse_compressed_to_flipped( self.options().layout(flipped_layout)); } -Tensor sparse_compressed_to_sparse_csr(const Tensor& self, c10::optional dense_dim_opt) { +Tensor sparse_compressed_to_sparse_csr(const Tensor& self, std::optional dense_dim_opt) { auto layout_to = kSparseCsr; TORCH_INTERNAL_ASSERT(self.layout() != layout_to, "sparse_compressed_to_sparse_csr: unexpected same input and output layout"); _to_sparse_check_arguments("sparse_compressed_to_sparse_csr", self, layout_to, {}, dense_dim_opt); @@ -1448,7 +1448,7 @@ Tensor sparse_compressed_to_sparse_csr(const Tensor& self, c10::optional dense_dim_opt) { +Tensor sparse_compressed_to_sparse_csc(const Tensor& self, std::optional dense_dim_opt) { auto layout_to = kSparseCsc; TORCH_INTERNAL_ASSERT(self.layout() != layout_to, "sparse_compressed_to_sparse_csc: unexpected same input and output layout"); _to_sparse_check_arguments("sparse_compressed_to_sparse_csc", self, layout_to, {}, dense_dim_opt); @@ -1461,7 +1461,7 @@ Tensor sparse_compressed_to_sparse_csc(const Tensor& self, c10::optional dense_dim_opt) { +Tensor coo_to_sparse_csr(const Tensor& self, std::optional dense_dim_opt) { auto layout_to = kSparseCsr; _to_sparse_check_arguments("coo_to_sparse_csr", self, layout_to, {}, dense_dim_opt); @@ -1480,7 +1480,7 @@ Tensor coo_to_sparse_csr(const Tensor& self, c10::optional dense_dim_op coalesced_self.device()); } -Tensor coo_to_sparse_csc(const Tensor& self, c10::optional dense_dim_opt) { +Tensor coo_to_sparse_csc(const Tensor& self, std::optional dense_dim_opt) { auto layout_to = kSparseCsc; _to_sparse_check_arguments("coo_to_sparse_csc", self, layout_to, {}, dense_dim_opt); @@ -1495,14 +1495,14 @@ Tensor coo_to_sparse_csc(const Tensor& self, c10::optional dense_dim_op transposed_csr.device()); } -Tensor coo_to_sparse_bsr(const Tensor& self, IntArrayRef blocksize, c10::optional dense_dim_opt) { +Tensor coo_to_sparse_bsr(const Tensor& self, IntArrayRef blocksize, std::optional dense_dim_opt) { auto layout_to = kSparseBsr; _to_sparse_check_arguments("coo_to_sparse_bsr", self, layout_to, blocksize, dense_dim_opt); return self.to_sparse_csr(dense_dim_opt).to_sparse_bsr(blocksize); } -Tensor 
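The dense_to_sparse_* and coo_to_sparse_* kernels above back the public layout-conversion methods. A hedged round-trip sketch at the user level:

```cpp
#include <torch/torch.h>
#include <iostream>

int main() {
  auto dense = torch::eye(3);            // mostly zeros, a natural sparse candidate
  auto coo   = dense.to_sparse();        // COO layout
  auto csr   = dense.to_sparse_csr();    // compressed sparse row layout
  std::cout << coo.layout() << " " << csr.layout() << "\n";
  std::cout << csr.to_dense().equal(dense) << "\n";  // 1: lossless round trip
}
```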
coo_to_sparse_bsc(const Tensor& self, IntArrayRef blocksize, c10::optional dense_dim_opt) { +Tensor coo_to_sparse_bsc(const Tensor& self, IntArrayRef blocksize, std::optional dense_dim_opt) { auto layout_to = kSparseBsc; _to_sparse_check_arguments("coo_to_sparse_bsc", self, layout_to, blocksize, dense_dim_opt); @@ -1814,7 +1814,7 @@ Tensor _compressed_to_block_compressed_cpu(const Tensor& self, IntArrayRef block self.options().layout(target_layout)); } -Tensor sparse_compressed_to_sparse_bsr(const Tensor& self, IntArrayRef blocksize, c10::optional dense_dim_opt) { +Tensor sparse_compressed_to_sparse_bsr(const Tensor& self, IntArrayRef blocksize, std::optional dense_dim_opt) { auto layout_to = kSparseBsr; TORCH_INTERNAL_ASSERT(self.layout() != layout_to, "sparse_compressed_to_sparse_bsr: unexpected same input and output layout"); _to_sparse_check_arguments("sparse_compressed_to_sparse_bsr", self, layout_to, blocksize, dense_dim_opt); @@ -1836,7 +1836,7 @@ Tensor sparse_compressed_to_sparse_bsr(const Tensor& self, IntArrayRef blocksize return Tensor{}; } -Tensor sparse_compressed_to_sparse_bsc(const Tensor& self, IntArrayRef blocksize, c10::optional dense_dim_opt) { +Tensor sparse_compressed_to_sparse_bsc(const Tensor& self, IntArrayRef blocksize, std::optional dense_dim_opt) { auto layout_to = kSparseBsc; TORCH_INTERNAL_ASSERT(self.layout() != layout_to, "sparse_compressed_to_sparse_bsc: unexpected same input and output layout"); _to_sparse_check_arguments("sparse_compressed_to_sparse_bsc", self, layout_to, blocksize, dense_dim_opt); @@ -1909,7 +1909,7 @@ Tensor sparse_compressed_to_sparse(const Tensor& self, const int64_t sparse_dim) return at::native::_sparse_coo_tensor_unsafe(indices, values, self.sizes())._coalesced_(coalesced); } -Tensor sparse_compressed_to_sparse(const Tensor& self, c10::optional layout, OptionalIntArrayRef blocksize, c10::optional dense_dim_opt) { +Tensor sparse_compressed_to_sparse(const Tensor& self, std::optional layout, OptionalIntArrayRef blocksize, c10::optional dense_dim_opt) { auto layout_to = layout.value_or(kSparse); TORCH_INTERNAL_ASSERT(self.layout() != layout_to, "sparse_compressed_to_sparse: unexpected same input and output layout"); _to_sparse_check_arguments("sparse_compressed_to_sparse", self, layout_to, blocksize, dense_dim_opt); @@ -1936,7 +1936,7 @@ Tensor sparse_compressed_to_sparse(const Tensor& self, c10::optional layout, OptionalIntArrayRef blocksize, c10::optional dense_dim_opt) { +Tensor sparse_coo_to_sparse(const Tensor& self, std::optional layout, OptionalIntArrayRef blocksize, c10::optional dense_dim_opt) { auto layout_to = layout.value_or(kSparse); TORCH_INTERNAL_ASSERT(self.layout() != layout_to, "sparse_coo_to_sparse: unexpected same input and output layout"); _to_sparse_check_arguments("sparse_coo_to_sparse", self, layout_to, blocksize, dense_dim_opt); @@ -1969,7 +1969,7 @@ Tensor to_sparse(const Tensor& self, const int64_t sparse_dim) { return self._to_sparse(sparse_dim); } -Tensor to_sparse(const Tensor& self, c10::optional layout, OptionalIntArrayRef blocksize, c10::optional dense_dim_opt) { +Tensor to_sparse(const Tensor& self, std::optional layout, OptionalIntArrayRef blocksize, c10::optional dense_dim_opt) { auto layout_to = layout.value_or(kSparse); if (self.layout() == layout_to) { _to_sparse_check_arguments("to_sparse", self, layout, blocksize, dense_dim_opt); @@ -1978,7 +1978,7 @@ Tensor to_sparse(const Tensor& self, c10::optional layout, Optional return self._to_sparse(layout, blocksize, dense_dim_opt); } -Tensor 
to_sparse_csr(const Tensor& self, c10::optional dense_dim_opt) { +Tensor to_sparse_csr(const Tensor& self, std::optional dense_dim_opt) { auto layout_to = kSparseCsr; if (self.layout() == layout_to) { _to_sparse_check_arguments("to_sparse_csr", self, layout_to, {}, dense_dim_opt); @@ -1987,7 +1987,7 @@ Tensor to_sparse_csr(const Tensor& self, c10::optional dense_dim_opt) { return self._to_sparse_csr(dense_dim_opt); } -Tensor to_sparse_csc(const Tensor& self, c10::optional dense_dim_opt) { +Tensor to_sparse_csc(const Tensor& self, std::optional dense_dim_opt) { auto layout_to = kSparseCsc; if (self.layout() == layout_to) { _to_sparse_check_arguments("to_sparse_csc", self, layout_to, {}, dense_dim_opt); @@ -1996,7 +1996,7 @@ Tensor to_sparse_csc(const Tensor& self, c10::optional dense_dim_opt) { return self._to_sparse_csc(dense_dim_opt); } -Tensor to_sparse_bsr(const Tensor& self, IntArrayRef blocksize, c10::optional dense_dim_opt) { +Tensor to_sparse_bsr(const Tensor& self, IntArrayRef blocksize, std::optional dense_dim_opt) { auto layout_to = kSparseBsr; if (self.layout() == layout_to) { _to_sparse_check_arguments("to_sparse_bsr", self, layout_to, blocksize, dense_dim_opt); @@ -2005,7 +2005,7 @@ Tensor to_sparse_bsr(const Tensor& self, IntArrayRef blocksize, c10::optional dense_dim_opt) { +Tensor to_sparse_bsc(const Tensor& self, IntArrayRef blocksize, std::optional dense_dim_opt) { auto layout_to = kSparseBsc; if (self.layout() == layout_to) { _to_sparse_check_arguments("to_sparse_bsc", self, layout_to, blocksize, dense_dim_opt); @@ -2026,7 +2026,7 @@ Tensor to_meta(const Tensor& tensor) { } return out; } -c10::optional to_meta(const c10::optional& tensor) { +std::optional to_meta(const c10::optional& tensor) { if (tensor.has_value()) { return to_meta(*tensor); } diff --git a/aten/src/ATen/native/TensorConversions.h b/aten/src/ATen/native/TensorConversions.h index fa0d58f3c1299..0e2fd30c288ce 100644 --- a/aten/src/ATen/native/TensorConversions.h +++ b/aten/src/ATen/native/TensorConversions.h @@ -11,16 +11,16 @@ namespace at { namespace native { bool to_will_alias( const Tensor& self, - c10::optional dtype, - c10::optional layout, - c10::optional device, + std::optional dtype, + std::optional layout, + std::optional device, bool copy, - c10::optional optional_memory_format); + std::optional optional_memory_format); Tensor to_meta(const Tensor& tensor); -c10::optional to_meta(const c10::optional& tensor); +std::optional to_meta(const c10::optional& tensor); std::vector to_meta(at::ITensorListRef t_list); -Tensor dense_to_sparse_with_mask(const Tensor& self, const Tensor& mask, c10::optional layout, OptionalIntArrayRef blocksize, c10::optional dense_dim_opt); +Tensor dense_to_sparse_with_mask(const Tensor& self, const Tensor& mask, std::optional layout, OptionalIntArrayRef blocksize, c10::optional dense_dim_opt); } // namespace native } // namespace at diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index c8fddc3756353..195a792600f9b 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -133,18 +133,18 @@ DEFINE_DISPATCH(polar_stub); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ arange ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tensor arange(const Scalar& end, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return native::arange(/*start=*/0, end, dtype, 
layout, device, pin_memory); } Tensor arange(const Scalar& start, const Scalar& end, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return native::arange( start, end, /*step=*/1, dtype, layout, device, pin_memory); } @@ -153,10 +153,10 @@ Tensor arange( const Scalar& start, const Scalar& end, const Scalar& step, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -252,8 +252,8 @@ Tensor polar(const Tensor& abs, const Tensor& angle) { } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ empty ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Tensor empty_cpu(IntArrayRef size, c10::optional dtype_opt, c10::optional layout_opt, - c10::optional device_opt, c10::optional pin_memory_opt, c10::optional memory_format_opt) { +Tensor empty_cpu(IntArrayRef size, std::optional dtype_opt, c10::optional layout_opt, + std::optional device_opt, c10::optional pin_memory_opt, c10::optional memory_format_opt) { Tensor result = at::detail::empty_cpu(size, dtype_opt, layout_opt, device_opt, pin_memory_opt, memory_format_opt); // See Note [Enabling Deterministic Operations] if (C10_UNLIKELY(at::globalContext().deterministicAlgorithms() && at::globalContext().deterministicFillUninitializedMemory())) { @@ -264,11 +264,11 @@ Tensor empty_cpu(IntArrayRef size, c10::optional dtype_opt, c10::opt Tensor empty_names( IntArrayRef size, - c10::optional names, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, + std::optional names, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, optional optional_memory_format) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -285,8 +285,8 @@ Tensor empty_names( return result; } -Tensor empty_permuted_symint(SymIntArrayRef size, IntArrayRef physical_layout, c10::optional dtype_opt, - c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt +Tensor empty_permuted_symint(SymIntArrayRef size, IntArrayRef physical_layout, std::optional dtype_opt, + std::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt ) { // size is logical; aka, the output size you'll get from the operation overall // @@ -324,8 +324,8 @@ Tensor empty_permuted_symint(SymIntArrayRef size, IntArrayRef physical_layout, c return phys_tensor.as_strided_symint(size, strides); } -Tensor empty_strided_cpu(IntArrayRef size, IntArrayRef stride, c10::optional dtype_opt, - c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) { +Tensor empty_strided_cpu(IntArrayRef size, IntArrayRef stride, std::optional dtype_opt, + std::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) { Tensor result = at::detail::empty_strided_cpu(size, stride, dtype_opt, layout_opt, device_opt, pin_memory_opt); // See Note [Enabling Deterministic Operations] if (C10_UNLIKELY(at::globalContext().deterministicAlgorithms() && at::globalContext().deterministicFillUninitializedMemory())) { @@ -335,7 +335,7 @@ 
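The arange hunks above show the "[Note: hacky wrapper removal for TensorOptions]" pattern that recurs through the rest of TensorFactories.cpp: the dispatcher hands the kernel four separate optionals, and the kernel folds them back into a TensorOptions through the optional-accepting setters. The sketch below mirrors that shape; filled_like is a made-up name, and it assumes a libtorch build in which c10::optional is already an alias of std::optional (the direction this patch series is moving in), so std::optional arguments bind to the existing setters.

```cpp
#include <torch/torch.h>
#include <iostream>
#include <optional>

// Hypothetical helper mirroring the factory-kernel calling convention above.
torch::Tensor filled_like(const torch::Tensor& self, double fill,
                          std::optional<at::ScalarType> dtype,
                          std::optional<at::Layout> layout,
                          std::optional<at::Device> device,
                          std::optional<bool> pin_memory) {
  // Fold the four optionals back into a TensorOptions, as the kernels do;
  // fields left unset fall back to self's options via merge_in.
  auto opts = torch::TensorOptions()
                  .dtype(dtype)
                  .layout(layout)
                  .device(device)
                  .pinned_memory(pin_memory);
  return torch::full_like(self, fill, self.options().merge_in(opts));
}

int main() {
  auto x = torch::ones({2, 2});
  auto y = filled_like(x, 3.0, torch::kFloat64,
                       std::nullopt, std::nullopt, std::nullopt);
  std::cout << y << "\n";
}
```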
Tensor empty_strided_cpu(IntArrayRef size, IntArrayRef stride, c10::optional optional_memory_format, + std::optional optional_memory_format, Tensor& result) { // Preferably, this argument would not be accepted by _out, but the code // generator requires the out and non-out overloads to match exactly @@ -377,11 +377,11 @@ C10_DIAGNOSTIC_POP() Tensor empty_like( const Tensor& self, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional optional_memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional optional_memory_format) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options_ = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -430,11 +430,11 @@ Tensor empty_like( Tensor empty_like_quantized( const Tensor& self, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional optional_memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional optional_memory_format) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options_ = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -458,7 +458,7 @@ Tensor empty_like_quantized( // TODO: To support all features of MemoryFormat::Preserve we need to add // _empty_affine_quantized_strided function and use it similarly to - // Tensor clone(const Tensor& src, c10::optional optional_memory_format) + // Tensor clone(const Tensor& src, std::optional optional_memory_format) // if (self.is_non_overlapping_and_dense()) -> _empty_affine_quantized_strided if (memory_format == MemoryFormat::Preserve) { memory_format = self.suggest_memory_format(); @@ -508,10 +508,10 @@ Tensor empty_like_quantized( Tensor new_empty_symint( const Tensor& self, SymIntArrayRef size, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt ) { auto dtype = dtype_opt.has_value() ? dtype_opt : optTypeMetaToScalarType(self.options().dtype_opt()); auto layout = layout_opt.has_value() ? 
layout_opt : self.options().layout_opt(); @@ -524,10 +524,10 @@ Tensor new_empty_strided_symint( const Tensor& self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory ) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -538,19 +538,19 @@ Tensor new_empty_strided_symint( // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ eye ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tensor eye(int64_t n, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // the default value of `m` equals to `n` return at::eye(n, n, dtype, layout, device, pin_memory); } Tensor eye(int64_t n, int64_t m, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -614,10 +614,10 @@ TensorOptions infer_full_options( } // anonymous namespace Tensor full(IntArrayRef size, const Scalar& fill_value, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -639,11 +639,11 @@ Tensor& full_out(IntArrayRef size, const Scalar& fill_value, Tensor& result) { Tensor full_like( const Tensor& self, const Scalar& fill_value, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional optional_memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional optional_memory_format) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -655,10 +655,10 @@ Tensor new_full( const Tensor& self, IntArrayRef size, const Scalar& fill_value, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory ) { Tensor r = self.new_empty(size, TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory)); @@ -693,10 +693,10 @@ Tensor linspace( const Scalar& start, const Scalar& end, int64_t steps, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -710,10 +710,10 @@ Tensor linspace( const Tensor& start, const Tensor& end, int64_t steps, - c10::optional dtype, - c10::optional layout, - c10::optional device, 
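new_empty_symint and new_full above fill each unset optional from self's options, which is why the new_* methods inherit dtype and device by default unless the caller overrides them. Observable from the public API:

```cpp
#include <torch/torch.h>
#include <iostream>

int main() {
  auto self = torch::zeros({2, 2}, torch::kFloat64);
  auto a = self.new_full({3}, 7);                   // inherits Float64 from self
  auto b = self.new_full({3}, 7, torch::kInt32);    // explicit dtype wins
  std::cout << a.scalar_type() << " " << b.scalar_type() << "\n";  // Double Int
}
```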
- c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { TORCH_CHECK(start.dim() == 0 && end.dim() == 0, "linspace only supports 0-dimensional start and end tensors, " "but got start with ", start.dim(), " dimension(s) and end with ", end.dim()," dimension(s)."); return at::linspace(start.item(), end.item(), steps, dtype, layout, device, pin_memory); @@ -723,10 +723,10 @@ Tensor linspace( const Tensor& start, const Scalar& end, int64_t steps, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { TORCH_CHECK(start.dim() == 0, "linspace only supports 0-dimensional start and end tensors, " "but got start with ", start.dim(), " dimension(s)."); return at::linspace(start.item(), end, steps, dtype, layout, device, pin_memory); @@ -736,10 +736,10 @@ Tensor linspace( const Scalar& start, const Tensor& end, int64_t steps, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { TORCH_CHECK(end.dim() == 0, "linspace only supports 0-dimensional start and end tensors, " "but got end with ", end.dim()," dimension(s)."); return at::linspace(start, end.item(), steps, dtype, layout, device, pin_memory); @@ -752,10 +752,10 @@ Tensor logspace( const Scalar& end, int64_t steps, double base, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -770,10 +770,10 @@ Tensor logspace( const Tensor& end, int64_t steps, double base, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { TORCH_CHECK(start.dim() == 0 && end.dim() == 0, "logspace only supports 0-dimensional start and end tensors, " "but got start with ", start.dim(), " dimension(s) and end with ", end.dim()," dimension(s)."); return at::logspace(start.item(), end.item(), steps, base, dtype, layout, device, pin_memory); @@ -784,10 +784,10 @@ Tensor logspace( const Scalar& end, int64_t steps, double base, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { TORCH_CHECK(start.dim() == 0, "logspace only supports 0-dimensional start and end tensors, " "but got start with ", start.dim(), " dimension(s)."); return at::logspace(start.item(), end, steps, base, dtype, layout, device, pin_memory); @@ -798,10 +798,10 @@ Tensor logspace( const Tensor& end, int64_t steps, double base, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { TORCH_CHECK(end.dim() == 0, "logspace only supports 0-dimensional start and end tensors, " "but got end with ", end.dim()," dimension(s)."); return at::logspace(start, end.item(), steps, base, dtype, layout, device, 
pin_memory); @@ -810,10 +810,10 @@ Tensor logspace( // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ones ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tensor ones(IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return native::full(size, /*fill_value=*/1., dtype, layout, device, pin_memory); } @@ -823,11 +823,11 @@ Tensor& ones_out(IntArrayRef size, Tensor& result) { Tensor ones_like( const Tensor& self, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional optional_memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional optional_memory_format) { auto result = at::empty_like(self, dtype, layout, device, pin_memory, optional_memory_format); return result.fill_(1.); } @@ -835,10 +835,10 @@ Tensor ones_like( Tensor new_ones( const Tensor& self, IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] Tensor r = self.new_empty(size, TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory)); r.fill_(1.); @@ -848,10 +848,10 @@ Tensor new_ones( // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ scalar_tensor ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tensor scalar_tensor(const Scalar& s, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -874,18 +874,18 @@ Tensor scalar_tensor(const Scalar& s, // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ rand ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tensor rand(IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { - return native::rand(size, static_cast>(c10::nullopt), dtype, layout, device, pin_memory); -} - -Tensor rand(IntArrayRef size, c10::optional generator, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { + return native::rand(size, static_cast>(c10::nullopt), dtype, layout, device, pin_memory); +} + +Tensor rand(IntArrayRef size, std::optional generator, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -897,18 +897,18 @@ Tensor& rand_out(IntArrayRef size, Tensor& result) { return native::rand_out(size, c10::nullopt, result); } -Tensor& rand_out(IntArrayRef size, c10::optional generator, Tensor& result) { +Tensor& rand_out(IntArrayRef size, std::optional generator, Tensor& result) { result.resize_(size); return result.uniform_(0, 1, std::move(generator)); } Tensor rand_like( const Tensor& self, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional 
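The rand overloads above differ only in whether an explicit Generator is threaded through; when the optional is left as nullopt, the global default generator is used, which is what torch::manual_seed seeds. A short reproducibility sketch:

```cpp
#include <torch/torch.h>
#include <iostream>

int main() {
  torch::manual_seed(42);
  auto a = torch::rand({2, 2});   // generator optional unset -> default generator
  torch::manual_seed(42);
  auto b = torch::rand({2, 2});
  std::cout << torch::allclose(a, b) << "\n";  // 1: same seed, same stream
}
```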
optional_memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional optional_memory_format) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -919,21 +919,21 @@ Tensor rand_like( // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ randint ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tensor randint(int64_t high, IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return native::randint(high, size, c10::nullopt /* generator*/, dtype, layout, device, pin_memory); } Tensor randint( int64_t high, IntArrayRef size, - c10::optional generator, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional generator, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return native::randint(0, high, size, std::move(generator), dtype, layout, device, pin_memory); } @@ -941,10 +941,10 @@ Tensor randint( int64_t low, int64_t high, IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return native::randint(low, high, size, c10::nullopt, dtype, layout, device, pin_memory); } @@ -952,11 +952,11 @@ Tensor randint( int64_t low, int64_t high, IntArrayRef size, - c10::optional generator, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional generator, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -970,7 +970,7 @@ Tensor& randint_out(int64_t high, IntArrayRef size, Tensor& result) { Tensor& randint_out(int64_t high, IntArrayRef size, - c10::optional generator, + std::optional generator, Tensor& result) { result.resize_(size); return result.random_(0, high, std::move(generator)); @@ -983,7 +983,7 @@ Tensor& randint_out(int64_t low, int64_t high, IntArrayRef size, Tensor& result) Tensor& randint_out(int64_t low, int64_t high, IntArrayRef size, - c10::optional generator, + std::optional generator, Tensor& result) { result.resize_(size); return result.random_(low, high, std::move(generator)); @@ -992,11 +992,11 @@ Tensor& randint_out(int64_t low, Tensor randint_like( const Tensor& self, int64_t high, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional optional_memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional optional_memory_format) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -1008,11 +1008,11 @@ Tensor randint_like( const Tensor& self, int64_t low, int64_t high, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional optional_memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + 
std::optional pin_memory, + std::optional optional_memory_format) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -1023,18 +1023,18 @@ Tensor randint_like( // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ randn ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tensor randn(IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { - return native::randn(size, static_cast>(c10::nullopt), dtype, layout, device, pin_memory); -} - -Tensor randn(IntArrayRef size, c10::optional generator, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { + return native::randn(size, static_cast>(c10::nullopt), dtype, layout, device, pin_memory); +} + +Tensor randn(IntArrayRef size, std::optional generator, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -1046,17 +1046,17 @@ Tensor& randn_out(IntArrayRef size, Tensor& result) { return native::randn_out(size, c10::nullopt, result); } -Tensor& randn_out(IntArrayRef size, c10::optional generator, Tensor& result) { +Tensor& randn_out(IntArrayRef size, std::optional generator, Tensor& result) { result.resize_(size); return result.normal_(0, 1, std::move(generator)); } Tensor normal(double mean, double std, IntArrayRef size, - c10::optional generator, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional generator, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -1065,18 +1065,18 @@ Tensor normal(double mean, double std, IntArrayRef size, } Tensor& normal_out(double mean, double std, - IntArrayRef size, c10::optional generator, Tensor& result) { + IntArrayRef size, std::optional generator, Tensor& result) { result.resize_(size); return result.normal_(mean, std, std::move(generator)); } Tensor randn_like( const Tensor& self, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional optional_memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional optional_memory_format) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -1113,18 +1113,18 @@ void randperm_cpu(Tensor& result, int64_t n, CPUGeneratorImpl* generator) { } // namespace Tensor randperm(int64_t n, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return native::randperm(n, c10::nullopt, dtype, layout, device, pin_memory); } -Tensor randperm(int64_t n, c10::optional generator, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { +Tensor randperm(int64_t n, std::optional 
generator, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { if (!dtype.has_value()) { dtype = ScalarType::Long; } @@ -1140,7 +1140,7 @@ Tensor& randperm_out(int64_t n, Tensor& result) { return at::randperm_out(result, n, c10::nullopt); } -Tensor& randperm_out_cpu(int64_t n, c10::optional generator, Tensor& result) { +Tensor& randperm_out_cpu(int64_t n, std::optional generator, Tensor& result) { TORCH_CHECK(n >= 0, "n must be non-negative, got", n); TORCH_CHECK(!generator.has_value() || (generator.has_value() && result.device() == generator->device()), "Expected a '", result.device(), "' generator device but found '", generator->device(), "'"); check_supported_max_int_with_precision(n, result); @@ -1161,10 +1161,10 @@ Tensor range( const Scalar& start, const Scalar& end, const Scalar& step, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -1175,18 +1175,18 @@ Tensor range( Tensor range( const Scalar& start, const Scalar& end, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return at::native::range(start, end, 1, dtype, layout, device, pin_memory); } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ triangle ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tensor tril_indices_cpu( - int64_t row, int64_t col, int64_t offset, c10::optional dtype_opt, - c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) { + int64_t row, int64_t col, int64_t offset, std::optional dtype_opt, + std::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) { if (!dtype_opt.has_value()) { dtype_opt = ScalarType::Long; } @@ -1235,8 +1235,8 @@ Tensor tril_indices_cpu( } Tensor triu_indices_cpu( - int64_t row, int64_t col, int64_t offset, c10::optional dtype_opt, - c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) { + int64_t row, int64_t col, int64_t offset, std::optional dtype_opt, + std::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) { if (!dtype_opt.has_value()) { dtype_opt = ScalarType::Long; } @@ -1278,10 +1278,10 @@ Tensor triu_indices_cpu( // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ zeros ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ static Tensor zeros_sparse_compressed_symint(c10::SymIntArrayRef size, - c10::optional dtype, + std::optional dtype, Layout layout, - c10::optional device, - c10::optional pin_memory) { + std::optional device, + std::optional pin_memory) { check_size_nonnegative(size); TORCH_CHECK(size.size() >= 2, "torch.zeros: Only batched sparse compressed (non-block) tensors are supported, but got size ", size); auto size_ = C10_AS_INTARRAYREF_SLOW(size); @@ -1312,10 +1312,10 @@ static Tensor zeros_sparse_compressed_symint(c10::SymIntArrayRef size, } Tensor zeros_symint(SymIntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { Layout layout_ = layout.value_or(Layout::Strided); if (at::sparse_csr::is_sparse_compressed(layout_)) { return 
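The randperm hunk above shows the dtype optional being defaulted to Long when the caller does not supply one; an explicit dtype overrides that default. Visible from the public API:

```cpp
#include <torch/torch.h>
#include <iostream>

int main() {
  auto p = torch::randperm(5);                  // dtype optional unset -> Long
  auto q = torch::randperm(5, torch::kInt32);   // explicit dtype overrides
  std::cout << p.scalar_type() << " " << q.scalar_type() << "\n";  // Long Int
}
```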
zeros_sparse_compressed_symint(size, dtype, layout_, device, pin_memory); @@ -1327,10 +1327,10 @@ Tensor zeros_symint(SymIntArrayRef size, } Tensor _efficientzerotensor(IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { auto device_ = device_or_default(device); auto allocator = at::native::ZeroTensorAllocator(device_); auto dtype_ = dtype_or_default(dtype); @@ -1340,10 +1340,10 @@ Tensor _efficientzerotensor(IntArrayRef size, } Tensor _efficientzerotensor_meta_symint(SymIntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { auto device_ = device_or_default(device); auto allocator = at::native::ZeroTensorAllocator(device_); auto dtype_ = dtype_or_default(dtype); @@ -1372,11 +1372,11 @@ Tensor& zeros_out(IntArrayRef size, Tensor& result) { Tensor zeros_like( const Tensor& self, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional optional_memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional optional_memory_format) { // See [Note: hacky wrapper removal for TensorOptions] auto other_options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); // Prefer values passed in explicitly, but default to value from self. @@ -1423,10 +1423,10 @@ Tensor zeros_like( Tensor new_zeros( const Tensor& self, IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory ) { Tensor r = self.new_empty(size, TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory)); r.zero_(); @@ -1436,10 +1436,10 @@ Tensor new_zeros( // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ bartlett_window ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tensor bartlett_window(int64_t window_length, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return native::bartlett_window( window_length, /*periodic=*/true, dtype, layout, device, pin_memory); } @@ -1447,10 +1447,10 @@ Tensor bartlett_window(int64_t window_length, Tensor bartlett_window( int64_t window_length, bool periodic, - c10::optional dtype_opt, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype_opt, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] ScalarType dtype = c10::dtype_or_default(dtype_opt); TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -1475,10 +1475,10 @@ Tensor bartlett_window( // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ blackman_window ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tensor blackman_window(int64_t window_length, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return native::blackman_window( window_length, /*periodic=*/true, dtype, 
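The window-function hunks above (bartlett_window here, blackman/hamming/hann below) all share the same shape: the single-argument overload forwards to the full one with periodic=true. A hedged usage sketch:

```cpp
#include <torch/torch.h>
#include <iostream>

int main() {
  auto periodic  = torch::bartlett_window(8);                     // periodic=true
  auto symmetric = torch::bartlett_window(8, /*periodic=*/false);
  std::cout << periodic.size(0) << " " << symmetric.size(0) << "\n";  // 8 8
  std::cout << symmetric << "\n";  // symmetric window is zero at both endpoints
}
```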
layout, device, pin_memory); } @@ -1486,10 +1486,10 @@ Tensor blackman_window(int64_t window_length, Tensor blackman_window( int64_t window_length, bool periodic, - c10::optional dtype_opt, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype_opt, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] ScalarType dtype = c10::dtype_or_default(dtype_opt); TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -1515,10 +1515,10 @@ Tensor blackman_window( // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ hamming_window ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tensor hamming_window(int64_t window_length, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return native::hamming_window( window_length, /*periodic=*/true, dtype, layout, device, pin_memory); } @@ -1526,10 +1526,10 @@ Tensor hamming_window(int64_t window_length, Tensor hamming_window( int64_t window_length, bool periodic, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return native::hamming_window( window_length, periodic, @@ -1544,10 +1544,10 @@ Tensor hamming_window( int64_t window_length, bool periodic, double alpha, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return native::hamming_window( window_length, periodic, alpha, /*beta=*/0.46, dtype, layout, device, pin_memory); } @@ -1557,10 +1557,10 @@ Tensor hamming_window( bool periodic, double alpha, double beta, - c10::optional dtype_opt, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype_opt, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] ScalarType dtype = c10::dtype_or_default(dtype_opt); TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -1583,20 +1583,20 @@ Tensor hamming_window( // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ hann_window ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tensor hann_window(int64_t window_length, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return native::hann_window(window_length, /*periodic=*/true, dtype, layout, device, pin_memory); } Tensor hann_window( int64_t window_length, bool periodic, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -1608,10 +1608,10 @@ Tensor hann_window( // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ kaiser_window ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tensor kaiser_window(int64_t window_length, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { 
+ std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return native::kaiser_window( window_length, /*periodic=*/true, @@ -1623,10 +1623,10 @@ Tensor kaiser_window(int64_t window_length, } Tensor kaiser_window(int64_t window_length, bool periodic, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return native::kaiser_window(window_length, periodic, /*beta=*/12.0, dtype, layout, device, pin_memory); } @@ -1634,10 +1634,10 @@ Tensor kaiser_window( int64_t window_length, bool periodic, double beta, - c10::optional dtype_opt, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype_opt, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] ScalarType dtype = c10::dtype_or_default(dtype_opt); TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -1667,7 +1667,7 @@ Tensor kaiser_window( // ~~~~~~~~~~~~~~~~~~~~~~~~~~ vandermonde_matrix ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Tensor vander(const Tensor& x, c10::optional N, bool increasing) { +Tensor vander(const Tensor& x, std::optional N, bool increasing) { TORCH_CHECK(x.dim() == 1, "x must be a one-dimensional tensor."); // Acquires n, defaulting to size if not provided @@ -1717,11 +1717,11 @@ Tensor tensor_complex_backend(ArrayRef values, const TensorOptions& options) return at::detail::tensor_complex_backend(values, options); } -Tensor from_file(c10::string_view filename, c10::optional shared, c10::optional size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { +Tensor from_file(c10::string_view filename, std::optional shared, c10::optional size, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -1745,7 +1745,7 @@ Tensor from_file(c10::string_view filename, c10::optional shared, c10::opt // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ clone ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Tensor clone(const Tensor& src, c10::optional optional_memory_format) { +Tensor clone(const Tensor& src, std::optional optional_memory_format) { auto memory_format = optional_memory_format.value_or(MemoryFormat::Preserve); Tensor self; @@ -1777,10 +1777,10 @@ Tensor full( IntArrayRef size, const Scalar& fill_value, optional names, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -1795,10 +1795,10 @@ Tensor full( Tensor ones( IntArrayRef size, optional names, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] return native::full( @@ -1808,31 +1808,31 @@ Tensor ones( Tensor zeros( IntArrayRef size, optional names, - c10::optional 
dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return native::full(size, /*fill_value=*/0., names, dtype, layout, device, pin_memory); } Tensor randn( IntArrayRef size, optional names, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return native::randn(size, c10::nullopt, names, dtype, layout, device, pin_memory); } Tensor randn( IntArrayRef size, - c10::optional generator, + std::optional generator, optional names, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -1843,21 +1843,21 @@ Tensor randn( Tensor rand( IntArrayRef size, optional names, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return native::rand(size, c10::nullopt, names, dtype, layout, device, pin_memory); } Tensor rand( IntArrayRef size, - c10::optional generator, + std::optional generator, optional names, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); diff --git a/aten/src/ATen/native/TensorFactories.h b/aten/src/ATen/native/TensorFactories.h index f9b2893d768a9..58cbbfc4df334 100644 --- a/aten/src/ATen/native/TensorFactories.h +++ b/aten/src/ATen/native/TensorFactories.h @@ -63,7 +63,7 @@ inline int64_t get_tril_size(int64_t row, int64_t col, int64_t offset) { } inline void check_args( - int64_t row, int64_t col, c10::optional layout_opt) { + int64_t row, int64_t col, std::optional layout_opt) { TORCH_CHECK(row >= 0, "row must be non-negative, got", row); TORCH_CHECK(col >= 0, "col must be non-negative, got", col); if (layout_opt.has_value()) { diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index a99e6e3a50c11..c4b8b12b67307 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -228,7 +228,7 @@ inline void cat_check_no_zero_dim(const MaterializedITensorListRef& tensors) { } inline c10::MemoryFormat cat_compute_output_memory_format(const MaterializedITensorListRef& inputs) { - c10::optional format = c10::nullopt; + std::optional format = c10::nullopt; for (const Tensor& t : inputs) { auto f = t.suggest_memory_format(); if (f == c10::MemoryFormat::Contiguous) { @@ -2511,8 +2511,8 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in Tensor slice( const Tensor& self, int64_t dim, - c10::optional start, - c10::optional end, + std::optional start, + std::optional end, int64_t step) { int64_t ndim = self.dim(); if (ndim == 0) { @@ -2568,8 +2568,8 @@ Tensor slice_inverse_symint( const Tensor& self, const Tensor& base, int64_t /* dim */, - 
c10::optional /* start */, - c10::optional /* end */, + std::optional /* start */, + std::optional /* end */, SymInt /* step */) { // assume self has enough to storage to be viewed with base's metadata return self.as_strided_symint(base.sym_sizes(), base.sym_strides(), base.sym_storage_offset()); @@ -3227,16 +3227,28 @@ static inferSqueezeGeometry(const Tensor &tensor, std::bitset d namespace { // Named type instead of a pair/tuple so that we can be sure to // construct the vectors in place and get NRVO. +template struct InferUnsqueezeGeometryResult { - DimVector sizes; - DimVector strides; - InferUnsqueezeGeometryResult(IntArrayRef tensor_sizes, IntArrayRef tensor_strides) + SmallVectorsizes; + SmallVector strides; + InferUnsqueezeGeometryResult(ArrayRef tensor_sizes, ArrayRef tensor_strides) : sizes(tensor_sizes.begin(), tensor_sizes.end()) , strides(tensor_strides.begin(), tensor_strides.end()) {} }; -InferUnsqueezeGeometryResult + +InferUnsqueezeGeometryResult +inferUnsqueezeGeometry_symint(const Tensor& tensor, int64_t dim) { + InferUnsqueezeGeometryResult result(tensor.sym_sizes(), tensor.sym_strides()); + c10::SymInt new_stride = dim >= tensor.dim() ? 1 : result.sizes[dim] * result.strides[dim]; + result.sizes.insert(result.sizes.begin() + dim, 1); + result.strides.insert(result.strides.begin() + dim, new_stride); + + return result; +} + +InferUnsqueezeGeometryResult inferUnsqueezeGeometry(const Tensor& tensor, int64_t dim) { - InferUnsqueezeGeometryResult result(tensor.sizes(), tensor.strides()); + InferUnsqueezeGeometryResult result(tensor.sizes(), tensor.strides()); int64_t new_stride = dim >= tensor.dim() ? 1 : result.sizes[dim] * result.strides[dim]; result.sizes.insert(result.sizes.begin() + dim, 1); result.strides.insert(result.strides.begin() + dim, new_stride); @@ -3377,8 +3389,8 @@ Tensor _unsafe_view(const Tensor& self, IntArrayRef size) { Tensor unsqueeze(const Tensor& self, int64_t dim) { dim = maybe_wrap_dim(dim, self.dim() + 1); - auto g = inferUnsqueezeGeometry(self, dim); - return self.as_strided(g.sizes, g.strides); + auto g = inferUnsqueezeGeometry_symint(self, dim); + return self.as_strided_symint(g.sizes, g.strides); } Tensor unsqueeze_sparse(Tensor const &self, int64_t dim) { @@ -3507,7 +3519,7 @@ static inline void handle_unflatten_exception(const std::runtime_error &e, const Tensor &self, int64_t dim, SymIntArrayRef sizes, - c10::optional names) { + std::optional names) { if (!strstr(e.what(), "is invalid for input of size")) { TORCH_CHECK(false, "unflatten got an unexpected error:\n", e.what()); } @@ -3524,7 +3536,7 @@ static inline void handle_unflatten_exception(const std::runtime_error &e, } } -static Tensor unflatten_impl(const Tensor& self, int64_t dim, SymIntArrayRef sizes, c10::optional names) { +static Tensor unflatten_impl(const Tensor& self, int64_t dim, SymIntArrayRef sizes, std::optional names) { dim = maybe_wrap_dim(dim, self.dim()); TORCH_CHECK(!sizes.empty(), "unflatten: sizes must be non-empty"); @@ -4001,7 +4013,7 @@ at::Tensor clone_preserve_strides(const at::Tensor& self) { } -at::Tensor slice_scatter(const at::Tensor& self, const at::Tensor& src, int64_t dim, c10::optional start, c10::optional end, int64_t step) { +at::Tensor slice_scatter(const at::Tensor& self, const at::Tensor& src, int64_t dim, std::optional start, c10::optional end, int64_t step) { // See Note [*_scatter ops preserve strides] auto output = clone_preserve_strides(self); auto slice = output.slice(dim, start, end, step); @@ -4024,7 +4036,7 @@ at::Tensor 
diagonal_scatter(const at::Tensor& self, const at::Tensor& src, int64 slice.copy_(src); return output; } -at::Tensor as_strided_scatter_symint(const at::Tensor& self, const at::Tensor& src, at::SymIntArrayRef size, at::SymIntArrayRef stride, c10::optional storage_offset) { +at::Tensor as_strided_scatter_symint(const at::Tensor& self, const at::Tensor& src, at::SymIntArrayRef size, at::SymIntArrayRef stride, std::optional storage_offset) { // See Note [as_strided_scatter backward support] TORCH_INTERNAL_ASSERT(!self.requires_grad() || self.is_contiguous(), "as_strided_scatter is currently only supported for contiguous inputs"); // See Note [*_scatter ops preserve strides] diff --git a/aten/src/ATen/native/TensorTransformations.cpp b/aten/src/ATen/native/TensorTransformations.cpp index 5a7c3a6de965f..b13f28d56a86a 100644 --- a/aten/src/ATen/native/TensorTransformations.cpp +++ b/aten/src/ATen/native/TensorTransformations.cpp @@ -230,7 +230,7 @@ std::vector atleast_3d(TensorList tensors) { return result; } -Tensor chalf(const Tensor& self, c10::optional memory_format) { +Tensor chalf(const Tensor& self, std::optional memory_format) { return self.to(kComplexHalf, false, false, memory_format); } diff --git a/aten/src/ATen/native/TestOps.cpp b/aten/src/ATen/native/TestOps.cpp index e2fce123035ba..f9fa0839a51ae 100644 --- a/aten/src/ATen/native/TestOps.cpp +++ b/aten/src/ATen/native/TestOps.cpp @@ -49,7 +49,7 @@ Tensor _test_optional_intlist( /// Else, return a new tensor containing the elementwise sums. Tensor _test_optional_floatlist( const Tensor& values, - c10::optional> addends) { + std::optional> addends) { if (!addends) { return values; } diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index 6c22d2583f130..3520620280fee 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -772,23 +772,23 @@ Tensor square(const Tensor& self) { return at::pow(self, 2); } Tensor& square_(Tensor& self) { return self.pow_(2); } Tensor& logit_out(const Tensor& self, - c10::optional eps, + std::optional eps, Tensor& result) { return unary_op_impl_float_out( result, self, logit_stub, Scalar(eps ? eps.value() : -1.0)); } -Tensor logit(const Tensor& self, c10::optional eps) { +Tensor logit(const Tensor& self, std::optional eps) { return unary_op_impl_float( self, logit_stub, Scalar(eps ? 
eps.value() : -1.0)); } -Tensor& logit_(Tensor& self, c10::optional eps) { +Tensor& logit_(Tensor& self, std::optional eps) { return at::logit_out(self, self, eps); } -Tensor& special_logit_out(const Tensor& self, c10::optional eps, Tensor& result) { +Tensor& special_logit_out(const Tensor& self, std::optional eps, Tensor& result) { return at::logit_out(result, self, eps); } -Tensor special_logit(const Tensor& self, c10::optional eps) { +Tensor special_logit(const Tensor& self, std::optional eps) { return self.logit(eps); } @@ -801,9 +801,9 @@ Tensor special_expit(const Tensor& self) { } Tensor& nan_to_num_out(const Tensor& self, - c10::optional nan, - c10::optional pos_inf, - c10::optional neg_inf, + std::optional nan, + std::optional pos_inf, + std::optional neg_inf, Tensor& result) { TORCH_CHECK( self.scalar_type() == result.scalar_type(), @@ -825,18 +825,18 @@ Tensor& nan_to_num_out(const Tensor& self, Tensor nan_to_num( const Tensor& self, - c10::optional nan, - c10::optional pos_inf, - c10::optional neg_inf) { + std::optional nan, + std::optional pos_inf, + std::optional neg_inf) { auto result = at::empty_like(self); return at::nan_to_num_out(result, self, nan, pos_inf, neg_inf); } Tensor& nan_to_num_( Tensor& self, - c10::optional nan, - c10::optional pos_inf, - c10::optional neg_inf) { + std::optional nan, + std::optional pos_inf, + std::optional neg_inf) { return at::nan_to_num_out(self, self, nan, pos_inf, neg_inf); } diff --git a/aten/src/ATen/native/UnaryOps.h b/aten/src/ATen/native/UnaryOps.h index 91d4d84d4630c..3d99fdc40d048 100644 --- a/aten/src/ATen/native/UnaryOps.h +++ b/aten/src/ATen/native/UnaryOps.h @@ -93,30 +93,30 @@ DECLARE_DISPATCH(unary_fn, special_scaled_modified_bessel_k1_stub); DECLARE_DISPATCH(unary_fn, special_spherical_bessel_j0_stub); // NB: these are actually defined in Distribution -DECLARE_DISPATCH(void(*)(const TensorBase&, const TensorBase&, c10::optional), bernoulli_tensor_stub); -DECLARE_DISPATCH(void(*)(const TensorBase&, const double, c10::optional), bernoulli_scalar_stub); -DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, c10::optional), cauchy_stub); -DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, c10::optional), exponential_stub); -DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, c10::optional), geometric_stub); -DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, c10::optional), log_normal_stub); -DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, c10::optional), uniform_stub); -DECLARE_DISPATCH(void(*)(const TensorBase&, const double, const double, c10::optional), normal_stub); -DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const uint64_t, const int64_t, c10::optional), random_from_to_stub); -DECLARE_DISPATCH(void(*)(TensorIteratorBase&, c10::optional), random_full_64_bits_range_stub); -DECLARE_DISPATCH(void(*)(TensorIteratorBase&, c10::optional), random_stub); +DECLARE_DISPATCH(void(*)(const TensorBase&, const TensorBase&, std::optional), bernoulli_tensor_stub); +DECLARE_DISPATCH(void(*)(const TensorBase&, const double, std::optional), bernoulli_scalar_stub); +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, std::optional), cauchy_stub); +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, std::optional), exponential_stub); +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, std::optional), geometric_stub); +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, std::optional), 
log_normal_stub); +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, std::optional), uniform_stub); +DECLARE_DISPATCH(void(*)(const TensorBase&, const double, const double, std::optional), normal_stub); +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const uint64_t, const int64_t, std::optional), random_from_to_stub); +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, std::optional), random_full_64_bits_range_stub); +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, std::optional), random_stub); DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const int64_t, const double), kaiser_window_stub); DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const int64_t), polygamma_stub); DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const Scalar& a, const Scalar& b), clamp_stub); DECLARE_DISPATCH( - void (*)(Tensor&, const Tensor&, int64_t, c10::optional), + void (*)(Tensor&, const Tensor&, int64_t, std::optional), multinomial_with_replacement_stub); DECLARE_DISPATCH( void (*)( TensorIteratorBase&, - c10::optional, - c10::optional, - c10::optional), + std::optional, + std::optional, + std::optional), nan_to_num_stub); DECLARE_DISPATCH(void (*)(TensorIteratorBase&, int64_t), round_decimals_stub); diff --git a/aten/src/ATen/native/Unique.cpp b/aten/src/ATen/native/Unique.cpp index 801af5d5e79fe..5c0deff804a33 100644 --- a/aten/src/ATen/native/Unique.cpp +++ b/aten/src/ATen/native/Unique.cpp @@ -484,7 +484,7 @@ unique_dim_consecutive_cpu(const Tensor& self, const int64_t dim, const bool ret } std::tuple -unique_consecutive_cpu(const Tensor& self, const bool return_inverse, const bool return_counts, c10::optional dim) { +unique_consecutive_cpu(const Tensor& self, const bool return_inverse, const bool return_counts, std::optional dim) { if (!dim.has_value() || (dim.value() == 0 && self.dim() == 1)) { return AT_DISPATCH_V2(self.scalar_type(), "unique", AT_WRAP([&] { return unique_consecutive_cpu_template(self, return_inverse, return_counts); diff --git a/aten/src/ATen/native/UpSample.cpp b/aten/src/ATen/native/UpSample.cpp index 2403d11e4604e..e0e3f82ac32fc 100644 --- a/aten/src/ATen/native/UpSample.cpp +++ b/aten/src/ATen/native/UpSample.cpp @@ -10,7 +10,7 @@ namespace at::native::upsample { TORCH_API c10::SmallVector compute_output_size( c10::IntArrayRef input_size, // Full input tensor size. at::OptionalIntArrayRef output_size, - c10::optional> scale_factors) { + std::optional> scale_factors) { const auto spatial_dimensions = static_cast(input_size.size()) - 2; if (output_size) { TORCH_CHECK(!scale_factors, "Must specify exactly one of output_size and scale_factors"); diff --git a/aten/src/ATen/native/UpSample.h b/aten/src/ATen/native/UpSample.h index 8dadc7cee3ae4..e2b3c36b5d775 100644 --- a/aten/src/ATen/native/UpSample.h +++ b/aten/src/ATen/native/UpSample.h @@ -55,9 +55,9 @@ namespace upsample { TORCH_API c10::SmallVector compute_output_size( c10::IntArrayRef input_size, // Full input tensor size. 
at::OptionalIntArrayRef output_size, - c10::optional> scale_factors); + std::optional> scale_factors); -inline c10::optional get_scale_value(c10::optional> scales, int idx) { +inline std::optional get_scale_value(c10::optional> scales, int idx) { if (!scales) { return c10::nullopt; } @@ -66,7 +66,7 @@ inline c10::optional get_scale_value(c10::optional } // namespace upsample -using scale_t = c10::optional; +using scale_t = std::optional; using upsampling_nearest1d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_w); using _upsampling_nearest_exact1d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_w); using upsampling_nearest2d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_h, scale_t scales_w); @@ -252,7 +252,7 @@ static inline void upsample_2d_shape_check( template static inline scalar_t compute_scales_value( - const c10::optional scale, + const std::optional scale, int64_t input_size, int64_t output_size) { // see Note [compute_scales_value] @@ -267,7 +267,7 @@ static inline scalar_t area_pixel_compute_scale( int64_t input_size, int64_t output_size, bool align_corners, - const c10::optional scale) { + const std::optional scale) { // see Note [area_pixel_compute_scale] if(align_corners) { if(output_size > 1) { @@ -335,7 +335,7 @@ static inline int64_t nearest_idx( int64_t output_index, int64_t input_size, int64_t output_size, - c10::optional scales) { + std::optional scales) { // This method specificly treats cases: output_size == input_size or // output_size == 2 * input_size, that we would like to get rid of // We keep this method for BC and consider as deprecated. @@ -356,13 +356,13 @@ static inline int64_t nearest_exact_idx( int64_t output_index, int64_t input_size, int64_t output_size, - c10::optional scales) { + std::optional scales) { float scale = compute_scales_value(scales, input_size, output_size); return nearest_neighbor_exact_compute_source_index(scale, output_index, input_size); } // Define a typedef to dispatch to nearest_idx or nearest_exact_idx -typedef int64_t (*nearest_idx_fn_t)(int64_t, int64_t, int64_t, c10::optional); +typedef int64_t (*nearest_idx_fn_t)(int64_t, int64_t, int64_t, std::optional); template static scalar_t upsample_get_value_bounded( diff --git a/aten/src/ATen/native/UpSampleBicubic2d.cpp b/aten/src/ATen/native/UpSampleBicubic2d.cpp index f5e523c4a9114..8f5046534103b 100644 --- a/aten/src/ATen/native/UpSampleBicubic2d.cpp +++ b/aten/src/ATen/native/UpSampleBicubic2d.cpp @@ -23,7 +23,7 @@ namespace at::meta { TORCH_META_FUNC(upsample_bicubic2d) ( - const Tensor& input, IntArrayRef output_size, bool align_corners, c10::optional scales_h, c10::optional scales_w + const Tensor& input, IntArrayRef output_size, bool align_corners, std::optional scales_h, c10::optional scales_w ) { auto full_output_size = native::upsample_2d_common_check(input.sizes(), output_size); @@ -41,8 +41,8 @@ TORCH_META_FUNC(upsample_bicubic2d_backward) ( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w + std::optional scales_h, + std::optional scales_w ) { auto full_output_size = native::upsample_2d_common_check(input_size, output_size); @@ -62,7 +62,7 @@ TORCH_META_FUNC(upsample_bicubic2d_backward) ( } TORCH_META_FUNC(_upsample_bicubic2d_aa) ( - const Tensor& input, IntArrayRef output_size, bool align_corners, c10::optional scales_h, c10::optional scales_w + const Tensor& input, IntArrayRef output_size, bool align_corners, std::optional scales_h, 
c10::optional scales_w ) { auto full_output_size = native::upsample_2d_common_check(input.sizes(), output_size); @@ -80,8 +80,8 @@ TORCH_META_FUNC(_upsample_bicubic2d_aa_backward) ( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w + std::optional scales_h, + std::optional scales_w ) { auto full_output_size = native::upsample_2d_common_check(input_size, output_size); @@ -115,8 +115,8 @@ static void upsample_bicubic2d_backward_out_frame( int64_t nbatch, int64_t channels, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { channels = channels * nbatch; auto input_slice_size = input_height * input_width; auto output_slice_size = output_height * output_width; @@ -185,8 +185,8 @@ static void upsample_bicubic2d_backward_kernel( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { int64_t output_height = output_size[0]; int64_t output_width = output_size[1]; @@ -227,8 +227,8 @@ TORCH_IMPL_FUNC(upsample_bicubic2d_out_cpu) ( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& output ) { upsample_bicubic2d_kernel(kCPU, output, input, align_corners, scales_h, scales_w); @@ -239,8 +239,8 @@ TORCH_IMPL_FUNC(upsample_bicubic2d_backward_out_cpu) ( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input ) { grad_input.zero_(); @@ -251,8 +251,8 @@ TORCH_IMPL_FUNC(_upsample_bicubic2d_aa_out_cpu) ( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& output ) { _upsample_bicubic2d_aa_kernel(kCPU, output, input, align_corners, scales_h, scales_w); @@ -263,8 +263,8 @@ TORCH_IMPL_FUNC(_upsample_bicubic2d_aa_backward_out_cpu) ( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input ) { grad_input.zero_(); @@ -280,7 +280,7 @@ Tensor upsample_bicubic2d( const Tensor& input, at::OptionalIntArrayRef output_size, bool align_corners, - c10::optional> scale_factors) { + std::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_h = get_scale_value(scale_factors, 0); auto scale_w = get_scale_value(scale_factors, 1); @@ -291,7 +291,7 @@ Tensor _upsample_bicubic2d_aa( const Tensor& input, at::OptionalIntArrayRef output_size, bool align_corners, - c10::optional> scale_factors) { + std::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_h = get_scale_value(scale_factors, 0); auto scale_w = get_scale_value(scale_factors, 1); diff --git a/aten/src/ATen/native/UpSampleBilinear2d.cpp b/aten/src/ATen/native/UpSampleBilinear2d.cpp index 202f33ab7970e..2cc8b56678c74 100644 --- a/aten/src/ATen/native/UpSampleBilinear2d.cpp +++ b/aten/src/ATen/native/UpSampleBilinear2d.cpp @@ -24,7 +24,7 @@ namespace at::meta { TORCH_META_FUNC(upsample_bilinear2d) ( - const Tensor& input, IntArrayRef output_size, bool 
align_corners, c10::optional scales_h, c10::optional scales_w + const Tensor& input, IntArrayRef output_size, bool align_corners, std::optional scales_h, c10::optional scales_w ) { auto full_output_size = native::upsample_2d_common_check(input.sizes(), output_size); @@ -42,8 +42,8 @@ TORCH_META_FUNC(upsample_bilinear2d_backward) ( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w + std::optional scales_h, + std::optional scales_w ) { auto full_output_size = native::upsample_2d_common_check(input_size, output_size); @@ -63,7 +63,7 @@ TORCH_META_FUNC(upsample_bilinear2d_backward) ( } TORCH_META_FUNC(_upsample_bilinear2d_aa) ( - const Tensor& input, IntArrayRef output_size, bool align_corners, c10::optional scales_h, c10::optional scales_w + const Tensor& input, IntArrayRef output_size, bool align_corners, std::optional scales_h, c10::optional scales_w ) { auto full_output_size = native::upsample_2d_common_check(input.sizes(), output_size); @@ -81,8 +81,8 @@ TORCH_META_FUNC(_upsample_bilinear2d_aa_backward) ( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w + std::optional scales_h, + std::optional scales_w ) { auto full_output_size = native::upsample_2d_common_check(input_size, output_size); @@ -109,8 +109,8 @@ TORCH_IMPL_FUNC(upsample_bilinear2d_out_cpu) ( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& output ) { upsample_bilinear2d_kernel(kCPU, output, input, align_corners, scales_h, scales_w); @@ -121,8 +121,8 @@ TORCH_IMPL_FUNC(upsample_bilinear2d_backward_out_cpu) ( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input ) { grad_input.zero_(); @@ -134,8 +134,8 @@ TORCH_IMPL_FUNC(_upsample_bilinear2d_aa_out_cpu) ( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& output ) { _upsample_bilinear2d_aa_kernel(kCPU, output, input, align_corners, scales_h, scales_w); @@ -146,8 +146,8 @@ TORCH_IMPL_FUNC(_upsample_bilinear2d_aa_backward_out_cpu) ( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input ) { grad_input.zero_(); @@ -161,7 +161,7 @@ Tensor upsample_bilinear2d( const Tensor& input, at::OptionalIntArrayRef output_size, bool align_corners, - c10::optional> scale_factors) { + std::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_h = get_scale_value(scale_factors, 0); auto scale_w = get_scale_value(scale_factors, 1); @@ -172,7 +172,7 @@ Tensor _upsample_bilinear2d_aa( const Tensor& input, at::OptionalIntArrayRef output_size, bool align_corners, - c10::optional> scale_factors) { + std::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_h = get_scale_value(scale_factors, 0); auto scale_w = get_scale_value(scale_factors, 1); diff --git a/aten/src/ATen/native/UpSampleLinear1d.cpp b/aten/src/ATen/native/UpSampleLinear1d.cpp index 7d80d5c2dc2b8..affbcaa4f06d9 100644 --- 
a/aten/src/ATen/native/UpSampleLinear1d.cpp +++ b/aten/src/ATen/native/UpSampleLinear1d.cpp @@ -23,7 +23,7 @@ TORCH_META_FUNC(upsample_linear1d) ( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales + std::optional scales ) { auto full_output_size = native::upsample_1d_common_check(input.sizes(), output_size); @@ -41,7 +41,7 @@ TORCH_META_FUNC(upsample_linear1d_backward) ( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales + std::optional scales ) { auto full_output_size = native::upsample_1d_common_check(input_size, output_size); @@ -65,7 +65,7 @@ TORCH_IMPL_FUNC(upsample_linear1d_out_cpu) ( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales, + std::optional scales, const Tensor& output ) { upsample_linear1d_kernel(kCPU, output, input, align_corners, scales); @@ -76,7 +76,7 @@ TORCH_IMPL_FUNC(upsample_linear1d_backward_out_cpu) ( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales, + std::optional scales, const Tensor& grad_input ) { grad_input.zero_(); @@ -92,7 +92,7 @@ Tensor upsample_linear1d( const Tensor& input, at::OptionalIntArrayRef output_size, bool align_corners, - c10::optional> scale_factors) { + std::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_w = get_scale_value(scale_factors, 0); return at::upsample_linear1d(input, osize, align_corners, scale_w); diff --git a/aten/src/ATen/native/UpSampleNearest1d.cpp b/aten/src/ATen/native/UpSampleNearest1d.cpp index 94441d6c3df97..7555d421d4afd 100644 --- a/aten/src/ATen/native/UpSampleNearest1d.cpp +++ b/aten/src/ATen/native/UpSampleNearest1d.cpp @@ -21,7 +21,7 @@ namespace at::meta { TORCH_META_FUNC(upsample_nearest1d) ( - const Tensor& input, IntArrayRef output_size, c10::optional scales + const Tensor& input, IntArrayRef output_size, std::optional scales ) { auto full_output_size = native::upsample_1d_common_check(input.sizes(), output_size); @@ -35,7 +35,7 @@ TORCH_META_FUNC(upsample_nearest1d) ( } TORCH_META_FUNC(_upsample_nearest_exact1d) ( - const Tensor& input, IntArrayRef output_size, c10::optional scales + const Tensor& input, IntArrayRef output_size, std::optional scales ) { auto full_output_size = native::upsample_1d_common_check(input.sizes(), output_size); @@ -49,7 +49,7 @@ TORCH_META_FUNC(_upsample_nearest_exact1d) ( } TORCH_META_FUNC(upsample_nearest1d_backward) ( - const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, c10::optional scales + const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, std::optional scales ) { auto full_output_size = native::upsample_1d_common_check(input_size, output_size); @@ -61,7 +61,7 @@ TORCH_META_FUNC(upsample_nearest1d_backward) ( } TORCH_META_FUNC(_upsample_nearest_exact1d_backward) ( - const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, c10::optional scales + const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, std::optional scales ) { auto full_output_size = native::upsample_1d_common_check(input_size, output_size); @@ -80,7 +80,7 @@ namespace at::native { TORCH_IMPL_FUNC(upsample_nearest1d_out_cpu) ( const Tensor& input, IntArrayRef output_size, - c10::optional scales, + std::optional scales, const Tensor& output ) { upsample_nearest1d_kernel(kCPU, output, input, scales); @@ -89,7 +89,7 @@ TORCH_IMPL_FUNC(upsample_nearest1d_out_cpu) ( 
TORCH_IMPL_FUNC(_upsample_nearest_exact1d_out_cpu) ( const Tensor& input, IntArrayRef output_size, - c10::optional scales, + std::optional scales, const Tensor& output ) { _upsample_nearest_exact1d_kernel(kCPU, output, input, scales); @@ -99,7 +99,7 @@ TORCH_IMPL_FUNC(upsample_nearest1d_backward_out_cpu) ( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales, + std::optional scales, const Tensor& grad_input ) { grad_input.zero_(); @@ -110,7 +110,7 @@ TORCH_IMPL_FUNC(_upsample_nearest_exact1d_backward_out_cpu) ( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales, + std::optional scales, const Tensor& grad_input ) { grad_input.zero_(); @@ -125,7 +125,7 @@ using at::native::upsample::get_scale_value; Tensor upsample_nearest1d( const Tensor& input, at::OptionalIntArrayRef output_size, - c10::optional> scale_factors) { + std::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_w = get_scale_value(scale_factors, 0); return at::upsample_nearest1d(input, osize, scale_w); @@ -134,7 +134,7 @@ Tensor upsample_nearest1d( Tensor _upsample_nearest_exact1d( const Tensor& input, at::OptionalIntArrayRef output_size, - c10::optional> scale_factors) { + std::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_w = get_scale_value(scale_factors, 0); return at::_upsample_nearest_exact1d(input, osize, scale_w); diff --git a/aten/src/ATen/native/UpSampleNearest2d.cpp b/aten/src/ATen/native/UpSampleNearest2d.cpp index 592108291cf76..0ee2db0597023 100644 --- a/aten/src/ATen/native/UpSampleNearest2d.cpp +++ b/aten/src/ATen/native/UpSampleNearest2d.cpp @@ -22,7 +22,7 @@ namespace at::meta { TORCH_META_FUNC(upsample_nearest2d) ( - const Tensor& input, IntArrayRef output_size, c10::optional scales_h, c10::optional scales_w + const Tensor& input, IntArrayRef output_size, std::optional scales_h, c10::optional scales_w ) { auto full_output_size = native::upsample_2d_common_check(input.sizes(), output_size); @@ -36,7 +36,7 @@ TORCH_META_FUNC(upsample_nearest2d) ( } TORCH_META_FUNC(_upsample_nearest_exact2d) ( - const Tensor& input, IntArrayRef output_size, c10::optional scales_h, c10::optional scales_w + const Tensor& input, IntArrayRef output_size, std::optional scales_h, c10::optional scales_w ) { auto full_output_size = native::upsample_2d_common_check(input.sizes(), output_size); @@ -53,8 +53,8 @@ TORCH_META_FUNC(upsample_nearest2d_backward) ( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales_h, - c10::optional scales_w + std::optional scales_h, + std::optional scales_w ) { auto full_output_size = native::upsample_2d_common_check(input_size, output_size); @@ -77,8 +77,8 @@ TORCH_META_FUNC(_upsample_nearest_exact2d_backward) ( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales_h, - c10::optional scales_w + std::optional scales_h, + std::optional scales_w ) { auto full_output_size = native::upsample_2d_common_check(input_size, output_size); @@ -104,8 +104,8 @@ namespace at::native { TORCH_IMPL_FUNC(upsample_nearest2d_out_cpu) ( const Tensor& input, IntArrayRef output_size, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& output ) { upsample_nearest2d_kernel(kCPU, output, input, scales_h, scales_w); @@ -114,8 +114,8 @@ 
TORCH_IMPL_FUNC(upsample_nearest2d_out_cpu) ( TORCH_IMPL_FUNC(_upsample_nearest_exact2d_out_cpu) ( const Tensor& input, IntArrayRef output_size, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& output ) { _upsample_nearest_exact2d_kernel(kCPU, output, input, scales_h, scales_w); @@ -125,8 +125,8 @@ TORCH_IMPL_FUNC(upsample_nearest2d_backward_out_cpu) ( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input) { grad_input.zero_(); upsample_nearest2d_backward_kernel(kCPU, grad_input, grad_output, scales_h, scales_w); @@ -136,8 +136,8 @@ TORCH_IMPL_FUNC(_upsample_nearest_exact2d_backward_out_cpu) ( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input) { grad_input.zero_(); _upsample_nearest_exact2d_backward_kernel(kCPU, grad_input, grad_output, scales_h, scales_w); @@ -149,7 +149,7 @@ using at::native::upsample::get_scale_value; Tensor upsample_nearest2d( const Tensor& input, at::OptionalIntArrayRef output_size, - c10::optional> scale_factors) { + std::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_h = get_scale_value(scale_factors, 0); auto scale_w = get_scale_value(scale_factors, 1); @@ -159,7 +159,7 @@ Tensor upsample_nearest2d( Tensor _upsample_nearest_exact2d( const Tensor& input, at::OptionalIntArrayRef output_size, - c10::optional> scale_factors) { + std::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_h = get_scale_value(scale_factors, 0); auto scale_w = get_scale_value(scale_factors, 1); diff --git a/aten/src/ATen/native/UpSampleNearest3d.cpp b/aten/src/ATen/native/UpSampleNearest3d.cpp index 0c4851b7be513..ac4dc1796252e 100644 --- a/aten/src/ATen/native/UpSampleNearest3d.cpp +++ b/aten/src/ATen/native/UpSampleNearest3d.cpp @@ -23,9 +23,9 @@ namespace at::meta { TORCH_META_FUNC(upsample_nearest3d) ( const Tensor& input, IntArrayRef output_size, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w + std::optional scales_d, + std::optional scales_h, + std::optional scales_w ) { auto full_output_size = native::upsample_3d_common_check(input.sizes(), output_size); @@ -41,9 +41,9 @@ TORCH_META_FUNC(upsample_nearest3d) ( TORCH_META_FUNC(_upsample_nearest_exact3d) ( const Tensor& input, IntArrayRef output_size, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w + std::optional scales_d, + std::optional scales_h, + std::optional scales_w ) { auto full_output_size = native::upsample_3d_common_check(input.sizes(), output_size); @@ -60,9 +60,9 @@ TORCH_META_FUNC(upsample_nearest3d_backward) ( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w + std::optional scales_d, + std::optional scales_h, + std::optional scales_w ) { auto full_output_size = native::upsample_3d_common_check(input_size, output_size); @@ -85,9 +85,9 @@ TORCH_META_FUNC(_upsample_nearest_exact3d_backward) ( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w + std::optional scales_d, 
+ std::optional scales_h, + std::optional scales_w ) { auto full_output_size = native::upsample_3d_common_check(input_size, output_size); @@ -113,9 +113,9 @@ namespace at::native { TORCH_IMPL_FUNC(upsample_nearest3d_out_cpu) ( const Tensor& input, IntArrayRef output_size, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_d, + std::optional scales_h, + std::optional scales_w, const Tensor& output ) { upsample_nearest3d_kernel(kCPU, output, input, scales_d, scales_h, scales_w); @@ -124,9 +124,9 @@ TORCH_IMPL_FUNC(upsample_nearest3d_out_cpu) ( TORCH_IMPL_FUNC(_upsample_nearest_exact3d_out_cpu) ( const Tensor& input, IntArrayRef output_size, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_d, + std::optional scales_h, + std::optional scales_w, const Tensor& output ) { _upsample_nearest_exact3d_kernel(kCPU, output, input, scales_d, scales_h, scales_w); @@ -136,9 +136,9 @@ TORCH_IMPL_FUNC(upsample_nearest3d_backward_out_cpu) ( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_d, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input) { grad_input.zero_(); upsample_nearest3d_backward_kernel(kCPU, grad_input, grad_output, scales_d, scales_h, scales_w); @@ -148,9 +148,9 @@ TORCH_IMPL_FUNC(_upsample_nearest_exact3d_backward_out_cpu) ( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_d, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input) { grad_input.zero_(); _upsample_nearest_exact3d_backward_kernel(kCPU, grad_input, grad_output, scales_d, scales_h, scales_w); @@ -164,7 +164,7 @@ using at::native::upsample::get_scale_value; Tensor upsample_nearest3d( const Tensor& input, at::OptionalIntArrayRef output_size, - c10::optional> scale_factors) { + std::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_d = get_scale_value(scale_factors, 0); auto scale_h = get_scale_value(scale_factors, 1); @@ -175,7 +175,7 @@ Tensor upsample_nearest3d( Tensor _upsample_nearest_exact3d( const Tensor& input, at::OptionalIntArrayRef output_size, - c10::optional> scale_factors) { + std::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_d = get_scale_value(scale_factors, 0); auto scale_h = get_scale_value(scale_factors, 1); diff --git a/aten/src/ATen/native/UpSampleTrilinear3d.cpp b/aten/src/ATen/native/UpSampleTrilinear3d.cpp index 24a915d5d9a42..9aa8f9c5cb73c 100644 --- a/aten/src/ATen/native/UpSampleTrilinear3d.cpp +++ b/aten/src/ATen/native/UpSampleTrilinear3d.cpp @@ -23,9 +23,9 @@ TORCH_META_FUNC(upsample_trilinear3d) ( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w + std::optional scales_d, + std::optional scales_h, + std::optional scales_w ) { auto full_output_size = native::upsample_3d_common_check(input.sizes(), output_size); @@ -43,9 +43,9 @@ TORCH_META_FUNC(upsample_trilinear3d_backward) ( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w + std::optional scales_d, + std::optional scales_h, + 
std::optional scales_w ) { auto full_output_size = native::upsample_3d_common_check(input_size, output_size); @@ -71,9 +71,9 @@ TORCH_IMPL_FUNC(upsample_trilinear3d_out_cpu) ( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_d, + std::optional scales_h, + std::optional scales_w, const Tensor& output ) { upsample_trilinear3d_kernel(kCPU, output, input, align_corners, scales_d, scales_h, scales_w); @@ -84,9 +84,9 @@ TORCH_IMPL_FUNC(upsample_trilinear3d_backward_out_cpu) ( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_d, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input ) { grad_input.zero_(); @@ -102,7 +102,7 @@ Tensor upsample_trilinear3d( const Tensor& input, at::OptionalIntArrayRef output_size, bool align_corners, - c10::optional> scale_factors) { + std::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_d = get_scale_value(scale_factors, 0); auto scale_h = get_scale_value(scale_factors, 1); diff --git a/aten/src/ATen/native/VariableMethodStubs.cpp b/aten/src/ATen/native/VariableMethodStubs.cpp index 477979d190be2..ed99aed399cb1 100644 --- a/aten/src/ATen/native/VariableMethodStubs.cpp +++ b/aten/src/ATen/native/VariableMethodStubs.cpp @@ -24,7 +24,7 @@ namespace at::native { -void _backward(const Tensor& self, TensorList inputs, const c10::optional& gradient_opt, c10::optional keep_graph, bool create_graph) { +void _backward(const Tensor& self, TensorList inputs, const std::optional& gradient_opt, c10::optional keep_graph, bool create_graph) { return self._backward(inputs, gradient_opt, keep_graph, create_graph); } diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.h b/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.h index 58a1a43fe67bc..a14fd4efc1b15 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.h +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.h @@ -14,7 +14,7 @@ namespace sparse { struct TORCH_API PackedLinearWeight : public LinearPackedParamsBase { PackedLinearWeight(std::unique_ptr> w, - c10::optional bias, + std::optional bias, std::vector col_offsets, std::vector w_scale, std::vector w_zp, @@ -31,7 +31,7 @@ struct TORCH_API PackedLinearWeight w_zp(std::move(w_zp)), q_scheme(q_scheme) {} std::unique_ptr> w; - c10::optional bias_; + std::optional bias_; std::vector col_offsets; std::vector w_scale; std::vector w_zp; @@ -68,13 +68,13 @@ struct TORCH_API PackedLinearWeight static c10::intrusive_ptr deserialize( const BCSRSerializationType& serialized); - c10::optional bias() override { + std::optional bias() override { return bias_; } static c10::intrusive_ptr prepack( const at::Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const int64_t out_features_block_size, const int64_t in_features_block_size); diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/packed_params.h b/aten/src/ATen/native/ao_sparse/quantized/cpu/packed_params.h index 1ca66bf536a77..db8ee9d619066 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/packed_params.h +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/packed_params.h @@ -9,14 +9,14 @@ namespace sparse { // using LinearPackedSerializationType = - std::tuple, std::vector>; + std::tuple, std::vector>; #define 
SPARSE_LINEAR_PACKED_PARAM_SERIALIZATION_VERSION 2 using BCSRSerializationType = std::tuple< int64_t, // Serialization Version - c10::optional, // Bias + std::optional, // Bias int64_t, // Out Features (Row) Block Size int64_t, // In Features (Column) Block Size at::Tensor, // Weight Scales (single element vector if per-tensor) (float) @@ -60,9 +60,9 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder { virtual BCSRSerializationType serialize() = 0; - virtual c10::optional bias() = 0; + virtual std::optional bias() = 0; - virtual void set_bias(const c10::optional& bias) { + virtual void set_bias(const std::optional& bias) { throw std::runtime_error( "set_bias is not implemented for this packed " "parameter type"); diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp index 8f80d920e3652..f5032f4d425b8 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp @@ -53,7 +53,7 @@ void calc_col_offsets_transpose( c10::intrusive_ptr PackedLinearWeight:: prepack( const at::Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const int64_t out_features_block_size, const int64_t in_features_block_size) { TORCH_CHECK( @@ -110,7 +110,7 @@ c10::intrusive_ptr PackedLinearWeight:: /*col_offsets=*/col_offsets.data(), /*qtype=*/qtype); - c10::optional bias_contig; + std::optional bias_contig; if (bias.has_value()) { const at::Tensor& bias_vec = bias.value(); TORCH_CHECK(bias_vec.dim() == 1, "bias should be a vector (1D Tensor)"); @@ -139,7 +139,7 @@ c10::intrusive_ptr PackedLinearWeight:: c10::intrusive_ptr PackedLinearWeightQnnp:: prepack( const at::Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const int64_t out_features_block_size, const int64_t in_features_block_size) { at::native::initQNNPACK(); @@ -150,7 +150,7 @@ c10::intrusive_ptr PackedLinearWeightQnnp:: // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) PackedLinearWeightQnnp::PackedLinearWeightQnnp( const at::Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const int64_t out_features_block_size, const int64_t in_features_block_size) : LinearPackedParamsBase(out_features_block_size, in_features_block_size), @@ -215,7 +215,7 @@ class QLinearPackWeightInt8 final { public: static c10::intrusive_ptr run( const at::Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const int64_t out_features_block_size, const int64_t in_features_block_size) { auto& ctx = at::globalContext(); diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/qnnpack_utils.h b/aten/src/ATen/native/ao_sparse/quantized/cpu/qnnpack_utils.h index 6ac89681899c5..b791cbe845756 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/qnnpack_utils.h +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/qnnpack_utils.h @@ -16,9 +16,9 @@ namespace sparse { struct TORCH_API PackedLinearWeightQnnp : public LinearPackedParamsBase { - PackedLinearWeightQnnp(const at::Tensor& weight, const c10::optional& bias, const int64_t out_features_block_size /* block sparsity size across output_features */, const int64_t in_features_block_size /* block sparsity size across input_features */); + PackedLinearWeightQnnp(const at::Tensor& weight, const std::optional& bias, const int64_t out_features_block_size /* block sparsity size across output_features */, const int64_t in_features_block_size /* block 
sparsity size across input_features */); explicit PackedLinearWeightQnnp(const BCSRSerializationType& serialized); - c10::optional orig_bias_; + std::optional orig_bias_; // Separate copy of bias exist so that we can fill in zeros when // optional bias does not exist. This is to compy with qnnpack operator that // expects bias to be present. @@ -67,13 +67,13 @@ struct TORCH_API PackedLinearWeightQnnp static c10::intrusive_ptr deserialize( const BCSRSerializationType& serialized); - c10::optional bias() override { + std::optional bias() override { return orig_bias_; } static c10::intrusive_ptr prepack( const at::Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const int64_t out_features_block_size, const int64_t in_features_block_size); diff --git a/aten/src/ATen/native/cpu/AvgPoolKernel.cpp b/aten/src/ATen/native/cpu/AvgPoolKernel.cpp index 572d5af43f651..4bf03b12b1446 100644 --- a/aten/src/ATen/native/cpu/AvgPoolKernel.cpp +++ b/aten/src/ATen/native/cpu/AvgPoolKernel.cpp @@ -21,7 +21,7 @@ void cpu_avg_pool2d( int64_t dW, int64_t dH, int64_t padW, int64_t padH, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { using acc_t = at::opmath_type; auto input = input_.contiguous(); @@ -108,7 +108,7 @@ void cpu_avg_pool2d_channels_last( int64_t dW, int64_t dH, int64_t padW, int64_t padH, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { TORCH_CHECK(input_.ndimension() == 4, "2d average pooling with channels last format supports tensors with 4 dims"); auto memory_format = at::MemoryFormat::ChannelsLast; @@ -222,7 +222,7 @@ void cpu_avg_pool2d_channels_last( int64_t dW, int64_t dH, int64_t padW, int64_t padH, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { TORCH_CHECK(input_.ndimension() == 4, "2d average pooling with channels last format supports tensors with 4 dims"); auto memory_format = at::MemoryFormat::ChannelsLast; @@ -354,7 +354,7 @@ void cpu_avg_pool2d_backward( int dW, int dH, int padW, int padH, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { auto grad_output = grad_output_.contiguous(); auto grad_input = grad_input_.contiguous(); @@ -422,7 +422,7 @@ void cpu_avg_pool2d_backward_channels_last( int dW, int dH, int padW, int padH, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { auto memory_format = at::MemoryFormat::ChannelsLast; auto grad_input = grad_input_.contiguous(memory_format); auto grad_output = grad_output_.contiguous(memory_format); @@ -501,7 +501,7 @@ void avg_pool2d_kernel_impl( int64_t dW, int64_t dH, int64_t padW, int64_t padH, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { switch (input.suggest_memory_format()) { case at::MemoryFormat::Contiguous: { AT_DISPATCH_FLOATING_TYPES_AND3(kLong, kBFloat16, kHalf, input.scalar_type(), "avg_pool2d", [&] { @@ -527,7 +527,7 @@ void avg_pool2d_backward_kernel_impl( int dW, int dH, int padW, int padH, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { switch (grad_output.suggest_memory_format()) { case at::MemoryFormat::Contiguous: { AT_DISPATCH_FLOATING_TYPES_AND3(kLong, kBFloat16, kHalf, grad_output.scalar_type(), "avg_pool2d_backward", [&] { @@ -555,7 +555,7 @@ void cpu_avg_pool3d( int64_t dW, int64_t dH, int64_t dD, int64_t padW, int64_t padH, int64_t padD, bool count_include_pad, - 
c10::optional divisor_override) { + std::optional divisor_override) { using acc_t = at::opmath_type; auto input = input_.contiguous(); @@ -651,7 +651,7 @@ void cpu_avg_pool3d_channels_last( int64_t dW, int64_t dH, int64_t dD, int64_t padW, int64_t padH, int64_t padD, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { TORCH_CHECK(input_.ndimension() == 5, "3d average pooling with channels last format supports tensors with 5 dims"); auto memory_format = at::MemoryFormat::ChannelsLast3d; @@ -774,7 +774,7 @@ void cpu_avg_pool3d_channels_last( int64_t dW, int64_t dH, int64_t dD, int64_t padW, int64_t padH, int64_t padD, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { TORCH_CHECK(input_.ndimension() == 5, "3d average pooling with channels last format supports tensors with 5 dims"); auto memory_format = at::MemoryFormat::ChannelsLast3d; @@ -915,7 +915,7 @@ void cpu_avg_pool3d_backward( int dW, int dH, int dD, int padW, int padH, int padD, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { auto grad_output = grad_output_.contiguous(); auto grad_input = grad_input_.contiguous(); @@ -992,7 +992,7 @@ void cpu_avg_pool3d_backward_channels_last( int dW, int dH, int dD, int padW, int padH, int padD, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { auto memory_format = at::MemoryFormat::ChannelsLast3d; auto grad_input = grad_input_.contiguous(memory_format); auto grad_output = grad_output_.contiguous(memory_format); @@ -1083,7 +1083,7 @@ void avg_pool3d_kernel_impl( int64_t dW, int64_t dH, int64_t dD, int64_t padW, int64_t padH, int64_t padD, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { switch (input.suggest_memory_format()) { case at::MemoryFormat::Contiguous: { AT_DISPATCH_FLOATING_TYPES_AND3(kLong, kBFloat16, kHalf, input.scalar_type(), "avg_pool3d", [&] { @@ -1110,7 +1110,7 @@ void avg_pool3d_backward_kernel_impl( int dW, int dH, int dD, int padW, int padH, int padD, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { switch (grad_output.suggest_memory_format()) { case at::MemoryFormat::Contiguous: { AT_DISPATCH_FLOATING_TYPES_AND3(kLong, kBFloat16, kHalf, grad_output.scalar_type(), "avg_pool3d_backward", [&] { diff --git a/aten/src/ATen/native/cpu/DistributionKernels.cpp b/aten/src/ATen/native/cpu/DistributionKernels.cpp index 6dce481853ac2..7ee014058d70d 100644 --- a/aten/src/ATen/native/cpu/DistributionKernels.cpp +++ b/aten/src/ATen/native/cpu/DistributionKernels.cpp @@ -26,27 +26,27 @@ namespace at::native { namespace { -static void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma, c10::optional gen) { +static void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma, std::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::cauchy_kernel(iter, median, sigma, generator); } -void bernoulli_tensor_kernel(const TensorBase &self, const TensorBase &p_, c10::optional gen) { +void bernoulli_tensor_kernel(const TensorBase &self, const TensorBase &p_, std::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::bernoulli_kernel(self, p_, generator); } #if !AT_MKL_ENABLED() -void bernoulli_scalar_kernel_default(const TensorBase &self, double p, 
c10::optional gen) { +void bernoulli_scalar_kernel_default(const TensorBase &self, double p, std::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::bernoulli_kernel(self, p, generator); } -void bernoulli_scalar_kernel(const TensorBase &self, double p, c10::optional gen) { +void bernoulli_scalar_kernel(const TensorBase &self, double p, std::optional gen) { bernoulli_scalar_kernel_default(self, p, gen); } #else -void bernoulli_scalar_kernel(const TensorBase &self, double p, c10::optional gen) { +void bernoulli_scalar_kernel(const TensorBase &self, double p, std::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); int64_t seed; { @@ -99,17 +99,17 @@ void bernoulli_scalar_kernel(const TensorBase &self, double p, c10::optional gen) { +static void exponential_kernel_default(TensorIteratorBase& iter, double lambda, std::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::exponential_kernel(iter, lambda, generator); } #if (!AT_MKL_ENABLED() || defined(FBCODE_CAFFE2)) -void exponential_kernel(TensorIteratorBase& iter, double lambda, c10::optional gen) { +void exponential_kernel(TensorIteratorBase& iter, double lambda, std::optional gen) { exponential_kernel_default(iter, lambda, gen); } #else -void exponential_kernel(TensorIteratorBase &iter, double lambda, c10::optional gen) { +void exponential_kernel(TensorIteratorBase &iter, double lambda, std::optional gen) { TORCH_CHECK(isFloatingType(iter.dtype()), "Exponential distribution is a continuous probability distribution. dtype must be a floating point but you specified ", iter.dtype()); Tensor self = iter.tensor(0); @@ -195,32 +195,32 @@ void exponential_kernel(TensorIteratorBase &iter, double lambda, c10::optional gen) { +static void geometric_kernel(TensorIteratorBase& iter, double p, std::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::geometric_kernel(iter, p, generator); } -static void log_normal_kernel(TensorIteratorBase& iter, double mean, double std, c10::optional gen) { +static void log_normal_kernel(TensorIteratorBase& iter, double mean, double std, std::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::log_normal_kernel(iter, mean, std, generator); } -void uniform_kernel(TensorIteratorBase& iter, double from, double to, c10::optional gen) { +void uniform_kernel(TensorIteratorBase& iter, double from, double to, std::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::uniform_kernel(iter, from, to, generator); } -void normal_kernel(const TensorBase &self, double mean, double std, c10::optional gen) { +void normal_kernel(const TensorBase &self, double mean, double std, std::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::normal_kernel(self, mean, std, generator); } -static void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, c10::optional gen) { +static void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, std::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::random_from_to_kernel(iter, range, 
base, generator); } -static void random_kernel(TensorIteratorBase& iter, c10::optional gen) { +static void random_kernel(TensorIteratorBase& iter, std::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::random_kernel(iter, generator); } @@ -228,7 +228,7 @@ static void random_kernel(TensorIteratorBase& iter, c10::optional gen // This is the special kernel to handle single specific case: // from(inclusive) = std::numeric_limits::lowest() // to(exclusive) = None (= std::numeric_limits::max() + 1) -static void random_full_64_bits_range_kernel(TensorIteratorBase& iter, c10::optional gen) { +static void random_full_64_bits_range_kernel(TensorIteratorBase& iter, std::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::random_full_64_bits_range_kernel(iter, generator); } diff --git a/aten/src/ATen/native/cpu/DistributionTemplates.h b/aten/src/ATen/native/cpu/DistributionTemplates.h index 93a9b33b29285..961c0a3811ec1 100644 --- a/aten/src/ATen/native/cpu/DistributionTemplates.h +++ b/aten/src/ATen/native/cpu/DistributionTemplates.h @@ -57,10 +57,10 @@ void random_full_64_bits_range_kernel(TensorIteratorBase& iter, RNG generator) { template struct RandomFromToKernel { - void operator()(TensorIteratorBase& iter, uint64_t range, int64_t base, c10::optional gen) { + void operator()(TensorIteratorBase& iter, uint64_t range, int64_t base, std::optional gen) { random_from_to_kernel(iter, range, base, check_generator(gen)); } - void operator()(TensorIteratorBase& iter, c10::optional gen) { + void operator()(TensorIteratorBase& iter, std::optional gen) { random_full_64_bits_range_kernel(iter, check_generator(gen)); } }; @@ -78,7 +78,7 @@ void random_kernel(TensorIteratorBase& iter, RNG generator) { template struct RandomKernel { - void operator()(TensorIteratorBase& iter, c10::optional gen) { + void operator()(TensorIteratorBase& iter, std::optional gen) { random_kernel(iter, check_generator(gen)); } }; @@ -257,7 +257,7 @@ void normal_kernel(const TensorBase &self, double mean, double std, RNG generato template struct NormalKernel { - void operator()(Tensor& self, double mean, double std, c10::optional gen) { + void operator()(Tensor& self, double mean, double std, std::optional gen) { normal_kernel(self, mean, std, check_generator(gen)); } }; @@ -279,7 +279,7 @@ void uniform_kernel(TensorIteratorBase& iter, double from_, double to_, RNG gene template struct UniformKernel { - void operator()(TensorIteratorBase& iter, double from, double to, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double from, double to, std::optional gen) { uniform_kernel(iter, from, to, check_generator(gen)); } }; @@ -299,7 +299,7 @@ void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma, RNG ge template struct CauchyKernel { - void operator()(TensorIteratorBase& iter, double median, double sigma, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double median, double sigma, std::optional gen) { cauchy_kernel(iter, median, sigma, check_generator(gen)); } }; @@ -319,7 +319,7 @@ void log_normal_kernel(TensorIteratorBase& iter, double mean, double std, RNG ge template struct LogNormalKernel { - void operator()(TensorIteratorBase& iter, double mean, double std, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double mean, double std, std::optional gen) { log_normal_kernel(iter, mean, std, check_generator(gen)); } }; @@ 
-339,7 +339,7 @@ void geometric_kernel(TensorIteratorBase& iter, double p, RNG generator) { template struct GeometricKernel { - void operator()(TensorIteratorBase& iter, double p, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double p, std::optional gen) { geometric_kernel(iter, p, check_generator(gen)); } }; @@ -360,7 +360,7 @@ void exponential_kernel(TensorIteratorBase& iter, double lambda, RNG generator) template struct ExponentialKernel { - void operator()(TensorIteratorBase& iter, double lambda, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double lambda, std::optional gen) { exponential_kernel(iter, lambda, check_generator(gen)); } }; @@ -415,10 +415,10 @@ void bernoulli_kernel(const TensorBase &self, double p, RNG generator) { template struct BernoulliKernel { - void operator()(const TensorBase &self, double p, c10::optional gen) { + void operator()(const TensorBase &self, double p, std::optional gen) { bernoulli_kernel(self, p, check_generator(gen)); } - void operator()(const TensorBase &self, const TensorBase &p_, c10::optional gen) { + void operator()(const TensorBase &self, const TensorBase &p_, std::optional gen) { bernoulli_kernel(self, p_, check_generator(gen)); } }; diff --git a/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp b/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp index cb96f24ebdde6..28422330403c6 100644 --- a/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp +++ b/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp @@ -151,8 +151,8 @@ void cpu_flash_attention( const at::Tensor& v, double dropout_p, bool is_causal, - c10::optional attn_mask, - c10::optional scale) { + std::optional attn_mask, + std::optional scale) { // Query (Batch x Num_heads x Q_seq_len x Dim_per_head) // -> (Batch x Q_seq_len x Num_heads x Dim_per_head) // Key (Batch x Num_heads x KV_seq_len x Dim_per_head) @@ -400,8 +400,8 @@ void cpu_flash_attention_backward( const at::Tensor& logsumexp, double dropout_p, bool is_causal, - c10::optional attn_mask, - c10::optional scale) { + std::optional attn_mask, + std::optional scale) { constexpr bool is_reduced_type = is_reduced_floating_point_v; using accum_t = at::opmath_type; using Vec = vec::Vectorized; @@ -694,8 +694,8 @@ void flash_attention_kernel_impl( const at::Tensor& value, double dropout_p, bool is_causal, - c10::optional attn_mask, - c10::optional scale) { + std::optional attn_mask, + std::optional scale) { auto q_seq_len = query.size(2); AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, query.scalar_type(), "flash_attention", [&] { @@ -727,8 +727,8 @@ void flash_attention_backward_kernel_impl( const at::Tensor& logsumexp, double dropout_p, bool is_causal, - c10::optional attn_mask, - c10::optional scale) { + std::optional attn_mask, + std::optional scale) { // make sure grad_out has no zero strides (broadcasted dimensions) // since we are going to call gemm next // zero stride in leading dimension would lead to slow impl for gemm diff --git a/aten/src/ATen/native/cpu/FusedAdagradKernel.cpp b/aten/src/ATen/native/cpu/FusedAdagradKernel.cpp deleted file mode 100644 index 70085fde1e907..0000000000000 --- a/aten/src/ATen/native/cpu/FusedAdagradKernel.cpp +++ /dev/null @@ -1,225 +0,0 @@ -#define TORCH_ASSERT_ONLY_METHOD_OPERATORS -#include -#include -#include -#include -#include -#include -#include -#include -namespace at::native { - -namespace{ - -template -typename std::enable_if< - std::is_same::value || std::is_same::value, - void>:: - type inline adagrad_math( - scalar_t* param_ptr, - scalar_t* 
grad_ptr, - scalar_t* state_sum_ptr, - const double clr, - const double eps, - const double weight_decay, - const bool maximize, - const float* grad_scale_ptr, - int64_t size -){ - using lpVec = at::vec::Vectorized; - using fVec = at::vec::Vectorized; - lpVec grad_vec_to_store; - fVec param_vec1, param_vec2; - fVec grad_vec1, grad_vec2; - fVec state_sum_vec1, state_sum_vec2; - int64_t d = 0; - for (; d < size - (size % lpVec::size()); d += lpVec::size()) { - lpVec param_lpvec = lpVec::loadu(param_ptr + d); - std::tie(param_vec1, param_vec2) = vec::convert_to_float(param_lpvec); - lpVec grad_lpvec = lpVec::loadu(grad_ptr + d); - std::tie(grad_vec1, grad_vec2) = vec::convert_to_float(grad_lpvec); - if (grad_scale_ptr) { - grad_vec1 = grad_vec1 / fVec(float(*grad_scale_ptr)); - grad_vec2 = grad_vec2 / fVec(float(*grad_scale_ptr)); - grad_vec_to_store = vec::convert_from_float(grad_vec1, grad_vec2); - grad_vec_to_store.store(grad_ptr + d); - } - if (maximize){ - grad_vec1 = grad_vec1 * fVec(opmath_t(-1.0)); - grad_vec2 = grad_vec2 * fVec(opmath_t(-1.0)); - } - if (weight_decay != 0.0){ - grad_vec1 += param_vec1 * fVec(scalar_t(weight_decay)); - grad_vec2 += param_vec2 * fVec(scalar_t(weight_decay)); - } - std::tie(state_sum_vec1, state_sum_vec2) = vec::convert_to_float(lpVec::loadu(state_sum_ptr + d)); - state_sum_vec1 += grad_vec1 * grad_vec1; - state_sum_vec2 += grad_vec2 * grad_vec2; - vec::convert_from_float(state_sum_vec1, state_sum_vec2).store(state_sum_ptr + d); - - fVec std_vec1 = state_sum_vec1.sqrt() + fVec(scalar_t(eps)); - fVec std_vec2 = state_sum_vec2.sqrt() + fVec(scalar_t(eps)); - param_vec1 = param_vec1 - fVec(scalar_t(clr)) * grad_vec1 / std_vec1; - param_vec2 = param_vec2 - fVec(scalar_t(clr)) * grad_vec2 / std_vec2; - vec::convert_from_float(param_vec1, param_vec2).store(param_ptr + d); - } - scalar_t grad_val_to_store; - for (; d < size; d++) { - opmath_t grad_val = grad_ptr[d]; - opmath_t param_val = param_ptr[d]; - if (grad_scale_ptr) { - grad_val = grad_ptr[d] / opmath_t(*grad_scale_ptr); - grad_val_to_store = grad_val; - grad_ptr[d] = grad_val_to_store; - } - if (maximize) grad_val = -grad_val; - if (weight_decay != 0.0){ - grad_val += param_val * opmath_t(weight_decay); - } - opmath_t state_sum_val = state_sum_ptr[d]; - state_sum_val += grad_val * grad_val; - state_sum_ptr[d] = state_sum_val; - opmath_t std_val = std::sqrt(state_sum_val) + opmath_t(eps); - param_val -= opmath_t(clr) * grad_val / std_val; - param_ptr[d] = param_val; - } -} - - -template -typename std::enable_if< - std::is_same::value || std::is_same::value, - void>:: - type inline adagrad_math( - scalar_t* param_ptr, - scalar_t* grad_ptr, - scalar_t* state_sum_ptr, - const double clr, - const double eps, - const double weight_decay, - const bool maximize, - const float* grad_scale_ptr, - int64_t size -){ - using Vec = at::vec::Vectorized; - Vec grad_vec_to_store; - int64_t d = 0; - for (; d < size - (size % Vec::size()); d += Vec::size()) { - Vec param_vec = Vec::loadu(param_ptr + d); - Vec grad_vec = Vec::loadu(grad_ptr + d); - if (grad_scale_ptr) { - grad_vec = grad_vec / Vec(scalar_t(*grad_scale_ptr)); - grad_vec_to_store = grad_vec; - grad_vec_to_store.store(grad_ptr + d); - } - if (maximize) grad_vec = grad_vec * Vec(scalar_t(-1.0)); - if (weight_decay != 0.0){ - grad_vec += param_vec * Vec(scalar_t(weight_decay)); - } - - Vec sum_vec = Vec::loadu(state_sum_ptr + d) + grad_vec * grad_vec; - sum_vec.store(state_sum_ptr + d); - - Vec std_vec = sum_vec.sqrt() + Vec(scalar_t(eps)); - param_vec = 
param_vec - Vec(scalar_t(clr)) * grad_vec / std_vec; - param_vec.store(param_ptr + d); - } - scalar_t grad_val_to_store; - for (; d < size; d++) { - scalar_t grad_val = grad_ptr[d]; - if (grad_scale_ptr) { - grad_val = grad_ptr[d] / scalar_t(*grad_scale_ptr); - grad_val_to_store = grad_val; - grad_ptr[d] = grad_val_to_store; - } - if (maximize) grad_val = -grad_val; - if (weight_decay != 0.0){ - grad_val += param_ptr[d] * scalar_t(weight_decay); - } - state_sum_ptr[d] += grad_val * grad_val; - - scalar_t std_val = std::sqrt(state_sum_ptr[d]) + scalar_t(eps); - param_ptr[d] -= scalar_t(clr) * grad_val / std_val; - } -} - -template -void adagrad_fused_step_impl( - const at::Tensor& param, - const at::Tensor& grad, - const at::Tensor& state_sum, - const at::Tensor& state_step, - const double lr, - const double lr_decay, - const double weight_decay, - const double eps, - const bool maximize, - const float* grad_scale_ptr) { - using opmath_t = at::opmath_type; - scalar_t* param_data = param.data_ptr(); - scalar_t* grad_data = grad.data_ptr(); - scalar_t* state_sum_data = state_sum.data_ptr(); - double step = state_step.item(); - double clr = lr / (1.0 + (step - 1.0) * lr_decay); - - constexpr size_t cache_line_size = 64; - constexpr int64_t cache_line_aligned_task_unit = cache_line_size / sizeof(scalar_t); - size_t num_units = divup(param.numel(), cache_line_aligned_task_unit); - - auto adagrad_fn = [&](int64_t begin, int64_t end) { - // local pointers - begin *= cache_line_aligned_task_unit; - end = std::min(end * cache_line_aligned_task_unit, param.numel()); - scalar_t* param_ptr = param_data + begin; - scalar_t* grad_ptr = grad_data + begin; - scalar_t* state_sum_ptr = state_sum_data + begin; - - const int64_t size = end - begin; - adagrad_math( - param_ptr, - grad_ptr, - state_sum_ptr, - clr, - eps, - weight_decay, - maximize, - grad_scale_ptr, - size - ); - }; - at::parallel_for( - 0, num_units, 0, adagrad_fn); -} - -void fused_adagrad_kernel( - const at::Tensor& param, - const at::Tensor& grad, - const at::Tensor& state_sum, - const at::Tensor& state_step, - const double lr, - const double lr_decay, - const double weight_decay, - const double eps, - const bool maximize, - const float* grad_scale_ptr - ) { - Tensor grad_contiguous = grad.contiguous(); - AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, param.scalar_type(), "fused_adagrad_kernel", [&] { - adagrad_fused_step_impl( - param, - grad, - state_sum, - state_step, - lr, - lr_decay, - weight_decay, - eps, - maximize, - grad_scale_ptr); - }); -} - -} - -REGISTER_DISPATCH(fused_adagrad_stub, &fused_adagrad_kernel); -} // namespace at::native diff --git a/aten/src/ATen/native/cpu/HistogramKernel.cpp b/aten/src/ATen/native/cpu/HistogramKernel.cpp index 196bfd5647a76..829ec71fbd07c 100644 --- a/aten/src/ATen/native/cpu/HistogramKernel.cpp +++ b/aten/src/ATen/native/cpu/HistogramKernel.cpp @@ -78,7 +78,7 @@ enum BIN_SELECTION_ALGORITHM { }; template void histogramdd_cpu_contiguous(Tensor& hist, const TensorList& bin_edges, - const Tensor& input, const c10::optional& weight) { + const Tensor& input, const std::optional& weight) { TORCH_INTERNAL_ASSERT(input.dim() == 2); const int64_t N = input.size(0); @@ -100,12 +100,12 @@ void histogramdd_cpu_contiguous(Tensor& hist, const TensorList& bin_edges, TensorAccessor accessor_in = input.accessor(); - /* Constructs a c10::optional containing an accessor iff + /* Constructs a std::optional containing an accessor if * the optional weight tensor has a value. 
*/ const auto accessor_wt = weight.has_value() - ? c10::optional>(weight.value().accessor()) - : c10::optional>(); + ? std::optional>(weight.value().accessor()) + : std::optional>(); std::vector bin_seq(D); std::vector num_bin_edges(D); @@ -208,7 +208,7 @@ void histogramdd_cpu_contiguous(Tensor& hist, const TensorList& bin_edges, * Initializes hist to 0, calls into the main algorithm, and normalizes output if necessary. */ template -void histogramdd_out_cpu_template(const Tensor& self, const c10::optional& weight, bool density, +void histogramdd_out_cpu_template(const Tensor& self, const std::optional& weight, bool density, Tensor& hist, const TensorList& bin_edges) { hist.fill_(0); @@ -219,8 +219,8 @@ void histogramdd_out_cpu_template(const Tensor& self, const c10::optional(weight.value().reshape({M})) - : c10::optional(); + ? std::optional(weight.value().reshape({M})) + : std::optional(); std::vector bin_edges_contig(bin_edges.size()); for (const auto dim : c10::irange(bin_edges_contig.size())) { @@ -259,7 +259,7 @@ void histogramdd_out_cpu_template(const Tensor& self, const c10::optional& weight, bool density, +static void histogramdd_kernel_impl(const Tensor& self, const std::optional& weight, bool density, Tensor& hist, const TensorList& bin_edges) { histogramdd_out_cpu_template(self, weight, density, hist, bin_edges); } @@ -269,7 +269,7 @@ static void histogramdd_kernel_impl(const Tensor& self, const c10::optional& weight, +static void histogramdd_linear_kernel_impl(const Tensor& self, const std::optional& weight, bool density, Tensor& hist, const TensorList& bin_edges, bool local_search) { if (local_search) { // histogramdd codepath: both hist and bin_edges are eventually returned as output, diff --git a/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp b/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp index d5af5d23e8b10..0ebe127c6a8dc 100644 --- a/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp +++ b/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp @@ -54,7 +54,7 @@ void cpu_max_unpool( int64_t input_image_size = numel / channels; int64_t output_image_size = output.numel() / channels; - c10::optional optional_error_index; + std::optional optional_error_index; // parallel on dim N, C, D, H, W: [channels, input_image_size] at::parallel_for(0, numel, 0, [&](int64_t begin, int64_t end) { @@ -118,7 +118,7 @@ void cpu_max_unpool_channels_last( int64_t input_image_size = input_height * input_width; int64_t output_image_size = output_height * output_width; - c10::optional optional_error_index; + std::optional optional_error_index; // parallel on dim N, H, W at::parallel_for(0, nbatch * input_image_size, 0, [&](int64_t begin, int64_t end) { @@ -191,7 +191,7 @@ void cpu_max_unpool_backward( int64_t input_image_size = numel / channels; int64_t output_image_size = grad_output.numel() / channels; - c10::optional optional_error_index; + std::optional optional_error_index; // parallel on dim N, C, D, H, W at::parallel_for(0, numel, 0, [&](int64_t begin, int64_t end) { diff --git a/aten/src/ATen/native/cpu/MultinomialKernel.cpp b/aten/src/ATen/native/cpu/MultinomialKernel.cpp index 1c4054abdf239..f15292bd21fdb 100644 --- a/aten/src/ATen/native/cpu/MultinomialKernel.cpp +++ b/aten/src/ATen/native/cpu/MultinomialKernel.cpp @@ -24,7 +24,7 @@ multinomial_with_replacement_apply( Tensor& result, const Tensor& self, const int64_t n_sample, - c10::optional generator) { + std::optional generator) { auto gen = get_generator_or_default( generator, detail::getDefaultCPUGenerator()); // See Note [Acquire lock when using 
random generators] @@ -128,7 +128,7 @@ multinomial_with_replacement_apply( Tensor& result, const Tensor& self, const int64_t n_sample, - c10::optional generator) { + std::optional generator) { auto gen = get_generator_or_default( generator, detail::getDefaultCPUGenerator()); // See Note [Acquire lock when using random generators] @@ -230,7 +230,7 @@ static void multinomial_with_replacement_kernel_impl( Tensor& result, const Tensor& self, const int64_t n_sample, - c10::optional gen) { + std::optional gen) { AT_DISPATCH_FLOATING_TYPES_AND2( kHalf, kBFloat16, self.scalar_type(), "multinomial", [&] { multinomial_with_replacement_apply( diff --git a/aten/src/ATen/native/cpu/ReduceUtils.h b/aten/src/ATen/native/cpu/ReduceUtils.h index d6afac295aff6..8c6424f8b0eac 100644 --- a/aten/src/ATen/native/cpu/ReduceUtils.h +++ b/aten/src/ATen/native/cpu/ReduceUtils.h @@ -60,7 +60,7 @@ inline vec_scalar_t init_value() { } template -inline vec_scalar_t init_value(const c10::optional& initial) { +inline vec_scalar_t init_value(const std::optional& initial) { using acc_t = vec_scalar_t; if (initial.has_value()) { return initial.value().to(); @@ -80,7 +80,7 @@ inline void init(scalar_t* out, int64_t size, const vec_scalar_t& val) } template -inline void init(scalar_t* out, int64_t size, const c10::optional& initial) { +inline void init(scalar_t* out, int64_t size, const std::optional& initial) { using acc_t = vec_scalar_t; acc_t val = init_value(initial); init(out, size, val); diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp index 461ceb2f36383..9754b003e19c6 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -496,9 +496,9 @@ inline Vectorized> _nan_to_num_replace( static void nan_to_num_kernel( TensorIteratorBase& iter, - c10::optional nan, - c10::optional pos_inf, - c10::optional neg_inf) { + std::optional nan, + std::optional pos_inf, + std::optional neg_inf) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kBFloat16, kHalf, iter.dtype(), "nan_to_num", [&]() { using value_t = c10::scalar_value_type::type; value_t nan_replacement = static_cast(nan.value_or(0.)); diff --git a/aten/src/ATen/native/cpu/UpSampleKernel.cpp b/aten/src/ATen/native/cpu/UpSampleKernel.cpp index 67fe50c1d2a62..17b6d0a543f34 100644 --- a/aten/src/ATen/native/cpu/UpSampleKernel.cpp +++ b/aten/src/ATen/native/cpu/UpSampleKernel.cpp @@ -21,7 +21,7 @@ namespace at::native { namespace { -using scale_t = std::vector>; +using scale_t = std::vector>; // TODO: this file could benefit from a global renaming of its functions / // classes and terms, as well as from adding more comments. 
In particular: @@ -987,7 +987,7 @@ struct HelperInterpBase { template static inline std::tuple, int, unsigned int> _compute_index_ranges_int16_weights( int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims, - int64_t reshape_dim, bool align_corners, const c10::optional opt_scale, + int64_t reshape_dim, bool align_corners, const std::optional opt_scale, int interp_size, aa_filter_fn_t aa_filter_fn, bool antialias, bool align_i32=false ) { @@ -1072,7 +1072,7 @@ struct HelperInterpNearest : public HelperInterpBase { static inline std::vector compute_indices_weights( at::ScalarType scalar_type, int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims, - int64_t reshape_dim, bool align_corners, const c10::optional opt_scale + int64_t reshape_dim, bool align_corners, const std::optional opt_scale ) { TORCH_INTERNAL_ASSERT(!align_corners); @@ -1123,7 +1123,7 @@ struct HelperInterpNearestExact : public HelperInterpNearest { static inline std::vector compute_indices_weights( at::ScalarType scalar_type, int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims, - int64_t reshape_dim, bool align_corners, const c10::optional opt_scale + int64_t reshape_dim, bool align_corners, const std::optional opt_scale ) { TORCH_INTERNAL_ASSERT(!align_corners); @@ -1175,7 +1175,7 @@ struct HelperInterpLinear : public HelperInterpBase { static inline std::vector compute_indices_weights( at::ScalarType scalar_type, int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims, int64_t reshape_dim, - bool align_corners, const c10::optional opt_scale + bool align_corners, const std::optional opt_scale ) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) std::vector output; @@ -1230,7 +1230,7 @@ struct HelperInterpLinear : public HelperInterpBase { int64_t ndims, int64_t reshape_dim, bool align_corners, - const c10::optional opt_scale, + const std::optional opt_scale, bool antialias ) { @@ -1266,7 +1266,7 @@ struct HelperInterpLinear : public HelperInterpBase { int64_t ndims, int64_t reshape_dim, bool align_corners, - const c10::optional opt_scale, + const std::optional opt_scale, bool antialias, bool align_i32=false ) { @@ -1296,7 +1296,7 @@ struct HelperInterpCubic : public HelperInterpBase { static inline std::vector compute_indices_weights( at::ScalarType scalar_type, int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims, int64_t reshape_dim, - bool align_corners, const c10::optional opt_scale + bool align_corners, const std::optional opt_scale ) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) std::vector output; @@ -1364,7 +1364,7 @@ struct HelperInterpCubic : public HelperInterpBase { int64_t ndims, int64_t reshape_dim, bool align_corners, - const c10::optional opt_scale, + const std::optional opt_scale, bool antialias ) { @@ -1400,7 +1400,7 @@ struct HelperInterpCubic : public HelperInterpBase { int64_t ndims, int64_t reshape_dim, bool align_corners, - const c10::optional opt_scale, + const std::optional opt_scale, bool antialias, bool align_i32=false ) { @@ -1422,7 +1422,7 @@ struct HelperInterpCubic : public HelperInterpBase { // // Internally, it uses TensorIterator to optimize the computations. 
// - out_ndims is the number of interpolated dims: 1, 2, 3 -// - scale_type is template type for scales, typically c10::optional +// - scale_type is template type for scales, typically std::optional // - template class F is one of the above structs to compute indices and weights template void upsample_generic_Nd_kernel_impl( @@ -1686,7 +1686,7 @@ void separable_upsample_generic_Nd_kernel_impl( void upsample_nearest1d_kernel_impl( const Tensor& output, const Tensor& input, - c10::optional scales_w) { + std::optional scales_w) { upsample_generic_Nd_kernel_impl<1, scale_t, HelperInterpNearest>( output, input, false, {scales_w}); } @@ -1694,7 +1694,7 @@ void upsample_nearest1d_kernel_impl( void _upsample_nearest_exact1d_kernel_impl( const Tensor& output, const Tensor& input, - c10::optional scales_w) { + std::optional scales_w) { upsample_generic_Nd_kernel_impl<1, scale_t, HelperInterpNearestExact>( output, input, false, {scales_w}); } @@ -1726,8 +1726,8 @@ int _use_vectorized_kernel_cond_3d( void upsample_nearest2d_kernel_impl( const Tensor& output, const Tensor& input, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { if (_use_vectorized_kernel_cond_2d(output, input)) { AT_DISPATCH_FLOATING_TYPES_AND3(kByte, kBFloat16, kHalf, input.scalar_type(), "upsample_nearest2d_channels_last", [&] { @@ -1742,8 +1742,8 @@ void upsample_nearest2d_kernel_impl( void _upsample_nearest_exact2d_kernel_impl( const Tensor& output, const Tensor& input, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { if (_use_vectorized_kernel_cond_2d(output, input)) { AT_DISPATCH_FLOATING_TYPES_AND3(kByte, kBFloat16, kHalf, input.scalar_type(), "upsample_nearest2d_channels_last", [&] { cpu_upsample_nearest_channels_last(output, input, {scales_h, scales_w}); @@ -1757,9 +1757,9 @@ void _upsample_nearest_exact2d_kernel_impl( void upsample_nearest3d_kernel_impl( const Tensor& output, const Tensor& input, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_d, + std::optional scales_h, + std::optional scales_w) { if (_use_vectorized_kernel_cond_3d(output, input)) { AT_DISPATCH_FLOATING_TYPES_AND3(kByte, kBFloat16, kHalf, input.scalar_type(), "upsample_nearest3d_channels_last", [&] { @@ -1774,9 +1774,9 @@ void upsample_nearest3d_kernel_impl( void _upsample_nearest_exact3d_kernel_impl( const Tensor& output, const Tensor& input, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_d, + std::optional scales_h, + std::optional scales_w) { if (_use_vectorized_kernel_cond_3d(output, input)) { AT_DISPATCH_FLOATING_TYPES_AND3(kByte, kBFloat16, kHalf, input.scalar_type(), "upsample_nearest3d_channels_last", [&] { cpu_upsample_nearest_channels_last(output, input, {scales_d, scales_h, scales_w}); @@ -1791,7 +1791,7 @@ void upsample_linear1d_kernel_impl( const Tensor& output, const Tensor& input, bool align_corners, - c10::optional scales_w) { + std::optional scales_w) { upsample_generic_Nd_kernel_impl<1, scale_t, HelperInterpLinear>( output, input, align_corners, {scales_w}); } @@ -1801,8 +1801,8 @@ void upsample_bilinear2d_kernel_impl_float( const Tensor& output, const Tensor& input, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { // See note above about _use_vectorized_kernel_cond_2d(output, input). 
The extra cond is present // because benchmarks showed that with only 1 thread, images (C == 3) were @@ -1823,8 +1823,8 @@ void upsample_bilinear2d_kernel_impl( const Tensor& output, const Tensor& input, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { if (input.dtype() == at::kByte){ #ifdef CPU_CAPABILITY_AVX2 @@ -1852,8 +1852,8 @@ void upsample_bilinear2d_aa_kernel_impl( const Tensor& output, const Tensor& input, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { #ifdef CPU_CAPABILITY_AVX2 if (input.dtype() == at::kByte && input.size(1) <= 4) { upsample_avx_bilinear_bicubic_uint8( @@ -1875,9 +1875,9 @@ void upsample_trilinear3d_kernel_impl( const Tensor& output, const Tensor& input, bool align_corners, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_d, + std::optional scales_h, + std::optional scales_w) { if ((_use_vectorized_kernel_cond_3d(output, input))) { AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, input.scalar_type(), "upsample_trilinear3d_channels_last", [&] { cpu_upsample_linear_channels_last(output, input, align_corners, {scales_d, scales_h, scales_w}); @@ -1892,8 +1892,8 @@ void upsample_bicubic2d_kernel_impl( const Tensor& output, const Tensor& input, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { if (input.dtype() == at::kByte){ #ifdef CPU_CAPABILITY_AVX2 @@ -1922,8 +1922,8 @@ void upsample_bicubic2d_aa_kernel_impl( const Tensor& output, const Tensor& input, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { #ifdef CPU_CAPABILITY_AVX2 if (input.dtype() == at::kByte && input.size(1) <= 4) { @@ -2061,8 +2061,8 @@ void upsample_bilinear2d_aa_backward_kernel_impl( const Tensor& grad_input, const Tensor& grad_output, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { AT_DISPATCH_FLOATING_TYPES( grad_output.scalar_type(), "upsample_bilinear2d_aa_backward_cpu", [&] { cpu_upsample_genNd_backward_aa( @@ -2074,8 +2074,8 @@ void upsample_bicubic2d_aa_backward_kernel_impl( const Tensor& grad_input, const Tensor& grad_output, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { AT_DISPATCH_FLOATING_TYPES( grad_output.scalar_type(), "upsample_bicubic2d_aa_backward_cpu", [&] { cpu_upsample_genNd_backward_aa( diff --git a/aten/src/ATen/native/cpu/UpSampleMoreKernel.cpp b/aten/src/ATen/native/cpu/UpSampleMoreKernel.cpp index b97b5cefee2c8..fae70686591ee 100644 --- a/aten/src/ATen/native/cpu/UpSampleMoreKernel.cpp +++ b/aten/src/ATen/native/cpu/UpSampleMoreKernel.cpp @@ -12,7 +12,7 @@ namespace at::native { namespace { -using scale_t = std::vector>; +using scale_t = std::vector>; template , @@ -337,7 +337,7 @@ void cpu_upsample_nearest_backward_channels_last( void upsample_nearest1d_backward_kernel_impl( const Tensor& grad_input, const Tensor& grad_output, - c10::optional scales_w) { + std::optional scales_w) { AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "upsample_nearest1d_backward", [&] { cpu_upsample_nearest_backward(grad_input, grad_output, {scales_w}); }); @@ -346,7 +346,7 @@ void upsample_nearest1d_backward_kernel_impl( void 
_upsample_nearest_exact1d_backward_kernel_impl( const Tensor& grad_input, const Tensor& grad_output, - c10::optional scales_w) { + std::optional scales_w) { AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "_upsample_nearest_exact1d_backward", [&] { cpu_upsample_nearest_backward(grad_input, grad_output, {scales_w}); }); @@ -355,8 +355,8 @@ void _upsample_nearest_exact1d_backward_kernel_impl( void upsample_nearest2d_backward_kernel_impl( const Tensor& grad_input, const Tensor& grad_output, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { if (grad_output.is_contiguous(at::MemoryFormat::ChannelsLast)) { AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "upsample_nearest2d_backward_cl", [&] { cpu_upsample_nearest_backward_channels_last(grad_input, grad_output, {scales_h, scales_w}); @@ -371,8 +371,8 @@ void upsample_nearest2d_backward_kernel_impl( void _upsample_nearest_exact2d_backward_kernel_impl( const Tensor& grad_input, const Tensor& grad_output, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { if (grad_output.is_contiguous(at::MemoryFormat::ChannelsLast)) { AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "_upsample_nearest_exact2d_backward_cl", [&] { cpu_upsample_nearest_backward_channels_last(grad_input, grad_output, {scales_h, scales_w}); @@ -387,9 +387,9 @@ void _upsample_nearest_exact2d_backward_kernel_impl( void upsample_nearest3d_backward_kernel_impl( const Tensor& grad_input, const Tensor& grad_output, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_d, + std::optional scales_h, + std::optional scales_w) { if (grad_output.is_contiguous(at::MemoryFormat::ChannelsLast3d)) { AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "_upsample_nearest3d_backward_cl", [&] { cpu_upsample_nearest_backward_channels_last(grad_input, grad_output, {scales_d, scales_h, scales_w}); @@ -404,9 +404,9 @@ void upsample_nearest3d_backward_kernel_impl( void _upsample_nearest_exact3d_backward_kernel_impl( const Tensor& grad_input, const Tensor& grad_output, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_d, + std::optional scales_h, + std::optional scales_w) { if (grad_output.is_contiguous(at::MemoryFormat::ChannelsLast3d)) { AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "_upsample_nearest_exact3d_backward_cl", [&] { cpu_upsample_nearest_backward_channels_last(grad_input, grad_output, {scales_d, scales_h, scales_w}); @@ -745,7 +745,7 @@ void upsample_linear1d_backward_kernel_impl( const Tensor& grad_input, const Tensor& grad_output, bool align_corners, - c10::optional scales_w) { + std::optional scales_w) { AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "upsample_linear1d_backward", [&] { cpu_upsample_linear_backward(grad_input, grad_output, align_corners, {scales_w}); }); @@ -755,8 +755,8 @@ void upsample_bilinear2d_backward_kernel_impl( const Tensor& grad_input, const Tensor& grad_output, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { if (grad_output.is_contiguous(at::MemoryFormat::ChannelsLast)) { AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "upsample_bilinear2d_backward_channels_last", [&] { 
cpu_upsample_linear_backward_channels_last(grad_input, grad_output, align_corners, {scales_h, scales_w}); @@ -772,9 +772,9 @@ void upsample_trilinear3d_backward_kernel_impl( const Tensor& grad_input, const Tensor& grad_output, bool align_corners, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_d, + std::optional scales_h, + std::optional scales_w) { if (grad_output.is_contiguous(at::MemoryFormat::ChannelsLast3d)) { AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "upsample_trilinear3d_backward_channels_last", [&] { cpu_upsample_linear_backward_channels_last(grad_input, grad_output, align_corners, {scales_d, scales_h, scales_w}); diff --git a/aten/src/ATen/native/cuda/AveragePool2d.cu b/aten/src/ATen/native/cuda/AveragePool2d.cu index e55b9e5e96ef1..3ea9dcc854a3f 100644 --- a/aten/src/ATen/native/cuda/AveragePool2d.cu +++ b/aten/src/ATen/native/cuda/AveragePool2d.cu @@ -250,7 +250,7 @@ TORCH_IMPL_FUNC(avg_pool2d_out_cuda) int64_t padW_, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override, + std::optional divisor_override, const Tensor& output) { TensorArg output_arg{ output, "output", 1 }; TensorArg input_arg{ input_, "input_", 2 }; @@ -362,7 +362,7 @@ TORCH_IMPL_FUNC(avg_pool2d_backward_out_cuda) ( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override, + std::optional divisor_override, const Tensor& gradInput ) { TensorArg gradInput_arg{ gradInput, "gradInput", 1 }; diff --git a/aten/src/ATen/native/cuda/AveragePool3d.cu b/aten/src/ATen/native/cuda/AveragePool3d.cu index f4b0ee00d9a9a..dabcf5b63be99 100644 --- a/aten/src/ATen/native/cuda/AveragePool3d.cu +++ b/aten/src/ATen/native/cuda/AveragePool3d.cu @@ -351,7 +351,7 @@ TORCH_IMPL_FUNC(avg_pool3d_out_cuda) ( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override, + std::optional divisor_override, const Tensor& output ) { TensorArg output_arg{ output, "output", 1 }; @@ -451,7 +451,7 @@ TORCH_IMPL_FUNC(avg_pool3d_backward_out_cuda) ( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override, + std::optional divisor_override, const Tensor& gradInput ) { // See Note [Writing Nondeterministic Operations] diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp index 9e76aad45f644..c0ed650cf0219 100644 --- a/aten/src/ATen/native/cuda/Blas.cpp +++ b/aten/src/ATen/native/cuda/Blas.cpp @@ -839,11 +839,11 @@ static bool _scaled_mm_allowed_device() { std::tuple _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, - const c10::optional& bias, - c10::optional out_dtype, - const c10::optional& scale_a, - const c10::optional& scale_b, - const c10::optional& scale_result, + const std::optional& bias, + std::optional out_dtype, + const std::optional& scale_a, + const std::optional& scale_b, + const std::optional& scale_result, bool use_fast_accum, Tensor& out, Tensor& amax) { // Check sizes @@ -1022,11 +1022,11 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, std::tuple _scaled_mm_cuda(const Tensor& mat_a, const Tensor& mat_b, - const c10::optional& bias, - c10::optional out_dtype, - const c10::optional& scale_a, - const c10::optional& scale_b, - const c10::optional& scale_result, + const std::optional& bias, + std::optional out_dtype, + const std::optional& scale_a, + const std::optional& scale_b, + const std::optional& scale_result, bool use_fast_accum) { const auto out_dtype_ = 
out_dtype.value_or(mat_a.scalar_type()); Tensor out = at::empty({0}, mat_a.options().dtype(out_dtype_)); diff --git a/aten/src/ATen/native/cuda/Bucketization.cu b/aten/src/ATen/native/cuda/Bucketization.cu index 05d5421b046f8..73a68683b6c04 100644 --- a/aten/src/ATen/native/cuda/Bucketization.cu +++ b/aten/src/ATen/native/cuda/Bucketization.cu @@ -134,8 +134,8 @@ Tensor& searchsorted_out_cuda( const Tensor& self, bool out_int32, bool right, - const c10::optional side_opt, - const c10::optional& sorter_opt, + const std::optional side_opt, + const std::optional& sorter_opt, Tensor& result) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned sorter_maybe_owned = at::borrow_from_optional_tensor(sorter_opt); @@ -180,8 +180,8 @@ Tensor& searchsorted_out_cuda( const Scalar& self, bool out_int32, bool right, - const c10::optional side_opt, - const c10::optional& sorter_opt, + const std::optional side_opt, + const std::optional& sorter_opt, Tensor& result) { const Tensor& scalar_tensor = searchsorted_scalar_tensor(self, sorted_sequence.device()); return searchsorted_out_cuda(sorted_sequence, scalar_tensor, out_int32, right, side_opt, sorter_opt, result); @@ -192,8 +192,8 @@ Tensor searchsorted_cuda( const Tensor& self, bool out_int32, bool right, - const c10::optional side_opt, - const c10::optional& sorter) { + const std::optional side_opt, + const std::optional& sorter) { ScalarType scalar_type = out_int32 ? ScalarType::Int : ScalarType::Long; c10::TensorOptions options = TensorOptions().device(self.options().device()).dtype(scalar_type); Tensor result = at::empty({0}, options, MemoryFormat::Contiguous); @@ -206,8 +206,8 @@ Tensor searchsorted_cuda( const Scalar& self, bool out_int32, bool right, - const c10::optional side_opt, - const c10::optional& sorter) { + const std::optional side_opt, + const std::optional& sorter) { const Tensor& scalar_tensor = searchsorted_scalar_tensor(self, sorted_sequence.device()); return searchsorted_cuda(sorted_sequence, scalar_tensor, out_int32, right, side_opt, sorter); } diff --git a/aten/src/ATen/native/cuda/ConvolutionMM2d.cu b/aten/src/ATen/native/cuda/ConvolutionMM2d.cu index 9e45e2693cb0f..4f6ef77eb7e05 100644 --- a/aten/src/ATen/native/cuda/ConvolutionMM2d.cu +++ b/aten/src/ATen/native/cuda/ConvolutionMM2d.cu @@ -376,7 +376,7 @@ Tensor& slow_conv2d_forward_out_cuda( const Tensor &self_, const Tensor &weight_, IntArrayRef kernel_size, - const c10::optional &bias_, + const std::optional &bias_, IntArrayRef stride, IntArrayRef padding, Tensor &output) { @@ -409,7 +409,7 @@ Tensor slow_conv2d_forward_cuda( const Tensor &self, const Tensor &weight, IntArrayRef kernel_size, - const c10::optional &bias, + const std::optional &bias, IntArrayRef stride, IntArrayRef padding) { auto output = at::empty({0}, self.options()); diff --git a/aten/src/ATen/native/cuda/DepthwiseConv2d.cu b/aten/src/ATen/native/cuda/DepthwiseConv2d.cu index 69757df220886..b87dd41dd59ef 100644 --- a/aten/src/ATen/native/cuda/DepthwiseConv2d.cu +++ b/aten/src/ATen/native/cuda/DepthwiseConv2d.cu @@ -29,9 +29,120 @@ PackedTensorAccessor32 dummy_packed_accessor32() { return {nullptr, zeros.data(), zeros.data()}; } +template +__global__ void +#if !defined(USE_ROCM) +C10_LAUNCH_BOUNDS_1(at::cuda::detail::CUDA_NUM_THREADS) +#endif +conv_depthwise2d_forward_kernel_generic( + const PackedTensorAccessor32 input, + PackedTensorAccessor32 output, + const PackedTensorAccessor32 weight, + const PackedTensorAccessor32 bias, + bool biasEnabled, + index_t totalElements, + const int 
outputChannels, + const int depthwiseMultiplier, + const int inputWidth, const int inputHeight, + const int outputWidth, const int outputHeight, + const int kernelWidth, const int kernelHeight, + const int strideWidth, const int strideHeight, + const int padWidth, const int padHeight, + const int dilationWidth, const int dilationHeight) { + using acc_t = at::acc_type; + + CUDA_KERNEL_LOOP_TYPE(linearIndex, totalElements, index_t) { + //calculate n,c,h,w indices, replacing modulos by divide and multiply add, + //result is same as would be in the code below + //const int n = linearIndex / batchStride; //batchStride = outputChannels * outputHeight * outputWidth + //const int c = (linearIndex / channelStride) % outputChannels; //channelStride = outputHeight * outputWidth + //const int h = (linearIndex / outputWidth) % outputHeight; + //const int w = linearIndex % outputWidth; + + int indtmp1 = linearIndex/outputWidth; + const int w = linearIndex - indtmp1 * outputWidth; + int indtmp2 = indtmp1/outputHeight; + const int h = indtmp1 - indtmp2 * outputHeight; + indtmp1 = indtmp2; + indtmp2 = indtmp1/outputChannels; + const int c = indtmp1 - indtmp2 * outputChannels; + const int n = indtmp2; + + int inputChannel = c; + int inputChannels = outputChannels; + if (depthwiseMultiplier !=1) { + inputChannel /= depthwiseMultiplier; + inputChannels /= depthwiseMultiplier; + } + + int weightOffset = c * kernelHeight * kernelWidth; + + // By precisely computing the filtering boundaries, we avoid repeating several + // expensive edge condition checks for every fetched item. If the input element is + // resident in L1, then the extra branches and comparisons would have been + // comparable in terms of cycles with the actual data fetch. Therefore computing + // boundaries ahead of the loop showed significant performance boost. + + int kHmin = 0, kHmax = kernelHeight, kWmin = 0, kWmax = kernelWidth; + + // Top + int h_in_min = -padHeight + h * strideHeight; + if (h_in_min < 0) { + kHmin = -h_in_min / dilationHeight; + if ((-h_in_min) % dilationHeight > 0) { + kHmin++; + } + } + + // Bottom + int h_in_max = h_in_min + (kernelHeight - 1) * dilationHeight - inputHeight + 1; + if (h_in_max >= 0) { + kHmax = kernelHeight - h_in_max / dilationHeight; + if (h_in_max % dilationHeight > 0) { + kHmax--; + } + } + + // Left + int w_in_min = -padWidth + w * strideWidth; + if (w_in_min < 0) { + kWmin = -w_in_min / dilationWidth; + if ((-w_in_min) % dilationWidth > 0) { + kWmin++; + } + } + + // Right + int w_in_max = w_in_min + (kernelWidth - 1) * dilationWidth - inputWidth + 1; + if (w_in_max >= 0) { + kWmax = kernelWidth - w_in_max / dilationWidth; + if (w_in_max % dilationWidth > 0) { + kWmax--; + } + } + + acc_t value = biasEnabled ? 
static_cast(bias.data()[c]) : acc_t(0); + const index_t offset0 = (n * inputChannels + inputChannel) * inputHeight * inputWidth; + + for (int kH = kHmin; kH < kHmax; ++kH) { + const int h_in = -padHeight + h * strideHeight + kH * dilationHeight; + for (int kW = kWmin; kW < kWmax; ++kW) { + const int w_in = -padWidth + w * strideWidth + kW * dilationWidth; + const index_t offset = offset0 + h_in * inputWidth + w_in; + value += (static_cast(weight.data()[weightOffset + kH * kernelWidth + kW]) * + static_cast(input.data()[offset])); + } + } + output.data()[linearIndex] = static_cast(value); + } +} template -__global__ void conv_depthwise2d_forward_kernel( +__global__ void +#if !defined(USE_ROCM) +C10_LAUNCH_BOUNDS_1(at::cuda::detail::CUDA_NUM_THREADS) +#endif +conv_depthwise2d_forward_kernel( const PackedTensorAccessor32 input, PackedTensorAccessor32 output, const PackedTensorAccessor32 weight, @@ -315,7 +426,13 @@ void conv_depthwise2d_forward_out( const auto bias_a = has_bias ? bias.packed_accessor32() : dummy_packed_accessor32(); - if (kW == 3 && kH == 3) { + if (kW == 5 && kH == 5) { + conv_depthwise2d_forward_kernel<5> <<>>( + input_a, output_a, weight_a, bias_a, has_bias, n, outputChannels, depthwiseMultiplier, + width, height, outputWidth, outputHeight, + kW, kH, dW, dH, padW, padH, dilationW, dilationH); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } else if (kW == 3 && kH == 3) { conv_depthwise2d_forward_kernel<3> <<>>( input_a, output_a, weight_a, bias_a, has_bias, n, outputChannels, depthwiseMultiplier, width, height, outputWidth, outputHeight, @@ -328,7 +445,7 @@ void conv_depthwise2d_forward_out( kW, kH, dW, dH, padW, padH, dilationW, dilationH); C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { - conv_depthwise2d_forward_kernel<0> <<>>( + conv_depthwise2d_forward_kernel_generic<<>>( input_a, output_a, weight_a, bias_a, has_bias, n, outputChannels, depthwiseMultiplier, width, height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH); @@ -521,7 +638,7 @@ const Tensor& conv_depthwise2d_cuda_out( const Tensor &input_, const Tensor &weight_, IntArrayRef kernel_size, - const c10::optional &bias_opt, + const std::optional &bias_opt, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, @@ -556,7 +673,7 @@ Tensor conv_depthwise2d_cuda( const Tensor &input, const Tensor &weight, IntArrayRef kernel_size, - const c10::optional &bias, + const std::optional &bias, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation) { diff --git a/aten/src/ATen/native/cuda/DepthwiseConv3d.cu b/aten/src/ATen/native/cuda/DepthwiseConv3d.cu index 991471a6ef82f..62c36d66ee40e 100644 --- a/aten/src/ATen/native/cuda/DepthwiseConv3d.cu +++ b/aten/src/ATen/native/cuda/DepthwiseConv3d.cu @@ -390,7 +390,7 @@ void conv_depthwise_shape_check( Tensor conv_depthwise3d_cuda( const Tensor& input, const Tensor& weight, - IntArrayRef kernel_size, const c10::optional& bias_opt, + IntArrayRef kernel_size, const std::optional& bias_opt, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation) { diff --git a/aten/src/ATen/native/cuda/DistributionBernoulli.cu b/aten/src/ATen/native/cuda/DistributionBernoulli.cu index 89a518267d25e..5a04ae9b3450f 100644 --- a/aten/src/ATen/native/cuda/DistributionBernoulli.cu +++ b/aten/src/ATen/native/cuda/DistributionBernoulli.cu @@ -23,12 +23,12 @@ namespace at::native { -void bernoulli_tensor_kernel(const TensorBase &self, const TensorBase &p_, c10::optional gen_) { +void bernoulli_tensor_kernel(const TensorBase &self, const TensorBase &p_, std::optional gen_) 
{ auto generator = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); at::native::templates::cuda::bernoulli_kernel(self, p_, generator); } -void bernoulli_scalar_kernel(const TensorBase &self, double p, c10::optional gen) { +void bernoulli_scalar_kernel(const TensorBase &self, double p, std::optional gen) { auto iter = TensorIterator::borrowing_nullary_op(self); auto generator = get_generator_or_default(gen, cuda::detail::getDefaultCUDAGenerator()); at::native::templates::cuda::bernoulli_kernel(iter, p, generator); diff --git a/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu b/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu index a66d3cf3288fd..e6a4629930659 100644 --- a/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu +++ b/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu @@ -5,7 +5,7 @@ namespace at::native { -void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma, c10::optional gen) { +void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma, std::optional gen) { auto generator = get_generator_or_default(gen, cuda::detail::getDefaultCUDAGenerator()); at::native::templates::cuda::cauchy_kernel(iter, median, sigma, generator); } diff --git a/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu b/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu index 76cb94f6fd878..78ee9e745d36b 100644 --- a/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu +++ b/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu @@ -5,7 +5,7 @@ namespace at::native { -void exponential_kernel(TensorIteratorBase& iter, double lambda, c10::optional gen) { +void exponential_kernel(TensorIteratorBase& iter, double lambda, std::optional gen) { auto generator = get_generator_or_default(gen, cuda::detail::getDefaultCUDAGenerator()); at::native::templates::cuda::exponential_kernel(iter, lambda, generator); } diff --git a/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu b/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu index 0fe49d7bbd4b5..783863f99a9aa 100644 --- a/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu +++ b/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu @@ -5,7 +5,7 @@ namespace at::native { -void geometric_kernel(TensorIteratorBase& iter, double p_, c10::optional gen) { +void geometric_kernel(TensorIteratorBase& iter, double p_, std::optional gen) { auto generator = get_generator_or_default(gen, cuda::detail::getDefaultCUDAGenerator()); at::native::templates::cuda::geometric_kernel(iter, p_, generator); } diff --git a/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu b/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu index f394d4fea39db..148e8e00dd99b 100644 --- a/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu +++ b/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu @@ -5,7 +5,7 @@ namespace at::native { -void log_normal_kernel(TensorIteratorBase& iter, double mean, double std, c10::optional gen) { +void log_normal_kernel(TensorIteratorBase& iter, double mean, double std, std::optional gen) { auto generator = get_generator_or_default(gen, cuda::detail::getDefaultCUDAGenerator()); at::native::templates::cuda::log_normal_kernel(iter, mean, std, generator); } diff --git a/aten/src/ATen/native/cuda/DistributionNormal.cu b/aten/src/ATen/native/cuda/DistributionNormal.cu index a17c3e3da0556..bd4763e269f89 100644 --- a/aten/src/ATen/native/cuda/DistributionNormal.cu +++ b/aten/src/ATen/native/cuda/DistributionNormal.cu @@ -5,7 +5,7 @@ namespace 
at::native { -void normal_kernel(const TensorBase &self, double mean, double std, c10::optional gen) { +void normal_kernel(const TensorBase &self, double mean, double std, std::optional gen) { auto generator = get_generator_or_default(gen, cuda::detail::getDefaultCUDAGenerator()); at::native::templates::cuda::normal_kernel(self, mean, std, generator); } diff --git a/aten/src/ATen/native/cuda/DistributionRandomKernel.cu b/aten/src/ATen/native/cuda/DistributionRandomKernel.cu index 034a19c512f4f..827a12b3f28be 100644 --- a/aten/src/ATen/native/cuda/DistributionRandomKernel.cu +++ b/aten/src/ATen/native/cuda/DistributionRandomKernel.cu @@ -5,17 +5,17 @@ namespace at::native { -void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, c10::optional gen_) { +void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, std::optional gen_) { auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); at::native::templates::cuda::random_from_to_kernel(iter, range, base, gen); } -void random_full_64_bits_range_kernel(TensorIteratorBase& iter, c10::optional gen_) { +void random_full_64_bits_range_kernel(TensorIteratorBase& iter, std::optional gen_) { auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); at::native::templates::cuda::random_full_64_bits_range_kernel(iter, gen); } -void random_kernel(TensorIteratorBase& iter, c10::optional gen_) { +void random_kernel(TensorIteratorBase& iter, std::optional gen_) { auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); at::native::templates::cuda::random_kernel(iter, gen); } diff --git a/aten/src/ATen/native/cuda/DistributionTemplates.h b/aten/src/ATen/native/cuda/DistributionTemplates.h index 8ac91f3114511..8f8860f04ad1b 100644 --- a/aten/src/ATen/native/cuda/DistributionTemplates.h +++ b/aten/src/ATen/native/cuda/DistributionTemplates.h @@ -352,10 +352,10 @@ void random_full_64_bits_range_kernel(TensorIteratorBase& iter, RNG gen) { template struct RandomFromToKernel { - void operator()(TensorIteratorBase& iter, uint64_t range, int64_t base, c10::optional gen) { + void operator()(TensorIteratorBase& iter, uint64_t range, int64_t base, std::optional gen) { random_from_to_kernel(iter, range, base, check_generator(gen)); } - void operator()(TensorIteratorBase& iter, c10::optional gen) { + void operator()(TensorIteratorBase& iter, std::optional gen) { random_full_64_bits_range_kernel(iter, check_generator(gen)); } }; @@ -448,7 +448,7 @@ void normal_kernel(const TensorBase &self, double mean_, double std_, RNG gen) { template struct NormalKernel { - void operator()(const TensorBase &self, double mean, double std, c10::optional gen) { + void operator()(const TensorBase &self, double mean, double std, std::optional gen) { normal_kernel(self, mean, std, check_generator(gen)); } }; @@ -481,7 +481,7 @@ void uniform_kernel(TensorIteratorBase& iter, double from_, double to_, RNG gen) template struct UniformKernel { - void operator()(TensorIteratorBase& iter, double from, double to, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double from, double to, std::optional gen) { uniform_kernel(iter, from, to, check_generator(gen)); } }; @@ -504,7 +504,7 @@ void log_normal_kernel(TensorIteratorBase& iter, double mean_, double std_, RNG template struct LogNormalKernel { - void operator()(TensorIteratorBase& iter, double mean, double std, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double mean, double std, 
std::optional gen) { log_normal_kernel(iter, mean, std, check_generator(gen)); } }; @@ -525,7 +525,7 @@ void geometric_kernel(TensorIteratorBase& iter, double p, RNG gen) { template struct GeometricKernel { - void operator()(TensorIteratorBase& iter, double p, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double p, std::optional gen) { geometric_kernel(iter, p, check_generator(gen)); } }; @@ -548,7 +548,7 @@ void exponential_kernel(TensorIteratorBase& iter, double lambda_, RNG gen) { template struct ExponentialKernel { - void operator()(TensorIteratorBase& iter, double lambda, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double lambda, std::optional gen) { exponential_kernel(iter, lambda, check_generator(gen)); } }; @@ -571,7 +571,7 @@ void cauchy_kernel(TensorIteratorBase& iter, double median_, double sigma_, RNG template struct CauchyKernel { - void operator()(TensorIteratorBase& iter, double median, double sigma, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double median, double sigma, std::optional gen) { cauchy_kernel(iter, median, sigma, check_generator(gen)); } }; @@ -661,10 +661,10 @@ void bernoulli_kernel(TensorIteratorBase& iter, double p, RNG gen) { template struct BernoulliKernel { - void operator()(TensorIteratorBase& iter, double p, c10::optional gen) { + void operator()(TensorIteratorBase& iter, double p, std::optional gen) { bernoulli_kernel(iter, p, check_generator(gen)); } - void operator()(const TensorBase &self, const TensorBase &p_, c10::optional gen) { + void operator()(const TensorBase &self, const TensorBase &p_, std::optional gen) { bernoulli_kernel(self, p_, check_generator(gen)); } }; diff --git a/aten/src/ATen/native/cuda/DistributionUniform.cu b/aten/src/ATen/native/cuda/DistributionUniform.cu index 2ebdfa4464598..ed34b78727dbd 100644 --- a/aten/src/ATen/native/cuda/DistributionUniform.cu +++ b/aten/src/ATen/native/cuda/DistributionUniform.cu @@ -5,7 +5,7 @@ namespace at::native { -void uniform_kernel(TensorIteratorBase& iter, double from, double to, c10::optional gen) { +void uniform_kernel(TensorIteratorBase& iter, double from, double to, std::optional gen) { auto generator = get_generator_or_default(gen, cuda::detail::getDefaultCUDAGenerator()); templates::cuda::uniform_kernel(iter, from, to, generator); } diff --git a/aten/src/ATen/native/cuda/Distributions.cpp b/aten/src/ATen/native/cuda/Distributions.cpp index c0d5abb49bf6a..21ce151276fe5 100644 --- a/aten/src/ATen/native/cuda/Distributions.cpp +++ b/aten/src/ATen/native/cuda/Distributions.cpp @@ -18,14 +18,14 @@ namespace at::native { -Tensor _s_poisson_cuda(const Tensor& lambda, c10::optional gen_) { +Tensor _s_poisson_cuda(const Tensor& lambda, std::optional gen_) { auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); Tensor ret = at::empty(lambda.sizes(), lambda.options()); launch_poisson_cuda_kernel(ret, lambda, gen); return ret; } -Tensor _s_binomial_cuda(const Tensor& count, const Tensor& prob, c10::optional gen_) { +Tensor _s_binomial_cuda(const Tensor& count, const Tensor& prob, std::optional gen_) { auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); Tensor ret = at::empty(count.sizes(), count.options()); at::TensorIterator iter = at::TensorIteratorConfig() @@ -37,14 +37,14 @@ Tensor _s_binomial_cuda(const Tensor& count, const Tensor& prob, c10::optional gen_) { +Tensor _s_gamma_cuda(const Tensor& alpha, std::optional gen_) { auto gen = get_generator_or_default(gen_, 
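All of the distribution entry points above follow the same convention: the generator argument is an optional `at::Generator`, and a missing value falls back to the device default via `get_generator_or_default(..., cuda::detail::getDefaultCUDAGenerator())`. A minimal standard-library analogue of that contract is sketched below; `default_generator` and `sample_uniform` are illustrative names, not the ATen machinery.

```cpp
#include <functional>
#include <optional>
#include <random>

std::mt19937& default_generator() {
  static std::mt19937 rng{0};  // stand-in for getDefaultCUDAGenerator()
  return rng;
}

// Mirrors get_generator_or_default(gen, default): use the caller's engine if
// one was supplied, otherwise the process-wide default.
double sample_uniform(double from, double to,
                      std::optional<std::reference_wrapper<std::mt19937>> gen
                          = std::nullopt) {
  std::uniform_real_distribution<double> dist(from, to);
  std::mt19937& rng = gen ? gen->get() : default_generator();
  return dist(rng);
}
```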
cuda::detail::getDefaultCUDAGenerator()); Tensor ret = at::empty(alpha.sizes(), alpha.options()); launch_gamma_kernel(ret, alpha, gen); return ret; } -Tensor _s_dirichlet_cuda(const Tensor& alpha, c10::optional gen_) { +Tensor _s_dirichlet_cuda(const Tensor& alpha, std::optional gen_) { auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); Tensor ret = at::empty(alpha.sizes(), alpha.options()); launch_gamma_kernel(ret, alpha, gen); diff --git a/aten/src/ATen/native/cuda/Dropout.cu b/aten/src/ATen/native/cuda/Dropout.cu index a749872ba38f3..690051e679082 100644 --- a/aten/src/ATen/native/cuda/Dropout.cu +++ b/aten/src/ATen/native/cuda/Dropout.cu @@ -366,7 +366,7 @@ dropout_cuda(CUDAGeneratorImpl* gen, const Tensor& self, double p){ } std::tuple -native_dropout_cuda(const Tensor& self, double p, c10::optional train){ +native_dropout_cuda(const Tensor& self, double p, std::optional train){ // short-cut for train == false if (train.has_value() && !train.value()) { return std::make_tuple(self.clone(), at::ones_like(self, self.options().dtype(c10::CppTypeToScalarType::value))); @@ -387,7 +387,7 @@ native_dropout_cuda(const Tensor& self, double p, c10::optional train){ // TODO: _fused_dropout_cuda is to be removed, see PR #63937 std::tuple -fused_dropout_cuda(const Tensor& self, double p, c10::optional gen_){ +fused_dropout_cuda(const Tensor& self, double p, std::optional gen_){ auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); return dropout_cuda(gen, self, p); } diff --git a/aten/src/ATen/native/cuda/EmbeddingBag.cu b/aten/src/ATen/native/cuda/EmbeddingBag.cu index 64852ae79b1f9..7c9f845b7ee26 100644 --- a/aten/src/ATen/native/cuda/EmbeddingBag.cu +++ b/aten/src/ATen/native/cuda/EmbeddingBag.cu @@ -312,7 +312,7 @@ Tensor embedding_bag_backward_cuda_max(const Tensor &grad, std::tuple _embedding_bag_forward_only_cuda(const Tensor &weight, const Tensor &indices, const Tensor &offsets, const bool scale_grad_by_freq, - const int64_t mode, bool sparse, const c10::optional& per_sample_weights_opt, + const int64_t mode, bool sparse, const std::optional& per_sample_weights_opt, bool include_last_offset, int64_t padding_idx) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned per_sample_weights_maybe_owned = at::borrow_from_optional_tensor(per_sample_weights_opt); @@ -335,7 +335,7 @@ _embedding_bag_forward_only_cuda(const Tensor &weight, const Tensor &indices, std::tuple _embedding_bag_cuda(const Tensor &weight, const Tensor &indices_, const Tensor &offsets_, const bool scale_grad_by_freq, - const int64_t mode, bool sparse, const c10::optional& per_sample_weights_opt, + const int64_t mode, bool sparse, const std::optional& per_sample_weights_opt, bool include_last_offset, int64_t padding_idx) { TORCH_CHECK(indices_.dim() == 1 || indices_.dim() == 2, "input has to be a 1D or 2D Tensor, but got Tensor of dimension ", @@ -432,7 +432,7 @@ Tensor _embedding_bag_dense_backward_cuda(const Tensor &grad_, const Tensor &ind const Tensor &bag_size_, const Tensor &max_indices, int64_t num_weights, - bool scale_grad_by_freq, int64_t mode, const c10::optional& per_sample_weights_opt, + bool scale_grad_by_freq, int64_t mode, const std::optional& per_sample_weights_opt, int64_t padding_idx) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned per_sample_weights_maybe_owned = at::borrow_from_optional_tensor(per_sample_weights_opt); diff --git a/aten/src/ATen/native/cuda/FusedAdamKernel.cu 
b/aten/src/ATen/native/cuda/FusedAdamKernel.cu index 9365f9a34ea76..99120ffc2816e 100644 --- a/aten/src/ATen/native/cuda/FusedAdamKernel.cu +++ b/aten/src/ATen/native/cuda/FusedAdamKernel.cu @@ -27,8 +27,8 @@ void _fused_adam_kernel_cuda_( const double eps, const bool amsgrad, const bool maximize, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { if (amsgrad) { TORCH_CHECK( at::native::check_fast_path_restrictions( @@ -86,8 +86,8 @@ void _fused_adam_kernel_cuda_( const double eps, const bool amsgrad, const bool maximize, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { if (lr.is_cpu()) { _fused_adam_kernel_cuda_( params, diff --git a/aten/src/ATen/native/cuda/FusedAdamWKernel.cu b/aten/src/ATen/native/cuda/FusedAdamWKernel.cu index f926199ae9680..b0fcfe23dee81 100644 --- a/aten/src/ATen/native/cuda/FusedAdamWKernel.cu +++ b/aten/src/ATen/native/cuda/FusedAdamWKernel.cu @@ -28,8 +28,8 @@ void _fused_adamw_kernel_cuda_( const double eps, const bool amsgrad, const bool maximize, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { if (amsgrad) { TORCH_CHECK( at::native::check_fast_path_restrictions( @@ -87,8 +87,8 @@ void _fused_adamw_kernel_cuda_( const double eps, const bool amsgrad, const bool maximize, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { if (lr.is_cpu()) { _fused_adamw_kernel_cuda_( params, diff --git a/aten/src/ATen/native/cuda/FusedSgdKernel.cu b/aten/src/ATen/native/cuda/FusedSgdKernel.cu index 36ac7401a2d0b..61da02ce0b888 100644 --- a/aten/src/ATen/native/cuda/FusedSgdKernel.cu +++ b/aten/src/ATen/native/cuda/FusedSgdKernel.cu @@ -157,8 +157,8 @@ void _fused_sgd_with_momentum_kernel_cuda_( const bool nesterov, const bool maximize, const bool is_first_step, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { TORCH_CHECK_GT(momentum, 0); TORCH_CHECK(at::native::check_fast_path_restrictions( {params, grads, momentum_buffer_list})); @@ -203,8 +203,8 @@ void _fused_sgd_with_momentum_kernel_cuda_( const bool nesterov, const bool maximize, const bool is_first_step, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { if (lr.is_cpu()) { _fused_sgd_with_momentum_kernel_cuda_( params, @@ -279,8 +279,8 @@ void _fused_sgd_kernel_cuda_( const bool nesterov, const bool maximize, const bool is_first_step, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { if (!momentum_buffer_list.empty()) { _fused_sgd_with_momentum_kernel_cuda_( params, @@ -343,8 +343,8 @@ void _fused_sgd_kernel_cuda_( const bool nesterov, const bool maximize, const bool is_first_step, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { if (!momentum_buffer_list.empty()) { _fused_sgd_with_momentum_kernel_cuda_( params, diff --git a/aten/src/ATen/native/cuda/IndexKernel.cpp b/aten/src/ATen/native/cuda/IndexKernel.cpp index 68770bc64e0ac..4c7ee5339afe0 100644 --- a/aten/src/ATen/native/cuda/IndexKernel.cpp 
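The fused Adam/AdamW/SGD entry points above all carry two optional tensors, `grad_scale` and `found_inf`, in support of AMP gradient scaling: when a scale is supplied the gradients are unscaled inside the kernel, and a flagged `found_inf` skips the update. The scalar sketch below illustrates that contract under those assumed semantics; `fused_sgd_step` is a standalone stand-in, not the tensor-list kernels.

```cpp
#include <optional>
#include <vector>

void fused_sgd_step(std::vector<float>& params,
                    const std::vector<float>& grads,
                    float lr,
                    std::optional<float> grad_scale,
                    std::optional<float> found_inf) {
  // If the AMP scaler reported a non-finite gradient, skip this step entirely.
  if (found_inf && *found_inf != 0.0f) {
    return;
  }
  // Gradients arrive pre-multiplied by the loss scale; divide it back out here.
  const float inv_scale = grad_scale ? 1.0f / *grad_scale : 1.0f;
  for (std::size_t i = 0; i < params.size(); ++i) {
    params[i] -= lr * grads[i] * inv_scale;
  }
}
```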
+++ b/aten/src/ATen/native/cuda/IndexKernel.cpp @@ -42,7 +42,7 @@ static Tensor & masked_select_out_cuda_impl(Tensor & result, const Tensor & self auto mask_self_expanded = expand_outplace(*mask_temp, *self_temp); at::cuda::index_out( result, *std::get<1>(mask_self_expanded), - c10::List>({*std::move(std::get<0>(mask_self_expanded))})); + c10::List>({*std::move(std::get<0>(mask_self_expanded))})); return result; } diff --git a/aten/src/ATen/native/cuda/Indexing.cu b/aten/src/ATen/native/cuda/Indexing.cu index ca37b2cefd411..b0a5d0a5a6a1b 100644 --- a/aten/src/ATen/native/cuda/Indexing.cu +++ b/aten/src/ATen/native/cuda/Indexing.cu @@ -426,7 +426,7 @@ int64_t largestIndex(const Tensor &self) { return result; } -void index_put_with_sort_kernel(Tensor & self, const c10::List>& indices, const Tensor & value, bool accumulate, bool unsafe) { +void index_put_with_sort_kernel(Tensor & self, const c10::List>& indices, const Tensor & value, bool accumulate, bool unsafe) { TORCH_CHECK(!indices.empty() || is_expandable_to(value.sizes(), self.sizes()), "shape mismatch: value tensor of shape ", value.sizes(), " cannot be broadcast to indexing result of shape ", self.sizes()); if (indices.size() > (size_t)self.dim()) { @@ -561,7 +561,7 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List>& indices, const Tensor & value, double scale, int zero_point, bool unsafe) { +void index_put_with_sort_quantized(Tensor & self, const c10::List>& indices, const Tensor & value, double scale, int zero_point, bool unsafe) { if (indices.size() > (size_t)self.dim()) { TORCH_CHECK_INDEX(false, "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); } @@ -861,7 +861,7 @@ void index_add_cuda_impl(const Tensor& self, int64_t dim, const Tensor& index, c TORCH_CHECK(index.dim() <= MAX_TENSORINFO_DIMS, "tensor has too many (>", MAX_TENSORINFO_DIMS, ") dims"); if (globalContext().deterministicAlgorithms()){ - torch::List> indices; + torch::List> indices; indices.reserve(dim + 1); for (const auto i: c10::irange(dim)) { indices.emplace_back(); diff --git a/aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp b/aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp index 045bfa8d1f90b..701669bf709e5 100644 --- a/aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp +++ b/aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp @@ -98,7 +98,7 @@ void lazy_linalg_eig_kernel(Tensor& eigenvalues, Tensor& eigenvectors, Tensor& i void lazy_svd_kernel(const Tensor& A, const bool full_matrices, const bool compute_uv, - const c10::optional& driver, + const std::optional& driver, const Tensor& U, const Tensor& S, const Tensor& Vh, diff --git a/aten/src/ATen/native/cuda/Loss.cu b/aten/src/ATen/native/cuda/Loss.cu index 1691adca87253..d87f1aa97873b 100644 --- a/aten/src/ATen/native/cuda/Loss.cu +++ b/aten/src/ATen/native/cuda/Loss.cu @@ -62,7 +62,7 @@ void binary_cross_entropy_backward_out_kernel(Tensor& grad_input, const Tensor& namespace at::native { -Tensor binary_cross_entropy_cuda(const Tensor& input, const Tensor& target, const c10::optional& weight_opt, int64_t reduction) { +Tensor binary_cross_entropy_cuda(const Tensor& input, const Tensor& target, const std::optional& weight_opt, int64_t reduction) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; @@ -72,7 +72,7 @@ Tensor binary_cross_entropy_cuda(const Tensor& input, const Tensor& target, cons input, target, weight, reduction, 
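Most operators above that accept an optional Tensor open with the boilerplate marked "[Note: hacky wrapper removal for optional tensor]": the optional is resolved once into a borrowed reference, with an undefined Tensor standing in for "not provided". The self-contained imitation below shows only the shape of that pattern; `Tensorish`, `borrow_or_undefined`, and `weighted_mean` are toy names, not `c10::MaybeOwned` or `at::borrow_from_optional_tensor`.

```cpp
#include <optional>
#include <vector>

struct Tensorish {
  std::vector<float> data;
  bool defined() const { return !data.empty(); }
};

// Resolve the optional once at the top of the operator; everything below can
// work with a plain reference and a defined()/undefined() distinction.
const Tensorish& borrow_or_undefined(const std::optional<Tensorish>& opt) {
  static const Tensorish undefined{};
  return opt ? *opt : undefined;
}

float weighted_mean(const Tensorish& input,
                    const std::optional<Tensorish>& weight_opt) {
  // Assumes weight, when defined, has the same length as input.
  const Tensorish& weight = borrow_or_undefined(weight_opt);
  float acc = 0.0f;
  for (std::size_t i = 0; i < input.data.size(); ++i) {
    acc += input.data[i] * (weight.defined() ? weight.data[i] : 1.0f);
  }
  return input.data.empty() ? 0.0f : acc / input.data.size();
}
```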
loss); } -Tensor& binary_cross_entropy_out_cuda(const Tensor& input, const Tensor& target, const c10::optional& weight_opt, int64_t reduction, Tensor& loss) { +Tensor& binary_cross_entropy_out_cuda(const Tensor& input, const Tensor& target, const std::optional& weight_opt, int64_t reduction, Tensor& loss) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; @@ -121,7 +121,7 @@ Tensor& binary_cross_entropy_out_cuda(const Tensor& input, const Tensor& target, return loss; } -Tensor binary_cross_entropy_backward_cuda(const Tensor& grad, const Tensor& input, const Tensor& target, const c10::optional& weight_opt, int64_t reduction) { +Tensor binary_cross_entropy_backward_cuda(const Tensor& grad, const Tensor& input, const Tensor& target, const std::optional& weight_opt, int64_t reduction) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; @@ -131,7 +131,7 @@ Tensor binary_cross_entropy_backward_cuda(const Tensor& grad, const Tensor& inpu grad, input, target, weight, reduction, grad_input); } -Tensor& binary_cross_entropy_backward_out_cuda(const Tensor& grad, const Tensor& input, const Tensor& target, const c10::optional& weight_opt, int64_t reduction, Tensor& grad_input) { +Tensor& binary_cross_entropy_backward_out_cuda(const Tensor& grad, const Tensor& input, const Tensor& target, const std::optional& weight_opt, int64_t reduction, Tensor& grad_input) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; diff --git a/aten/src/ATen/native/cuda/MixedDtypesLinear.cu b/aten/src/ATen/native/cuda/MixedDtypesLinear.cu index 7b55c7a952442..27563c1017fbf 100644 --- a/aten/src/ATen/native/cuda/MixedDtypesLinear.cu +++ b/aten/src/ATen/native/cuda/MixedDtypesLinear.cu @@ -196,8 +196,8 @@ mixed_dtypes_linear_dispatch_bias_activation( Tensor _mixed_dtypes_linear(const Tensor& input, const Tensor& weight, const Tensor& scale, - const c10::optional& bias_opt, - const c10::optional activation_opt) { + const std::optional& bias_opt, + const std::optional activation_opt) { #if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080) AT_ERROR("_mixed_dtypes_linear: not compiled for this platform"); return Tensor{}; diff --git a/aten/src/ATen/native/cuda/MultiMarginLoss.cu b/aten/src/ATen/native/cuda/MultiMarginLoss.cu index 989a3e116ad62..0424fcc8e3d38 100644 --- a/aten/src/ATen/native/cuda/MultiMarginLoss.cu +++ b/aten/src/ATen/native/cuda/MultiMarginLoss.cu @@ -132,7 +132,7 @@ void multi_margin_loss_shape_check( const int64_t& ndims, const Tensor& input, const Tensor& target, - const c10::optional& weight) { + const std::optional& weight) { TORCH_CHECK( (ndims == 2 && input.size(1) != 0) || (ndims == 1 && input.size(0) != 0) || ndims == 0, "Expected non-empty vector or matrix with optional 0-dim batch size, but got: ", @@ -162,7 +162,7 @@ void multi_margin_loss_shape_check( Tensor& multi_margin_loss_cuda_out( const Tensor &input_, const Tensor &target_, const Scalar &p_, const Scalar &margin_, - const c10::optional &weights_, int64_t reduction, Tensor& out_) { + const std::optional &weights_, int64_t reduction, Tensor& out_) { auto p = p_.toLong(); int64_t nframe, dim; const auto ndims = 
input_.dim(); @@ -288,7 +288,7 @@ Tensor& multi_margin_loss_cuda_out( Tensor multi_margin_loss_cuda( const Tensor &input, const Tensor &target, const Scalar &p, const Scalar &margin, - const c10::optional &weights, int64_t reduction) { + const std::optional &weights, int64_t reduction) { auto out = at::empty({0}, input.options()); multi_margin_loss_cuda_out(input, target, p, margin, weights, reduction, out); return out; @@ -296,7 +296,7 @@ Tensor multi_margin_loss_cuda( Tensor& multi_margin_loss_cuda_backward_out( const Tensor &grad_output_,const Tensor &input_, const Tensor &target_, - const Scalar &p_, const Scalar &margin_, const c10::optional &weights_, + const Scalar &p_, const Scalar &margin_, const std::optional &weights_, int64_t reduction, Tensor &grad_input_) { auto p = p_.toLong(); int64_t nframe, dim; @@ -403,7 +403,7 @@ Tensor& multi_margin_loss_cuda_backward_out( Tensor multi_margin_loss_cuda_backward( const Tensor &grad_output, const Tensor &input, const Tensor &target, - const Scalar &p, const Scalar &margin, const c10::optional &weights, + const Scalar &p, const Scalar &margin, const std::optional &weights, int64_t reduction) { auto grad_input = at::empty({0}, input.options()); multi_margin_loss_cuda_backward_out( diff --git a/aten/src/ATen/native/cuda/MultinomialKernel.cu b/aten/src/ATen/native/cuda/MultinomialKernel.cu index d8f142a813f83..3e67f5ad5bfbe 100644 --- a/aten/src/ATen/native/cuda/MultinomialKernel.cu +++ b/aten/src/ATen/native/cuda/MultinomialKernel.cu @@ -328,7 +328,7 @@ void multinomial_with_replacement_kernel_impl( Tensor& result, const Tensor& self, const int64_t n_sample, - c10::optional generator) { + std::optional generator) { auto gen = get_generator_or_default(generator, cuda::detail::getDefaultCUDAGenerator()); int inputSize = self.dim(); diff --git a/aten/src/ATen/native/cuda/NLLLoss2d.cu b/aten/src/ATen/native/cuda/NLLLoss2d.cu index 94c9aeba79f51..046ea7bbc6d7f 100644 --- a/aten/src/ATen/native/cuda/NLLLoss2d.cu +++ b/aten/src/ATen/native/cuda/NLLLoss2d.cu @@ -233,7 +233,7 @@ void nll_loss2d_forward_out_cuda_template( Tensor& total_weight, const Tensor& input, const Tensor& target, - const c10::optional& weight_opt, + const std::optional& weight_opt, int64_t reduction, int64_t ignore_index) { // See Note [Writing Nondeterministic Operations] @@ -356,7 +356,7 @@ void nll_loss2d_backward_out_cuda_template( const Tensor& grad_output, const Tensor& input, const Tensor& target, - const c10::optional& weight_opt, + const std::optional& weight_opt, int64_t reduction, int64_t ignore_index, const Tensor& total_weight) { @@ -467,7 +467,7 @@ void nll_loss2d_backward_out_cuda_template( std::tuple nll_loss2d_forward_out_cuda( const Tensor& self, const Tensor& target, - const c10::optional& weight_opt, + const std::optional& weight_opt, int64_t reduction, int64_t ignore_index, Tensor& output, @@ -480,7 +480,7 @@ std::tuple nll_loss2d_forward_out_cuda( std::tuple nll_loss2d_forward_cuda( const Tensor& self, const Tensor& target, - const c10::optional& weight_opt, + const std::optional& weight_opt, int64_t reduction, int64_t ignore_index) { auto output = at::empty({0}, self.options()); @@ -494,7 +494,7 @@ Tensor& nll_loss2d_backward_out_cuda( const Tensor& grad_output, const Tensor& self, const Tensor& target, - const c10::optional& weight_opt, + const std::optional& weight_opt, int64_t reduction, int64_t ignore_index, const Tensor& total_weight, @@ -515,7 +515,7 @@ Tensor nll_loss2d_backward_cuda( const Tensor& grad_output, const Tensor& self, const Tensor& 
target, - const c10::optional& weight_opt, + const std::optional& weight_opt, int64_t reduction, int64_t ignore_index, const Tensor& total_weight) { diff --git a/aten/src/ATen/native/cuda/NaiveConvolutionTranspose3d.cu b/aten/src/ATen/native/cuda/NaiveConvolutionTranspose3d.cu index fd6e83aa24171..56b762a051fbf 100644 --- a/aten/src/ATen/native/cuda/NaiveConvolutionTranspose3d.cu +++ b/aten/src/ATen/native/cuda/NaiveConvolutionTranspose3d.cu @@ -835,7 +835,7 @@ void slow_conv_transpose3d_acc_grad_parameters_cuda( Tensor& slow_conv_transpose3d_out_cuda(const Tensor& input, const Tensor& weight, - IntArrayRef kernel_size, const c10::optional& bias_opt, + IntArrayRef kernel_size, const std::optional& bias_opt, IntArrayRef stride, IntArrayRef padding, IntArrayRef output_padding, @@ -862,7 +862,7 @@ Tensor& slow_conv_transpose3d_out_cuda(const Tensor& input, Tensor slow_conv_transpose3d_cuda( const Tensor& input, const Tensor& weight, - IntArrayRef kernel_size, const c10::optional& bias_opt, + IntArrayRef kernel_size, const std::optional& bias_opt, IntArrayRef stride, IntArrayRef padding, IntArrayRef output_padding, diff --git a/aten/src/ATen/native/cuda/NaiveDilatedConvolution.cu b/aten/src/ATen/native/cuda/NaiveDilatedConvolution.cu index e62e959fdf4a0..cd969fa9405bb 100644 --- a/aten/src/ATen/native/cuda/NaiveDilatedConvolution.cu +++ b/aten/src/ATen/native/cuda/NaiveDilatedConvolution.cu @@ -399,7 +399,7 @@ void slow_conv_dilated_all_cuda_template( Tensor slow_conv_dilated2d_cuda( const Tensor& input, const Tensor& weight, - IntArrayRef kernel_size, const c10::optional& bias_opt, + IntArrayRef kernel_size, const std::optional& bias_opt, IntArrayRef stride_size, IntArrayRef pad_size, IntArrayRef dilation_size) { @@ -505,7 +505,7 @@ std::tuple slow_conv_dilated2d_backward_cuda( Tensor slow_conv_dilated3d_cuda( const Tensor& input, const Tensor& weight, - IntArrayRef kernel_size, const c10::optional& bias_opt, + IntArrayRef kernel_size, const std::optional& bias_opt, IntArrayRef stride_size, IntArrayRef pad_size, IntArrayRef dilation_size) { diff --git a/aten/src/ATen/native/cuda/Normalization.cu b/aten/src/ATen/native/cuda/Normalization.cu index ce0a50daae145..2bfaf13390858 100644 --- a/aten/src/ATen/native/cuda/Normalization.cu +++ b/aten/src/ATen/native/cuda/Normalization.cu @@ -95,8 +95,8 @@ inline Impl batch_norm_choose_impl(const Tensor& in1, const Tensor& in2) { } void batch_norm_elementwise( - const Tensor& out, const Tensor& self, const c10::optional& weight_opt, - const c10::optional& bias_opt, const Tensor& mean_, const Tensor& invstd_) { + const Tensor& out, const Tensor& self, const std::optional& weight_opt, + const std::optional& bias_opt, const Tensor& mean_, const Tensor& invstd_) { switch (batch_norm_choose_impl(self)) { case Impl::Contiguous: { c10::MaybeOwned weight = at::borrow_from_optional_tensor(weight_opt); @@ -432,7 +432,7 @@ void batch_norm_calc_invstd(const Tensor& out_invstd, const Tensor& running_var, } } -std::tuple batch_norm_cuda_out(const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, bool train, double momentum, double epsilon, Tensor& output, Tensor& save_mean, Tensor& save_invstd) { +std::tuple batch_norm_cuda_out(const Tensor& self, const std::optional& weight_opt, const c10::optional& bias_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, bool train, double momentum, double epsilon, Tensor& output, Tensor& save_mean, 
Tensor& save_invstd) { const bool has_running_mean = (running_mean_opt.has_value() && running_mean_opt->defined()); const bool has_running_var = (running_var_opt.has_value() && running_var_opt->defined()); TORCH_CHECK(has_running_mean == has_running_var); @@ -458,7 +458,7 @@ std::tuple batch_norm_cuda_out(const Tensor& self, co return std::tuple(output, save_mean, save_invstd); } -std::tuple batch_norm_cuda(const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, bool train, double momentum, double epsilon) { +std::tuple batch_norm_cuda(const Tensor& self, const std::optional& weight_opt, const c10::optional& bias_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, bool train, double momentum, double epsilon) { auto output = at::empty_like(self); int64_t n_input = self.size(1); auto options = self.options().dtype( @@ -482,7 +482,7 @@ std::tuple batch_norm_cuda(const Tensor& self, const c10 } std::tuple _batch_norm_with_update_cuda( - const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, + const Tensor& input, const std::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, double momentum, double eps) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); @@ -507,7 +507,7 @@ std::tuple _batch_norm_with_update_cuda( } std::tuple _batch_norm_with_update_cuda_out( - const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, + const Tensor& input, const std::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, double momentum, double eps, Tensor& out, Tensor& save_mean, Tensor& save_var, Tensor& reserve) { // See [Note: hacky wrapper removal for optional tensor] @@ -529,26 +529,26 @@ std::tuple _batch_norm_with_update_cuda_out( return std::tuple(out, save_mean, save_var, reserve); } -std::tuple _batch_norm_legit_cuda(const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, bool train, double momentum, double epsilon) { +std::tuple _batch_norm_legit_cuda(const Tensor& self, const std::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, bool train, double momentum, double epsilon) { return batch_norm_cuda(self, weight_opt, bias_opt, running_mean, running_var, train, momentum, epsilon); } -std::tuple _batch_norm_legit_no_stats_cuda(const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, bool train, double momentum, double epsilon) { +std::tuple _batch_norm_legit_no_stats_cuda(const Tensor& self, const std::optional& weight_opt, const c10::optional& bias_opt, bool train, double momentum, double epsilon) { return batch_norm_cuda(self, weight_opt, bias_opt, Tensor(), Tensor(), train, momentum, epsilon); } -std::tuple _batch_norm_legit_cuda_out(const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, bool train, double momentum, double epsilon, Tensor& output, Tensor& save_mean, Tensor& save_invstd) { +std::tuple _batch_norm_legit_cuda_out(const Tensor& self, const std::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, bool train, double momentum, double epsilon, Tensor& output, Tensor& 
save_mean, Tensor& save_invstd) { return batch_norm_cuda_out(self, weight_opt, bias_opt, running_mean, running_var, train, momentum, epsilon, output, save_mean, save_invstd); } -std::tuple _batch_norm_legit_no_stats_cuda_out(const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, bool train, double momentum, double epsilon, Tensor& output, Tensor& save_mean, Tensor& save_invstd) { +std::tuple _batch_norm_legit_no_stats_cuda_out(const Tensor& self, const std::optional& weight_opt, const c10::optional& bias_opt, bool train, double momentum, double epsilon, Tensor& output, Tensor& save_mean, Tensor& save_invstd) { return batch_norm_cuda_out(self, weight_opt, bias_opt, Tensor(), Tensor(), train, momentum, epsilon, output, save_mean, save_invstd); } std::tuple _new_batch_norm_backward_cuda( const Tensor& grad_output, const Tensor& input, const Tensor& weight, - const c10::optional& running_mean_opt, const c10::optional& running_var_opt, - const c10::optional& save_mean_opt, const c10::optional& save_var_opt, + const std::optional& running_mean_opt, const c10::optional& running_var_opt, + const std::optional& save_mean_opt, const c10::optional& save_var_opt, bool update, double eps, std::array grad_input_mask, const Tensor& reserve) { const Tensor& dummy_bias = at::empty(1); const Tensor& running_mean = c10::value_or_else(running_mean_opt, [] {return Tensor();}); @@ -567,7 +567,7 @@ std::tuple _new_batch_norm_backward_cuda( } } -std::tuple batch_norm_backward_cuda(const Tensor& grad_out, const Tensor& input, const c10::optional& weight_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, const c10::optional& save_mean_opt, const c10::optional& save_invstd_opt, bool train, double epsilon, std::array grad_input_mask) { +std::tuple batch_norm_backward_cuda(const Tensor& grad_out, const Tensor& input, const std::optional& weight_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, const c10::optional& save_mean_opt, const c10::optional& save_invstd_opt, bool train, double epsilon, std::array grad_input_mask) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight = at::borrow_from_optional_tensor(weight_opt); c10::MaybeOwned save_mean = at::borrow_from_optional_tensor(save_mean_opt); @@ -673,8 +673,8 @@ std::tuple batch_norm_stats_cuda(const Tensor& self, double epsi } Tensor batch_norm_elemt_cuda( - const Tensor& self, const c10::optional& weight_opt, - const c10::optional& bias_opt, const Tensor& mean, + const Tensor& self, const std::optional& weight_opt, + const std::optional& bias_opt, const Tensor& mean, const Tensor& invstd, double epsilon) { auto output = at::empty_like(self); // FIXME: Epsilon parameter isn't required, we don't take the reciprocal @@ -682,7 +682,7 @@ Tensor batch_norm_elemt_cuda( return output; } -Tensor& batch_norm_elemt_cuda_out(const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, +Tensor& batch_norm_elemt_cuda_out(const Tensor& self, const std::optional& weight_opt, const c10::optional& bias_opt, const Tensor& mean, const Tensor& invstd, double epsilon, Tensor& output) { // FIXME: Epsilon parameter isn't required, we don't take the reciprocal batch_norm_elementwise(output, self, weight_opt, bias_opt, mean, invstd); @@ -690,7 +690,7 @@ Tensor& batch_norm_elemt_cuda_out(const Tensor& self, const c10::optional batch_norm_gather_stats_cuda(const Tensor& self, const Tensor& mean, const Tensor& invstd, const c10::optional& 
running_mean_opt, const c10::optional& running_var_opt, double momentum, double epsilon, int64_t count) { +std::tuple batch_norm_gather_stats_cuda(const Tensor& self, const Tensor& mean, const Tensor& invstd, const std::optional& running_mean_opt, const c10::optional& running_var_opt, double momentum, double epsilon, int64_t count) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned running_mean_maybe_owned = at::borrow_from_optional_tensor(running_mean_opt); const Tensor& running_mean = *running_mean_maybe_owned; @@ -704,7 +704,7 @@ std::tuple batch_norm_gather_stats_cuda(const Tensor& self, cons std::tuple batch_norm_gather_stats_with_counts_cuda( - const Tensor& self, const Tensor& mean, const Tensor& invstd, const c10::optional& running_mean_opt /* optional */, const c10::optional& running_var_opt /* optional */, double momentum, double epsilon, const Tensor& counts) { + const Tensor& self, const Tensor& mean, const Tensor& invstd, const std::optional& running_mean_opt /* optional */, const c10::optional& running_var_opt /* optional */, double momentum, double epsilon, const Tensor& counts) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned running_mean_maybe_owned = at::borrow_from_optional_tensor(running_mean_opt); const Tensor& running_mean = *running_mean_maybe_owned; @@ -722,7 +722,7 @@ std::tuple batch_norm_gather_stats_with_counts_cuda( }); } -std::tuple batch_norm_backward_reduce_cuda(const Tensor& grad_output, const Tensor& input, const Tensor& mean, const Tensor& invstd, const c10::optional& weight_opt, bool input_g, bool weight_g, bool bias_g) { +std::tuple batch_norm_backward_reduce_cuda(const Tensor& grad_output, const Tensor& input, const Tensor& mean, const Tensor& invstd, const std::optional& weight_opt, bool input_g, bool weight_g, bool bias_g) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; @@ -759,7 +759,7 @@ std::tuple batch_norm_backward_reduce_cuda(const }); } -Tensor batch_norm_backward_elemt_cuda(const Tensor& self, const Tensor& input, const Tensor& mean, const Tensor& invstd, const c10::optional& weight_opt, const Tensor& sum_dy, const Tensor& sum_dy_xmu, const Tensor& count) { +Tensor batch_norm_backward_elemt_cuda(const Tensor& self, const Tensor& input, const Tensor& mean, const Tensor& invstd, const std::optional& weight_opt, const Tensor& sum_dy, const Tensor& sum_dy_xmu, const Tensor& count) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; @@ -794,8 +794,8 @@ Tensor batch_norm_backward_elemt_cuda(const Tensor& self, const Tensor& input, c } std::tuple batch_norm_update_stats_cuda( - const Tensor& self, const c10::optional& running_mean_opt, - const c10::optional& running_var_opt, double momentum) { + const Tensor& self, const std::optional& running_mean_opt, + const std::optional& running_var_opt, double momentum) { c10::MaybeOwned running_mean = at::borrow_from_optional_tensor(running_mean_opt); c10::MaybeOwned running_var = at::borrow_from_optional_tensor(running_var_opt); diff --git a/aten/src/ATen/native/cuda/RNN.cu b/aten/src/ATen/native/cuda/RNN.cu index a997777fe0c3a..c448ba592e4af 100644 --- a/aten/src/ATen/native/cuda/RNN.cu +++ b/aten/src/ATen/native/cuda/RNN.cu @@ -516,7 +516,7 @@ void gru_backward_impl(const Tensor& grad_hy, 
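`_new_batch_norm_backward_cuda` above resolves its optional running statistics with `c10::value_or_else(opt, [] { return Tensor(); })` rather than `value_or`. The point of passing a callable is laziness: `std::optional::value_or` evaluates its argument even when the optional is engaged, while the callable is invoked only for the empty case. A generic stand-in for that helper is sketched below; it shows the idea, not c10's declaration.

```cpp
#include <optional>
#include <utility>

template <typename T, typename F>
T value_or_else(const std::optional<T>& opt, F&& make_default) {
  // The default is constructed only when the optional is empty.
  return opt.has_value() ? *opt : std::forward<F>(make_default)();
}

// Usage: an empty optional falls back to a freshly built default.
// int x = value_or_else(std::optional<int>{}, [] { return 42; });  // x == 42
```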
const Tensor& workspace, std::tuple _thnn_fused_lstm_cell_cuda( const Tensor& input_gates, const Tensor& hidden_gates, - const Tensor& cx, const c10::optional& input_bias_opt, const c10::optional& hidden_bias_opt) { + const Tensor& cx, const std::optional& input_bias_opt, const c10::optional& hidden_bias_opt) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned input_bias_maybe_owned = at::borrow_from_optional_tensor(input_bias_opt); const Tensor& input_bias = *input_bias_maybe_owned; @@ -564,7 +564,7 @@ void checkLSTMBackwardSizes(const TensorArg& grad_hy, const TensorArg& grad_cy, checkNumel(c, workspace, exp_size[0] * exp_size[1] * 4); } -std::tuple _thnn_fused_lstm_cell_backward_impl_cuda( const c10::optional& grad_hy_opt, const c10::optional& grad_cy_opt, +std::tuple _thnn_fused_lstm_cell_backward_impl_cuda( const std::optional& grad_hy_opt, const c10::optional& grad_cy_opt, const Tensor& cx, const Tensor& cy, const Tensor& workspace, bool has_bias) { // See [Note: hacky wrapper removal for optional tensor] @@ -602,7 +602,7 @@ static constexpr int64_t GRU_WORKSPACE_MULTIPLIER = 5; std::tuple _thnn_fused_gru_cell_cuda( const Tensor& input_gates, const Tensor& hidden_gates, - const Tensor& hx, const c10::optional& input_bias_opt, const c10::optional& hidden_bias_opt) { + const Tensor& hx, const std::optional& input_bias_opt, const c10::optional& hidden_bias_opt) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned input_bias_maybe_owned = at::borrow_from_optional_tensor(input_bias_opt); const Tensor& input_bias = *input_bias_maybe_owned; diff --git a/aten/src/ATen/native/cuda/Randperm.cu b/aten/src/ATen/native/cuda/Randperm.cu index c22c99dfe6a71..bde5457e8cdd8 100644 --- a/aten/src/ATen/native/cuda/Randperm.cu +++ b/aten/src/ATen/native/cuda/Randperm.cu @@ -55,7 +55,7 @@ namespace { template struct alignas(N) OpaqueType { char data[N]; }; } -Tensor& randperm_out_cuda(int64_t n, c10::optional generator, Tensor& result) { +Tensor& randperm_out_cuda(int64_t n, std::optional generator, Tensor& result) { TORCH_CHECK(n >= 0, "n must be non-negative, got", n); check_supported_max_int_with_precision(n, result); diff --git a/aten/src/ATen/native/cuda/Repeat.cu b/aten/src/ATen/native/cuda/Repeat.cu index 0a39a0445dbe2..57a879d6f61ac 100644 --- a/aten/src/ATen/native/cuda/Repeat.cu +++ b/aten/src/ATen/native/cuda/Repeat.cu @@ -54,7 +54,7 @@ namespace at::native { Tensor repeat_interleave_cuda( const Tensor& repeat, - c10::optional output_size) { + std::optional output_size) { Tensor output; AT_DISPATCH_INDEX_TYPES( repeat.scalar_type(), "repeat_interleave_cuda", [&]() { diff --git a/aten/src/ATen/native/cuda/Resize.cpp b/aten/src/ATen/native/cuda/Resize.cpp index 2bf6266d678b9..fe844f55d2333 100644 --- a/aten/src/ATen/native/cuda/Resize.cpp +++ b/aten/src/ATen/native/cuda/Resize.cpp @@ -49,7 +49,7 @@ void resize_bytes_cuda(StorageImpl* storage, size_t size_bytes) { const Tensor& resize_cuda_( const Tensor& self, IntArrayRef size, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { if (self.has_names()) { return resize_named_tensor_(self, size, optional_memory_format); } diff --git a/aten/src/ATen/native/cuda/RreluWithNoise.cu b/aten/src/ATen/native/cuda/RreluWithNoise.cu index 463a5ce00c813..7133a4920c327 100644 --- a/aten/src/ATen/native/cuda/RreluWithNoise.cu +++ b/aten/src/ATen/native/cuda/RreluWithNoise.cu @@ -74,7 +74,7 @@ inline void _rrelu_with_noise_cuda_train( const Tensor& noise_, const Scalar& lower_, const 
Scalar& upper_, - c10::optional generator) { + std::optional generator) { auto input = input_.contiguous(); auto noise = noise_.contiguous(); Tensor tmp_output = output.contiguous(); @@ -142,7 +142,7 @@ Tensor& rrelu_with_noise_out_cuda(const Tensor& self, const Scalar& lower, const Scalar& upper, bool training, - c10::optional generator, + std::optional generator, Tensor& output) { at::native::resize_output(output, self.sizes()); @@ -176,7 +176,7 @@ Tensor rrelu_with_noise_cuda( const Scalar& lower, const Scalar& upper, bool training, - c10::optional generator) { + std::optional generator) { Tensor output = at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT); return at::native::rrelu_with_noise_out_cuda(self, noise, lower, upper, training, generator, output); } @@ -187,7 +187,7 @@ Tensor& rrelu_with_noise_cuda_( const Scalar& lower, const Scalar& upper, bool training, - c10::optional generator) { + std::optional generator) { return at::native::rrelu_with_noise_out_cuda( self, noise, lower, upper, training, generator, self); } diff --git a/aten/src/ATen/native/cuda/SegmentReduce.cu b/aten/src/ATen/native/cuda/SegmentReduce.cu index d4af81db771d3..cbdbb020d634a 100644 --- a/aten/src/ATen/native/cuda/SegmentReduce.cu +++ b/aten/src/ATen/native/cuda/SegmentReduce.cu @@ -266,7 +266,7 @@ Tensor _segment_reduce_lengths_offsets_backward_cuda_kernel( ReductionType reduction, const Tensor& lengths_or_offsets_contig, int64_t axis, - const c10::optional& initial, + const std::optional& initial, bool is_offsets_like) { axis = lengths_or_offsets_contig.dim() - 1; int64_t segment_count = is_offsets_like ? @@ -368,7 +368,7 @@ Tensor _segment_reduce_lengths_backward_cuda_kernel( ReductionType reduction, const Tensor& lengths_contig, int64_t axis, - const c10::optional& initial) { + const std::optional& initial) { return _segment_reduce_lengths_offsets_backward_cuda_kernel( grad_contig, output_contig, data_contig, reduction, lengths_contig, axis, initial, /*is_offsets_like=*/false); } @@ -380,7 +380,7 @@ Tensor _segment_reduce_offsets_backward_cuda_kernel( ReductionType reduction, const Tensor& offsets_contig, int64_t axis, - const c10::optional& initial) { + const std::optional& initial) { return _segment_reduce_lengths_offsets_backward_cuda_kernel( grad_contig, output_contig, data_contig, reduction, offsets_contig, axis, initial, /*is_offsets_like=*/true); } @@ -390,7 +390,7 @@ Tensor _segment_reduce_lengths_offsets_cuda_kernel( const Tensor& data, const Tensor& lengths_or_offsets, int64_t axis, - const c10::optional& initial, + const std::optional& initial, bool is_offsets_like) { // data and lengths_or_offsets should be contiguous from the call to .contiguous in segment_reduce_kernel TORCH_CHECK(data.is_contiguous()); @@ -575,7 +575,7 @@ Tensor _segment_reduce_lengths_cuda_kernel( const Tensor& data, const Tensor& lengths, int64_t axis, - const c10::optional& initial) { + const std::optional& initial) { return _segment_reduce_lengths_offsets_cuda_kernel( reduction, data, lengths, axis, initial, /*is_offsets_like=*/false); } @@ -585,7 +585,7 @@ Tensor _segment_reduce_offsets_cuda_kernel( const Tensor& data, const Tensor& offsets, int64_t axis, - const c10::optional& initial) { + const std::optional& initial) { return _segment_reduce_lengths_offsets_cuda_kernel( reduction, data, offsets, axis, initial, /*is_offsets_like=*/true); } diff --git a/aten/src/ATen/native/cuda/SoftMax.cu b/aten/src/ATen/native/cuda/SoftMax.cu index cffd52624f9e3..97528b48d8cb0 100644 --- a/aten/src/ATen/native/cuda/SoftMax.cu 
+++ b/aten/src/ATen/native/cuda/SoftMax.cu @@ -1113,7 +1113,7 @@ TORCH_IMPL_FUNC(softmax_backward_cuda_out) host_softmax_backward(tmp, output, dim, half_to_float, grad_input); } -Tensor masked_softmax_cuda(const Tensor& input_, const Tensor& mask_, const c10::optional dim_, const c10::optional mask_type_) { +Tensor masked_softmax_cuda(const Tensor& input_, const Tensor& mask_, const std::optional dim_, const c10::optional mask_type_) { Tensor output = at::empty_like(input_, input_.options()); TORCH_CHECK(mask_.scalar_type() == ScalarType::Bool, "Mask should be a boolean tensor"); @@ -1211,7 +1211,7 @@ Tensor masked_softmax_backward_cuda( const Tensor& grad_, const Tensor& output_, const Tensor& mask_, - const c10::optional dim_) { + const std::optional dim_) { Tensor grad_input = at::empty_like(grad_, grad_.options()); if (grad_.numel() == 0) { return grad_input; diff --git a/aten/src/ATen/native/cuda/SparseBinaryOpIntersectionKernel.cu b/aten/src/ATen/native/cuda/SparseBinaryOpIntersectionKernel.cu index 62282659f6e8b..2cd1dd893a447 100644 --- a/aten/src/ATen/native/cuda/SparseBinaryOpIntersectionKernel.cu +++ b/aten/src/ATen/native/cuda/SparseBinaryOpIntersectionKernel.cu @@ -167,7 +167,7 @@ struct CUDAValueSelectionIntersectionKernel { } }; -using OptTensor = c10::optional; +using OptTensor = std::optional; void mul_sparse_sparse_out_cuda_kernel( Tensor& result, diff --git a/aten/src/ATen/native/cuda/SpectralOps.cpp b/aten/src/ATen/native/cuda/SpectralOps.cpp index 1032fb28d799c..5d93797c5bd21 100644 --- a/aten/src/ATen/native/cuda/SpectralOps.cpp +++ b/aten/src/ATen/native/cuda/SpectralOps.cpp @@ -218,7 +218,7 @@ static const Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_ CuFFTParams Params(input.strides(), out.strides(), signal_size, fft_type, value_type); CuFFTParamsLRUCache& plan_cache = cufft_get_plan_cache(input.device().index()); std::unique_lock guard(plan_cache.mutex, std::defer_lock); - c10::optional uncached_plan; + std::optional uncached_plan; const CuFFTConfig * config = nullptr; // Workaround for gh-63152, gh-58724 diff --git a/aten/src/ATen/native/cuda/SummaryOps.cu b/aten/src/ATen/native/cuda/SummaryOps.cu index f2626ccff4db7..30adb0b3e5c1a 100644 --- a/aten/src/ATen/native/cuda/SummaryOps.cu +++ b/aten/src/ATen/native/cuda/SummaryOps.cu @@ -360,7 +360,7 @@ Tensor _histc_cuda_template( namespace native { Tensor _bincount_cuda( - const Tensor& self, const c10::optional& weights_opt, + const Tensor& self, const std::optional& weights_opt, int64_t minlength) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weights_maybe_owned = at::borrow_from_optional_tensor(weights_opt); diff --git a/aten/src/ATen/native/cuda/TensorFactories.cu b/aten/src/ATen/native/cuda/TensorFactories.cu index 42ea83a4b8bf0..87daceacdfba0 100644 --- a/aten/src/ATen/native/cuda/TensorFactories.cu +++ b/aten/src/ATen/native/cuda/TensorFactories.cu @@ -51,7 +51,7 @@ Tensor& eye_out_cuda(int64_t n, int64_t m, Tensor& result) { return result; } -Tensor empty_cuda(IntArrayRef size, c10::optional dtype_opt, c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt, c10::optional memory_format_opt) { +Tensor empty_cuda(IntArrayRef size, std::optional dtype_opt, c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt, c10::optional memory_format_opt) { Tensor result = at::detail::empty_cuda(size, dtype_opt, layout_opt, device_opt, pin_memory_opt, memory_format_opt); // See Note [Enabling Deterministic Operations] if 
(C10_UNLIKELY(at::globalContext().deterministicAlgorithms() && at::globalContext().deterministicFillUninitializedMemory())) { @@ -61,10 +61,10 @@ Tensor empty_cuda(IntArrayRef size, c10::optional dtype_opt, c10::op } Tensor _efficientzerotensor_cuda(IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { auto device_ = device_or_default(device); if (!device_.has_index()) { device_.set_index(at::cuda::current_device()); @@ -77,7 +77,7 @@ Tensor _efficientzerotensor_cuda(IntArrayRef size, } -Tensor empty_strided_cuda(IntArrayRef size, IntArrayRef stride, c10::optional dtype_opt, c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) { +Tensor empty_strided_cuda(IntArrayRef size, IntArrayRef stride, std::optional dtype_opt, c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) { Tensor result = at::detail::empty_strided_cuda(size, stride, dtype_opt, layout_opt, device_opt, pin_memory_opt); // See Note [Enabling Deterministic Operations] if (C10_UNLIKELY(at::globalContext().deterministicAlgorithms() && at::globalContext().deterministicFillUninitializedMemory())) { @@ -274,8 +274,8 @@ void tril_indices_kernel(scalar_t * tensor, // implementation, please enable them in test/test_cuda.py and make sure they // pass on your local server. Tensor tril_indices_cuda( - int64_t row, int64_t col, int64_t offset, c10::optional dtype_opt, - c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) { + int64_t row, int64_t col, int64_t offset, std::optional dtype_opt, + std::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) { check_args(row, col, layout_opt); auto tril_size = get_tril_size(row, col, offset); @@ -350,8 +350,8 @@ void triu_indices_kernel(scalar_t * tensor, // implementation, please enable them in test/test_cuda.py and make sure they // pass on your local server. 
Tensor triu_indices_cuda( - int64_t row, int64_t col, int64_t offset, c10::optional dtype_opt, - c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) { + int64_t row, int64_t col, int64_t offset, std::optional dtype_opt, + std::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) { check_args(row, col, layout_opt); auto triu_size = row * col - get_tril_size(row, col, offset - 1); diff --git a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu index 451c15443fa8e..1dd47c93fae94 100644 --- a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu +++ b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu @@ -221,9 +221,9 @@ C10_HOST_DEVICE static inline scalar_t _nan_to_num_replace(scalar_t a, scalar_t void nan_to_num_kernel_cuda( TensorIteratorBase& iter, - c10::optional nan, - c10::optional pos_inf, - c10::optional neg_inf) { + std::optional nan, + std::optional pos_inf, + std::optional neg_inf) { if (isComplexType(iter.dtype())) { AT_DISPATCH_COMPLEX_TYPES(iter.dtype(), "nan_to_num", [&]() { using value_t = scalar_t::value_type; diff --git a/aten/src/ATen/native/cuda/Unique.cu b/aten/src/ATen/native/cuda/Unique.cu index e2654be0135f8..39e80e0a68c3c 100644 --- a/aten/src/ATen/native/cuda/Unique.cu +++ b/aten/src/ATen/native/cuda/Unique.cu @@ -218,7 +218,7 @@ unique_dim_consecutive_cuda(const Tensor& self, const int64_t dim, const bool re } std::tuple -unique_consecutive_cuda(const Tensor& self, const bool return_inverse, const bool return_counts, c10::optional dim) { +unique_consecutive_cuda(const Tensor& self, const bool return_inverse, const bool return_counts, std::optional dim) { if (!dim.has_value()) { return AT_DISPATCH_V2(self.scalar_type(), "unique", AT_WRAP([&] { // The current CUDA implementation of unique always sort due to the diff --git a/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu b/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu index 6673fe4993f39..31cdf0a5688b7 100644 --- a/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu +++ b/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu @@ -170,8 +170,8 @@ static void upsample_bicubic2d_out_cuda_template( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { TensorArg input_arg{input, "input", 1}, output_arg{output, "output", 2}; checkAllSameGPU(__func__, {input_arg, output_arg}); @@ -225,8 +225,8 @@ static void upsample_bicubic2d_backward_out_cuda_template( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { TensorArg grad_input_arg{grad_input, "grad_input", 1}, grad_output_arg{grad_output_, "grad_output_", 2}; checkAllSameGPU(__func__, {grad_output_arg, grad_input_arg}); @@ -275,8 +275,8 @@ TORCH_IMPL_FUNC(upsample_bicubic2d_out_cuda) ( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& output) { upsample_bicubic2d_out_cuda_template(output, input, output_size, align_corners, scales_h, scales_w); } @@ -286,8 +286,8 @@ TORCH_IMPL_FUNC(upsample_bicubic2d_backward_out_cuda) ( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input) { // See Note [Writing 
Nondeterministic Operations] // Nondeterministic because of atomicAdd usage diff --git a/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu b/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu index 3c80cb7877a5c..4bd230ab8fe76 100644 --- a/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu +++ b/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu @@ -264,8 +264,8 @@ static void upsample_bilinear2d_out_cuda_template( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { TensorArg input_arg{input, "input", 1}, output_arg{output, "output", 2}; checkAllSameGPU(__func__, {input_arg, output_arg}); @@ -362,8 +362,8 @@ static void upsample_bilinear2d_backward_out_cuda_template( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { TensorArg grad_input_arg{grad_input, "grad_input", 1}, grad_output_arg{grad_output_, "grad_output_", 2}; checkAllSameGPU(__func__, {grad_output_arg, grad_input_arg}); @@ -674,8 +674,8 @@ static void upsample_gen2d_aa_out_cuda_template( const Tensor& input_, IntArrayRef output_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { TensorArg input_arg{input_, "input_", 1}, output_arg{output, "output", 2}; checkAllSameGPU("upsample_gen2d_aa_out_cuda", {input_arg, output_arg}); @@ -769,8 +769,8 @@ static void upsample_gen2d_aa_backward_out_cuda_template( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { // Inspired from UpSampleBicubic2d.cu::upsample_bicubic2d_backward_out_cuda_template TensorArg grad_input_arg{grad_input, "grad_input", 1}, @@ -844,8 +844,8 @@ TORCH_IMPL_FUNC(upsample_bilinear2d_out_cuda) ( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& output) { upsample_bilinear2d_out_cuda_template(output, input, output_size, align_corners, scales_h, scales_w); } @@ -855,8 +855,8 @@ TORCH_IMPL_FUNC(upsample_bilinear2d_backward_out_cuda) ( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input) { // See Note [Writing Nondeterministic Operations] // Nondeterministic because of atomicAdd usage @@ -869,8 +869,8 @@ TORCH_IMPL_FUNC(_upsample_bilinear2d_aa_out_cuda) ( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& output) { upsample_gen2d_aa_out_cuda_template( @@ -882,8 +882,8 @@ TORCH_IMPL_FUNC(_upsample_bilinear2d_aa_backward_out_cuda) ( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input) { // See Note [Writing Nondeterministic Operations] // Nondeterministic because of atomicAdd usage @@ -898,8 +898,8 @@ TORCH_IMPL_FUNC(_upsample_bicubic2d_aa_out_cuda) ( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, 
+ std::optional scales_w, const Tensor& output) { upsample_gen2d_aa_out_cuda_template( output, input, output_size, align_corners, scales_h, scales_w); @@ -910,8 +910,8 @@ TORCH_IMPL_FUNC(_upsample_bicubic2d_aa_backward_out_cuda) ( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input) { // See Note [Writing Nondeterministic Operations] // Nondeterministic because of atomicAdd usage diff --git a/aten/src/ATen/native/cuda/UpSampleLinear1d.cu b/aten/src/ATen/native/cuda/UpSampleLinear1d.cu index dfba2f5479071..ebd11e234d7b3 100644 --- a/aten/src/ATen/native/cuda/UpSampleLinear1d.cu +++ b/aten/src/ATen/native/cuda/UpSampleLinear1d.cu @@ -121,7 +121,7 @@ static void upsample_linear1d_out_cuda_template( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales) { + std::optional scales) { TensorArg input_arg{input, "input", 1}, output_arg{output, "output", 2}; checkAllSameGPU(__func__, {input_arg, output_arg}); @@ -164,7 +164,7 @@ static void upsample_linear1d_backward_out_cuda_template( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales) { + std::optional scales) { TensorArg grad_output_arg{grad_output_, "grad_output_", 1}, grad_input_arg{grad_input, "grad_input", 2}; checkAllSameGPU(__func__, {grad_output_arg, grad_input_arg}); @@ -208,7 +208,7 @@ TORCH_IMPL_FUNC(upsample_linear1d_out_cuda) ( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales, + std::optional scales, const Tensor& output ) { upsample_linear1d_out_cuda_template(output, input, output_size, align_corners, scales); @@ -219,7 +219,7 @@ TORCH_IMPL_FUNC(upsample_linear1d_backward_out_cuda) ( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales, + std::optional scales, const Tensor& grad_input ) { // See Note [Writing Nondeterministic Operations] diff --git a/aten/src/ATen/native/cuda/UpSampleNearest1d.cu b/aten/src/ATen/native/cuda/UpSampleNearest1d.cu index 3085cba0a1d16..1073f8d9dbb51 100644 --- a/aten/src/ATen/native/cuda/UpSampleNearest1d.cu +++ b/aten/src/ATen/native/cuda/UpSampleNearest1d.cu @@ -104,7 +104,7 @@ static void upsample_nearest1d_out_cuda_template( const Tensor& output, const Tensor& input_, IntArrayRef output_size, - c10::optional scales) { + std::optional scales) { TensorArg input_arg{input_, "input_", 1}, output_arg{output, "output", 2}; checkAllSameGPU("upsample_nearest1d_out_cuda", {input_arg, output_arg}); @@ -149,7 +149,7 @@ static void upsample_nearest1d_backward_out_cuda_template( const Tensor& grad_output_, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales) { + std::optional scales) { TensorArg grad_input_arg{grad_input, "grad_input", 1}, grad_output_arg{grad_output_, "grad_output_", 2}; checkAllSameGPU( @@ -198,7 +198,7 @@ static void upsample_nearest1d_backward_out_cuda_template( TORCH_IMPL_FUNC(upsample_nearest1d_out_cuda) ( const Tensor& input, IntArrayRef output_size, - c10::optional scales, + std::optional scales, const Tensor& output ) { upsample_nearest1d_out_cuda_template( @@ -208,7 +208,7 @@ TORCH_IMPL_FUNC(upsample_nearest1d_out_cuda) ( TORCH_IMPL_FUNC(_upsample_nearest_exact1d_out_cuda) ( const Tensor& input, IntArrayRef output_size, - c10::optional scales, + std::optional scales, const Tensor& output ) { upsample_nearest1d_out_cuda_template(output, input, output_size, scales); @@ 
-218,7 +218,7 @@ TORCH_IMPL_FUNC(upsample_nearest1d_backward_out_cuda) ( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales, + std::optional scales, const Tensor& grad_input ) { upsample_nearest1d_backward_out_cuda_template( @@ -229,7 +229,7 @@ TORCH_IMPL_FUNC(_upsample_nearest_exact1d_backward_out_cuda) ( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales, + std::optional scales, const Tensor& grad_input ) { upsample_nearest1d_backward_out_cuda_template( diff --git a/aten/src/ATen/native/cuda/UpSampleNearest2d.cu b/aten/src/ATen/native/cuda/UpSampleNearest2d.cu index 197fc9d60bef7..36db81cd277aa 100644 --- a/aten/src/ATen/native/cuda/UpSampleNearest2d.cu +++ b/aten/src/ATen/native/cuda/UpSampleNearest2d.cu @@ -207,8 +207,8 @@ static void upsample_nearest2d_out_cuda_template( const Tensor& output, const Tensor& input_, IntArrayRef output_size, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { TensorArg input_arg{input_, "input_", 1}, output_arg{output, "output", 2}; checkAllSameGPU(__func__, {input_arg, output_arg}); @@ -337,8 +337,8 @@ static void upsample_nearest2d_backward_out_cuda_template( const Tensor& grad_output_, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { TensorArg grad_input_arg{grad_input, "grad_input", 1}, grad_output_arg{grad_output_, "grad_output_", 2}; checkAllSameGPU(__func__, {grad_output_arg, grad_input_arg}); @@ -446,8 +446,8 @@ static void upsample_nearest2d_backward_out_cuda_template( TORCH_IMPL_FUNC(upsample_nearest2d_out_cuda) ( const Tensor& input, IntArrayRef output_size, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& output) { upsample_nearest2d_out_cuda_template( output, input, output_size, scales_h, scales_w); @@ -456,8 +456,8 @@ TORCH_IMPL_FUNC(upsample_nearest2d_out_cuda) ( TORCH_IMPL_FUNC(_upsample_nearest_exact2d_out_cuda) ( const Tensor& input, IntArrayRef output_size, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& output) { upsample_nearest2d_out_cuda_template( output, input, output_size, scales_h, scales_w); @@ -467,8 +467,8 @@ TORCH_IMPL_FUNC(upsample_nearest2d_backward_out_cuda) ( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input) { upsample_nearest2d_backward_out_cuda_template( grad_input, grad_output, output_size, input_size, scales_h, scales_w); @@ -478,8 +478,8 @@ TORCH_IMPL_FUNC(_upsample_nearest_exact2d_backward_out_cuda) ( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input) { upsample_nearest2d_backward_out_cuda_template( grad_input, grad_output, output_size, input_size, scales_h, scales_w); diff --git a/aten/src/ATen/native/cuda/UpSampleNearest3d.cu b/aten/src/ATen/native/cuda/UpSampleNearest3d.cu index 31a7ee92e7488..53e8d71e79a79 100644 --- a/aten/src/ATen/native/cuda/UpSampleNearest3d.cu +++ b/aten/src/ATen/native/cuda/UpSampleNearest3d.cu @@ -148,9 +148,9 @@ static void upsample_nearest3d_out_cuda_template( const Tensor& output, const 
Tensor& input_, IntArrayRef output_size, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_d, + std::optional scales_h, + std::optional scales_w) { TensorArg input_arg{input_, "input_", 1}, output_arg{output, "output", 2}; checkAllSameGPU(__func__, {input_arg, output_arg}); @@ -223,9 +223,9 @@ static void upsample_nearest3d_backward_out_cuda_template( const Tensor& grad_output_, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_d, + std::optional scales_h, + std::optional scales_w) { TensorArg grad_input_arg{grad_input, "grad_input", 1}, grad_output_arg{grad_output_, "grad_output_", 2}; checkAllSameGPU( @@ -292,9 +292,9 @@ static void upsample_nearest3d_backward_out_cuda_template( TORCH_IMPL_FUNC(upsample_nearest3d_out_cuda) ( const Tensor& input, IntArrayRef output_size, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_d, + std::optional scales_h, + std::optional scales_w, const Tensor& output) { upsample_nearest3d_out_cuda_template( output, input, output_size, scales_d, scales_h, scales_w); @@ -303,9 +303,9 @@ TORCH_IMPL_FUNC(upsample_nearest3d_out_cuda) ( TORCH_IMPL_FUNC(_upsample_nearest_exact3d_out_cuda) ( const Tensor& input, IntArrayRef output_size, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_d, + std::optional scales_h, + std::optional scales_w, const Tensor& output) { upsample_nearest3d_out_cuda_template(output, input, output_size, scales_d, scales_h, scales_w); } @@ -314,9 +314,9 @@ TORCH_IMPL_FUNC(upsample_nearest3d_backward_out_cuda) ( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_d, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input) { upsample_nearest3d_backward_out_cuda_template( grad_input, grad_output, output_size, input_size, scales_d, scales_h, scales_w); @@ -326,9 +326,9 @@ TORCH_IMPL_FUNC(_upsample_nearest_exact3d_backward_out_cuda) ( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_d, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input) { upsample_nearest3d_backward_out_cuda_template( grad_input, grad_output, output_size, input_size, scales_d, scales_h, scales_w); diff --git a/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu b/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu index 43cc09d34b677..0abe0b6bcb4d2 100644 --- a/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu +++ b/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu @@ -245,9 +245,9 @@ static void upsample_trilinear3d_out_cuda_template( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_d, + std::optional scales_h, + std::optional scales_w) { TensorArg input_arg{input, "input", 1}, output_arg{output, "output", 2}; checkAllSameGPU("upsample_trilinear3d_out_cuda", {input_arg, output_arg}); @@ -301,9 +301,9 @@ static void upsample_trilinear3d_backward_out_cuda_template( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w) { + std::optional 
scales_d, + std::optional scales_h, + std::optional scales_w) { TensorArg grad_input_arg{grad_input_, "grad_input_", 1}, grad_output_arg{grad_output_, "grad_output_", 2}; checkAllSameGPU( @@ -377,9 +377,9 @@ TORCH_IMPL_FUNC(upsample_trilinear3d_out_cuda) ( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_d, + std::optional scales_h, + std::optional scales_w, const Tensor& output) { upsample_trilinear3d_out_cuda_template(output, input, output_size, align_corners, scales_d, scales_h, scales_w); } @@ -389,9 +389,9 @@ TORCH_IMPL_FUNC(upsample_trilinear3d_backward_out_cuda) ( IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_d, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input) { // See Note [Writing Nondeterministic Operations] // Nondeterministic because of atomicAdd usage diff --git a/aten/src/ATen/native/cuda/fused_adam_amsgrad_impl.cu b/aten/src/ATen/native/cuda/fused_adam_amsgrad_impl.cu index 9cebb82e512a8..cef07de1b41f9 100644 --- a/aten/src/ATen/native/cuda/fused_adam_amsgrad_impl.cu +++ b/aten/src/ATen/native/cuda/fused_adam_amsgrad_impl.cu @@ -21,8 +21,8 @@ void _fused_adam_amsgrad_cuda_impl_( const double weight_decay, const double eps, const bool maximize, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { std::vector> tensor_lists{ params.vec(), grads.vec(), @@ -72,8 +72,8 @@ void _fused_adam_amsgrad_cuda_impl_( const double weight_decay, const double eps, const bool maximize, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { std::vector> tensor_lists{ params.vec(), grads.vec(), diff --git a/aten/src/ATen/native/cuda/fused_adam_impl.cu b/aten/src/ATen/native/cuda/fused_adam_impl.cu index 7f2843b3b4ee4..2c1f5ce0d6d57 100644 --- a/aten/src/ATen/native/cuda/fused_adam_impl.cu +++ b/aten/src/ATen/native/cuda/fused_adam_impl.cu @@ -20,8 +20,8 @@ void _fused_adam_cuda_impl_( const double weight_decay, const double eps, const bool maximize, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { std::vector> tensor_lists{ params.vec(), grads.vec(), exp_avgs.vec(), exp_avg_sqs.vec()}; @@ -66,8 +66,8 @@ void _fused_adam_cuda_impl_( const double weight_decay, const double eps, const bool maximize, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { std::vector> tensor_lists{ params.vec(), grads.vec(), exp_avgs.vec(), exp_avg_sqs.vec()}; diff --git a/aten/src/ATen/native/cuda/fused_adamw_amsgrad_impl.cu b/aten/src/ATen/native/cuda/fused_adamw_amsgrad_impl.cu index 376711c39db6d..8a22b57a47e8b 100644 --- a/aten/src/ATen/native/cuda/fused_adamw_amsgrad_impl.cu +++ b/aten/src/ATen/native/cuda/fused_adamw_amsgrad_impl.cu @@ -22,8 +22,8 @@ void _fused_adamw_amsgrad_cuda_impl_( const double weight_decay, const double eps, const bool maximize, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { std::vector> tensor_lists{ params.vec(), grads.vec(), @@ -73,8 +73,8 @@ void 
_fused_adamw_amsgrad_cuda_impl_( const double weight_decay, const double eps, const bool maximize, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { std::vector> tensor_lists{ params.vec(), grads.vec(), diff --git a/aten/src/ATen/native/cuda/fused_adamw_impl.cu b/aten/src/ATen/native/cuda/fused_adamw_impl.cu index cc4feaa145122..b0f9dc6db6aff 100644 --- a/aten/src/ATen/native/cuda/fused_adamw_impl.cu +++ b/aten/src/ATen/native/cuda/fused_adamw_impl.cu @@ -21,8 +21,8 @@ void _fused_adamw_cuda_impl_( const double weight_decay, const double eps, const bool maximize, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { std::vector> tensor_lists{ params.vec(), grads.vec(), exp_avgs.vec(), exp_avg_sqs.vec()}; @@ -67,8 +67,8 @@ void _fused_adamw_cuda_impl_( const double weight_decay, const double eps, const bool maximize, - const c10::optional& grad_scale, - const c10::optional& found_inf) { + const std::optional& grad_scale, + const std::optional& found_inf) { std::vector> tensor_lists{ params.vec(), grads.vec(), exp_avgs.vec(), exp_avg_sqs.vec()}; diff --git a/aten/src/ATen/native/cuda/jit_utils.cpp b/aten/src/ATen/native/cuda/jit_utils.cpp index 6e804efe5f847..0d870cef58708 100644 --- a/aten/src/ATen/native/cuda/jit_utils.cpp +++ b/aten/src/ATen/native/cuda/jit_utils.cpp @@ -1393,7 +1393,7 @@ std::string generate_reduction_code( } // Acquires (possibly creating) the kernel cache directory -c10::optional get_cache_dir() { +std::optional get_cache_dir() { // If the environment variable USE_TORCH_KERNEL_CACHE is set to "0" then no persistent cache is used const char* uptkc = std::getenv("USE_PYTORCH_KERNEL_CACHE"); const bool use_kernel_cache = (uptkc == nullptr) ? 
true : std::strcmp(uptkc, "0"); @@ -1483,7 +1483,7 @@ NvrtcFunction jit_pwise_function( NvrtcFunction compiled_kernel_; std::string name = kernel_name + "_kernel"; - static const c10::optional cache_dir = get_cache_dir(); + static const std::optional cache_dir = get_cache_dir(); std::string file_path; if (cache_dir.has_value()) { diff --git a/aten/src/ATen/native/cuda/layer_norm_kernel.cu b/aten/src/ATen/native/cuda/layer_norm_kernel.cu index 6423dddbb2995..f06b247ef32be 100644 --- a/aten/src/ATen/native/cuda/layer_norm_kernel.cu +++ b/aten/src/ATen/native/cuda/layer_norm_kernel.cu @@ -1334,8 +1334,8 @@ void LayerNormBackwardKernelImpl( std::tuple layer_norm_cuda( const Tensor& input, IntArrayRef normalized_shape, - const c10::optional& weight_opt /* optional */, - const c10::optional& bias_opt /* optional */, + const std::optional& weight_opt /* optional */, + const std::optional& bias_opt /* optional */, double eps) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = @@ -1390,8 +1390,8 @@ std::tuple layer_norm_backward_cuda( IntArrayRef normalized_shape, const Tensor& mean, const Tensor& rstd, - const c10::optional& weight_opt /* optional */, - const c10::optional& bias_opt /* optional */, + const std::optional& weight_opt /* optional */, + const std::optional& bias_opt /* optional */, std::array grad_input_mask) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = diff --git a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp index 5471c57ec30ed..04b12695dd0a7 100644 --- a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp @@ -2210,7 +2210,7 @@ void svd_magma(const Tensor& A, void svd_kernel(const Tensor& A, const bool full_matrices, const bool compute_uv, - const c10::optional& driver, + const std::optional& driver, const Tensor& U, const Tensor& S, const Tensor& Vh, diff --git a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp index 643501f0cbccd..bc06f118ae9a0 100644 --- a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp +++ b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp @@ -648,7 +648,7 @@ std::string _format_non_converging_batches(const std::vector& batches) void svd_cusolver(const Tensor& A, const bool full_matrices, const bool compute_uv, - const c10::optional& driver, + const std::optional& driver, const Tensor& U, const Tensor& S, const Tensor& V, diff --git a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.h b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.h index cca2e04941a54..75732ec315a45 100644 --- a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.h +++ b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.h @@ -61,7 +61,7 @@ void lu_solve_batched_cublas(const Tensor& LU, const Tensor& pivots, const Tenso // entrance of calculations of `svd` using cusolver gesvdj and gesvdjBatched void svd_cusolver(const Tensor& A, const bool full_matrices, const bool compute_uv, - const c10::optional& driver, const Tensor& U, const Tensor& S, const Tensor& V, const Tensor& info); + const std::optional& driver, const Tensor& U, const Tensor& S, const Tensor& V, const Tensor& info); // entrance of calculations of `cholesky` using cusolver potrf and potrfBatched void cholesky_helper_cusolver(const Tensor& input, bool upper, const Tensor& info); diff --git 
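The `get_cache_dir()` change above only touches the return type, but the surrounding pattern is the interesting part: the function returns an empty optional when the `USE_PYTORCH_KERNEL_CACHE` environment variable disables caching or no usable directory exists, and `jit_pwise_function` stores the result in a `static const std::optional` and branches on `has_value()`. Below is a minimal standalone sketch of that shape; the fallback path and file layout are invented for illustration, not PyTorch's.

```cpp
// Minimal sketch of an env-gated optional cache directory. The gate variable
// name mirrors the hunk above; the cache path is made up.
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <optional>
#include <string>

std::optional<std::string> get_cache_dir() {
  const char* gate = std::getenv("USE_PYTORCH_KERNEL_CACHE");
  if (gate != nullptr && std::strcmp(gate, "0") == 0) {
    return std::nullopt;  // caching explicitly disabled
  }
  const char* home = std::getenv("HOME");
  if (home == nullptr) {
    return std::nullopt;  // nowhere sensible to put the cache
  }
  return std::string(home) + "/.cache/fake_kernel_cache";
}

int main() {
  static const std::optional<std::string> cache_dir = get_cache_dir();
  if (cache_dir.has_value()) {
    std::cout << "caching kernels under " << *cache_dir << "\n";
  } else {
    std::cout << "persistent kernel cache disabled\n";
  }
}
```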
a/aten/src/ATen/native/cudnn/BatchNorm.cpp b/aten/src/ATen/native/cudnn/BatchNorm.cpp index 44b004dff0007..460a9b73dd2c5 100644 --- a/aten/src/ATen/native/cudnn/BatchNorm.cpp +++ b/aten/src/ATen/native/cudnn/BatchNorm.cpp @@ -19,9 +19,9 @@ namespace native { std::tuple cudnn_batch_norm( const Tensor& input, const Tensor& weight, - const c10::optional& bias_opt, - const c10::optional& running_mean_opt, - const c10::optional& running_var_opt, + const std::optional& bias_opt, + const std::optional& running_mean_opt, + const std::optional& running_var_opt, bool training, double exponential_average_factor, double epsilon) { @@ -32,10 +32,10 @@ std::tuple cudnn_batch_norm_backward( const Tensor& input, const Tensor& grad_output, const Tensor& weight, - const c10::optional& running_mean_opt, - const c10::optional& running_var_opt, - const c10::optional& save_mean_opt, - const c10::optional& save_var_opt, + const std::optional& running_mean_opt, + const std::optional& running_var_opt, + const std::optional& save_mean_opt, + const std::optional& save_var_opt, double epsilon, const Tensor& reservedSpace) { AT_ERROR("cudnn_batch_norm_backward: ATen not compiled with cuDNN support"); @@ -121,9 +121,9 @@ size_t _get_cudnn_batch_norm_reserve_space_size( std::tuple cudnn_batch_norm( const Tensor& input_t, const Tensor& weight_t, - const c10::optional& bias_t_opt, - const c10::optional& running_mean_t_opt, - const c10::optional& running_var_t_opt, + const std::optional& bias_t_opt, + const std::optional& running_mean_t_opt, + const std::optional& running_var_t_opt, bool training, double exponential_average_factor, double epsilon) { @@ -274,10 +274,10 @@ std::tuple cudnn_batch_norm_backward( const Tensor& weight_t, // Unused: but we require them to be passed so that double backwards // has access - const c10::optional& running_mean_opt, - const c10::optional& running_var_opt, - const c10::optional& save_mean_t_opt, - const c10::optional& save_var_t_opt, + const std::optional& running_mean_opt, + const std::optional& running_var_opt, + const std::optional& save_mean_t_opt, + const std::optional& save_var_t_opt, double epsilon, const Tensor& reserveSpace) { // See [Note: hacky wrapper removal for optional tensor] diff --git a/aten/src/ATen/native/cudnn/ConvPlaceholders.cpp b/aten/src/ATen/native/cudnn/ConvPlaceholders.cpp index 8475a143f466c..349999e4544f9 100644 --- a/aten/src/ATen/native/cudnn/ConvPlaceholders.cpp +++ b/aten/src/ATen/native/cudnn/ConvPlaceholders.cpp @@ -205,7 +205,7 @@ void raw_cudnn_convolution_backward_weight_out( Tensor cudnn_convolution_relu( const Tensor& input_t, const Tensor& weight_t, - const c10::optional& bias_t, + const std::optional& bias_t, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, @@ -217,8 +217,8 @@ Tensor cudnn_convolution_add_relu( const Tensor& input_t, const Tensor& weight_t, const Tensor& z_t, - const c10::optional& alpha, - const c10::optional& bias_t, + const std::optional& alpha, + const std::optional& bias_t, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, diff --git a/aten/src/ATen/native/cudnn/ConvShared.cpp b/aten/src/ATen/native/cudnn/ConvShared.cpp index 104ae8c70803d..09a10581ab142 100644 --- a/aten/src/ATen/native/cudnn/ConvShared.cpp +++ b/aten/src/ATen/native/cudnn/ConvShared.cpp @@ -705,7 +705,7 @@ std::tuple cudnn_convolution_transpose_backward( Tensor cudnn_convolution_relu( const Tensor& input_t, const Tensor& weight_t, - const c10::optional& bias_t, + const std::optional& bias_t, IntArrayRef stride, IntArrayRef 
padding, IntArrayRef dilation, @@ -758,8 +758,8 @@ Tensor cudnn_convolution_add_relu( const Tensor& input_t, const Tensor& weight_t, const Tensor& z_t, - const c10::optional& alpha, - const c10::optional& bias_t, + const std::optional& alpha, + const std::optional& bias_t, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp index 05b1df3114f85..55c666eeca83c 100644 --- a/aten/src/ATen/native/cudnn/RNN.cpp +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -51,9 +51,9 @@ std::tuple _cudnn_rnn( const Tensor& input_r, TensorList weight, int64_t weight_stride0, - const c10::optional& weight_buf_r_opt, + const std::optional& weight_buf_r_opt, const Tensor& hx, - const c10::optional& cx_opt, + const std::optional& cx_opt, int64_t fn_mode, int64_t fn_hidden_size, int64_t fn_proj_size, @@ -63,7 +63,7 @@ std::tuple _cudnn_rnn( bool fn_train, bool fn_bidirectional, IntArrayRef fn_batch_sizes, - const c10::optional& fn_dropout_state_opt) { + const std::optional& fn_dropout_state_opt) { AT_ERROR("_cudnn_rnn: ATen not compiled with cuDNN support"); } @@ -73,11 +73,11 @@ std::tuple> _cudnn_rnn_backward( int64_t weight_stride0, const Tensor& weight_buf, const Tensor& hx, - const c10::optional& cx_opt, + const std::optional& cx_opt, const Tensor& output, - const c10::optional& grad_output_r_opt, - const c10::optional& grad_hy_r_opt, - const c10::optional& grad_cy_r_opt, + const std::optional& grad_output_r_opt, + const std::optional& grad_hy_r_opt, + const std::optional& grad_cy_r_opt, int64_t mode, int64_t hidden_size, int64_t proj_size, @@ -87,7 +87,7 @@ std::tuple> _cudnn_rnn_backward( bool train, bool bidirectional, IntArrayRef batch_sizes, - const c10::optional& dropout_state_opt, + const std::optional& dropout_state_opt, const Tensor& reserve, std::array output_mask) { AT_ERROR("_cudnn_rnn_backward: ATen not compiled with cuDNN support"); @@ -97,10 +97,10 @@ Tensor _cudnn_init_dropout_state( double dropout, bool train, int64_t dropout_seed, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory( pin_memory); @@ -1396,9 +1396,9 @@ std::tuple _cudnn_rnn( const Tensor& input_r, TensorList weight, int64_t weight_stride0, - const c10::optional& weight_buf_r_opt, + const std::optional& weight_buf_r_opt, const Tensor& hx, - const c10::optional& cx_opt, + const std::optional& cx_opt, int64_t fn_mode, int64_t fn_hidden_size, int64_t fn_proj_size, @@ -1408,7 +1408,7 @@ std::tuple _cudnn_rnn( bool fn_train, bool fn_bidirectional, IntArrayRef fn_batch_sizes, - const c10::optional& fn_dropout_state_opt) { + const std::optional& fn_dropout_state_opt) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_buf_r_maybe_owned = at::borrow_from_optional_tensor(weight_buf_r_opt); @@ -2105,11 +2105,11 @@ std::tuple> _cudnn_rnn_backward( int64_t weight_stride0, const Tensor& weight_buf, const Tensor& hx, - const c10::optional& cx_opt, + const std::optional& cx_opt, const Tensor& output, - const c10::optional& grad_output_r_opt, - const c10::optional& grad_hy_r_opt, - const c10::optional& grad_cy_r_opt, + const std::optional& grad_output_r_opt, + const std::optional& grad_hy_r_opt, + const std::optional& grad_cy_r_opt, int64_t mode, 
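Several of the signatures above sit behind the `[Note: hacky wrapper removal for optional tensor]` idiom: the optional argument is immediately turned into a borrowed reference via `at::borrow_from_optional_tensor`, so the body works with a plain tensor reference whether or not a value was passed. The following is a simplified standalone analog of that idiom; it returns a reference to a static default instead of a `c10::MaybeOwned`, and the types are placeholders rather than ATen's.

```cpp
// Simplified analog of borrow_from_optional_tensor: hand back a reference to
// the provided value when present, otherwise to a shared default object.
// std::string stands in for at::Tensor purely for illustration.
#include <iostream>
#include <optional>
#include <string>

const std::string& borrow_or_default(const std::optional<std::string>& opt) {
  static const std::string kUndefined;  // plays the role of an undefined Tensor
  return opt.has_value() ? *opt : kUndefined;
}

int main() {
  std::optional<std::string> cx;  // e.g. an absent cell state for a plain RNN/GRU
  std::cout << "'" << borrow_or_default(cx) << "'\n";  // ''
  cx = "cell-state";
  std::cout << "'" << borrow_or_default(cx) << "'\n";  // 'cell-state'
}
```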
int64_t hidden_size, int64_t proj_size, @@ -2119,7 +2119,7 @@ std::tuple> _cudnn_rnn_backward( bool train, bool bidirectional, IntArrayRef batch_sizes, - const c10::optional& dropout_state_opt, + const std::optional& dropout_state_opt, const Tensor& reserve, std::array output_mask) { // See [Note: hacky wrapper removal for optional tensor] @@ -2214,10 +2214,10 @@ Tensor _cudnn_init_dropout_state( double dropout, bool train, int64_t dropout_seed, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory( @@ -2304,7 +2304,7 @@ struct DropoutState { // needed for the first time. Note that in this case needed != used, as we // don't need a buffer to e.g. run RNNs in test mode. at::Tensor buffer; - c10::optional event; + std::optional event; std::mutex mutex; #if !defined(USE_ROCM) // cudaStreamGetCaptureInfo will never give back a capture id of 0, so 0 can @@ -2531,8 +2531,8 @@ std::pair _cudnn_impl( } // TODO: try_get_weight_buf returns a Tensor, but _cudnn_rnn below takes a - // c10::optional in weight_buf's slot. Do we want try_get_weight_buf - // to return a c10::optional instead of a defined or undefined Tensor? + // std::optional in weight_buf's slot. Do we want try_get_weight_buf + // to return a std::optional instead of a defined or undefined Tensor? at::cuda::OptionalCUDAGuard guard(input.get_device()); auto weight_buf = try_get_weight_buf( input, diff --git a/aten/src/ATen/native/group_norm.cpp b/aten/src/ATen/native/group_norm.cpp index 1babf82b90e05..85767b7502dc3 100644 --- a/aten/src/ATen/native/group_norm.cpp +++ b/aten/src/ATen/native/group_norm.cpp @@ -61,8 +61,8 @@ void check_group_norm_inputs( std::tuple native_group_norm( const Tensor& X, - const c10::optional& gamma_opt /* optional */, - const c10::optional& beta_opt /* optional */, + const std::optional& gamma_opt /* optional */, + const std::optional& beta_opt /* optional */, int64_t N, int64_t C, int64_t HxW, @@ -107,7 +107,7 @@ std::tuple native_group_norm_backward( const Tensor& X, const Tensor& mean, const Tensor& rstd, - const c10::optional& gamma_opt, + const std::optional& gamma_opt, int64_t N, int64_t C, int64_t HxW, @@ -177,8 +177,8 @@ std::tuple native_group_norm_backward( Tensor group_norm( const Tensor& input, int64_t num_groups, - const c10::optional& weight_opt /* optional */, - const c10::optional& bias_opt /* optional */, + const std::optional& weight_opt /* optional */, + const std::optional& bias_opt /* optional */, double eps, bool /* cudnn_enabled, deprecated */) { // See [Note: hacky wrapper removal for optional tensor] @@ -213,8 +213,8 @@ DEFINE_DISPATCH(GroupNormBackwardKernel); // Ported from pytorch/xla repo std::tuple math_group_norm( const Tensor& input, - const c10::optional& weight_opt, - const c10::optional& bias_opt, + const std::optional& weight_opt, + const std::optional& bias_opt, int64_t N, int64_t C, int64_t HxW, diff --git a/aten/src/ATen/native/layer_norm.cpp b/aten/src/ATen/native/layer_norm.cpp index 27a701dd2eb49..9858840f95223 100644 --- a/aten/src/ATen/native/layer_norm.cpp +++ b/aten/src/ATen/native/layer_norm.cpp @@ -74,7 +74,7 @@ void layer_norm_cpu_out( std::tuple layer_norm_cpu( const Tensor& input, - IntArrayRef normalized_shape, const c10::optional& weight_opt /* optional */, const 
c10::optional& bias_opt /* optional */, + IntArrayRef normalized_shape, const std::optional& weight_opt /* optional */, const c10::optional& bias_opt /* optional */, double eps) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); @@ -115,8 +115,8 @@ std::tuple layer_norm_backward_cpu( IntArrayRef normalized_shape, const Tensor& mean, const Tensor& rstd, - const c10::optional& weight_opt /* optional */, - const c10::optional& bias_opt /* optional */, + const std::optional& weight_opt /* optional */, + const std::optional& bias_opt /* optional */, std::array grad_input_mask) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = @@ -186,7 +186,7 @@ std::tuple layer_norm_backward_cpu( Tensor layer_norm_symint( const Tensor& input, - c10::SymIntArrayRef normalized_shape, const c10::optional& weight_opt /* optional */, const c10::optional& bias_opt /* optional */, + c10::SymIntArrayRef normalized_shape, const std::optional& weight_opt /* optional */, const c10::optional& bias_opt /* optional */, double eps, bool /* cudnn_enable, deprecated */) { // See [Note: hacky wrapper removal for optional tensor] @@ -204,7 +204,7 @@ DEFINE_DISPATCH(LayerNormBackwardKernel); // Ported from pytorch/xla repo std::tuple math_native_layer_norm( const Tensor& input, - IntArrayRef normalized_shape, const c10::optional& weight_opt, const c10::optional& bias_opt, + IntArrayRef normalized_shape, const std::optional& weight_opt, const c10::optional& bias_opt, double eps) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); @@ -266,8 +266,8 @@ std::tuple math_native_layer_norm( Tensor rms_norm( const Tensor& input, IntArrayRef normalized_shape, - const c10::optional& weight_opt /* optional */, - c10::optional eps) { + const std::optional& weight_opt /* optional */, + std::optional eps) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); diff --git a/aten/src/ATen/native/layer_norm.h b/aten/src/ATen/native/layer_norm.h index 38e63569586e3..e35ccf8634bcc 100644 --- a/aten/src/ATen/native/layer_norm.h +++ b/aten/src/ATen/native/layer_norm.h @@ -74,8 +74,8 @@ void layer_norm_cpu_out( Tensor rms_norm( const Tensor& input, IntArrayRef normalized_shape, - const c10::optional& weight_opt /* optional */, - c10::optional eps); + const std::optional& weight_opt /* optional */, + std::optional eps); using forward_fn = void (*)( const Tensor& /* X */, diff --git a/aten/src/ATen/native/metal/MetalNeuronType.h b/aten/src/ATen/native/metal/MetalNeuronType.h index b59d163c4ae88..c5cb0b99502c6 100644 --- a/aten/src/ATen/native/metal/MetalNeuronType.h +++ b/aten/src/ATen/native/metal/MetalNeuronType.h @@ -20,8 +20,8 @@ enum class NeuronType { }; static inline NeuronType neuronType( - c10::optional output_min, - c10::optional output_max) { + std::optional output_min, + std::optional output_max) { float inf_max = std::numeric_limits::infinity(); float inf_min = -std::numeric_limits::infinity(); float output_max_ = diff --git a/aten/src/ATen/native/metal/MetalPrepackOpContext.h b/aten/src/ATen/native/metal/MetalPrepackOpContext.h index 02f474ece8da2..4481c879eec29 100644 --- a/aten/src/ATen/native/metal/MetalPrepackOpContext.h +++ b/aten/src/ATen/native/metal/MetalPrepackOpContext.h @@ -9,13 +9,13 @@ namespace metal { using 
SerializationTypeConv2dPrePack = std::tuple< Tensor, - c10::optional, + std::optional, std::vector, std::vector, std::vector, int64_t, - c10::optional, - c10::optional>; + std::optional, + std::optional>; class Conv2dOpContext : public torch::jit::CustomClassHolder { public: @@ -33,13 +33,13 @@ class Conv2dOpContext : public torch::jit::CustomClassHolder { Conv2dOpContext() = delete; Conv2dOpContext( at::Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector stride, std::vector padding, std::vector dilation, int64_t groups, - c10::optional output_min, - c10::optional output_max) + std::optional output_min, + std::optional output_max) : weight_(std::move(weight)), bias_(std::move(bias)), stride_(std::move(stride)), @@ -65,7 +65,7 @@ class Conv2dOpContext : public torch::jit::CustomClassHolder { return weight_; } - const c10::optional& get_bias() const { + const std::optional& get_bias() const { return bias_; } @@ -85,11 +85,11 @@ class Conv2dOpContext : public torch::jit::CustomClassHolder { return groups_; } - const c10::optional& get_output_min() const { + const std::optional& get_output_min() const { return output_min_; } - const c10::optional& get_output_max() const { + const std::optional& get_output_max() const { return output_max_; } @@ -111,22 +111,22 @@ class Conv2dOpContext : public torch::jit::CustomClassHolder { private: Tensor weight_; - c10::optional bias_; + std::optional bias_; std::vector stride_; std::vector padding_; std::vector dilation_; int64_t groups_; - c10::optional output_min_; - c10::optional output_max_; + std::optional output_min_; + std::optional output_max_; std::function releaseCallback_ = nullptr; void* conv2dOp_ = nullptr; // reserved to hold MPSCNNConv2dOp objects }; using SerializationTypeLinearPrePack = std::tuple< Tensor, - c10::optional, - c10::optional, - c10::optional>; + std::optional, + std::optional, + std::optional>; class LinearOpContext : public torch::jit::CustomClassHolder { public: @@ -136,9 +136,9 @@ class LinearOpContext : public torch::jit::CustomClassHolder { LinearOpContext() = delete; LinearOpContext( at::Tensor&& weight, - c10::optional&& bias, - c10::optional output_min, - c10::optional output_max) + std::optional&& bias, + std::optional output_min, + std::optional output_max) : weight_(std::move(weight)), bias_(std::move(bias)), output_min_(std::move(output_min)), @@ -160,15 +160,15 @@ class LinearOpContext : public torch::jit::CustomClassHolder { return weight_; } - const c10::optional& get_bias() const { + const std::optional& get_bias() const { return bias_; } - const c10::optional& get_output_min() const { + const std::optional& get_output_min() const { return output_min_; } - const c10::optional& get_output_max() const { + const std::optional& get_output_max() const { return output_max_; } @@ -190,9 +190,9 @@ class LinearOpContext : public torch::jit::CustomClassHolder { private: Tensor weight_; - c10::optional bias_; - c10::optional output_min_; - c10::optional output_max_; + std::optional bias_; + std::optional output_min_; + std::optional output_max_; void* opaqueOpPtr_ = nullptr; // reserved to hold MPSCNNFullyConnected objects std::function releaseCallback_ = nullptr; }; diff --git a/aten/src/ATen/native/metal/MetalPrepackOpRegister.cpp b/aten/src/ATen/native/metal/MetalPrepackOpRegister.cpp index bbdf713801860..ebf9b9daf6263 100644 --- a/aten/src/ATen/native/metal/MetalPrepackOpRegister.cpp +++ b/aten/src/ATen/native/metal/MetalPrepackOpRegister.cpp @@ -9,13 +9,13 @@ namespace metal { 
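The Metal prepack contexts above (`Conv2dOpContext`, `LinearOpContext`) keep their clamp bounds and bias as optional members that are moved in once and exposed through `const` reference getters; the rename leaves that layout untouched. Here is a standalone sketch of the same shape, with invented names and `float` standing in for the real payload types.

```cpp
// Standalone sketch of an op-context holding optional members, mirroring the
// Conv2dOpContext/LinearOpContext shape above. ClampContext is an invented name.
#include <iostream>
#include <optional>
#include <utility>

class ClampContext {
 public:
  ClampContext(std::optional<float>&& output_min, std::optional<float>&& output_max)
      : output_min_(std::move(output_min)), output_max_(std::move(output_max)) {}

  const std::optional<float>& get_output_min() const { return output_min_; }
  const std::optional<float>& get_output_max() const { return output_max_; }

 private:
  std::optional<float> output_min_;
  std::optional<float> output_max_;
};

int main() {
  ClampContext relu_like{std::optional<float>(0.0f), std::nullopt};  // min only
  std::cout << relu_like.get_output_min().value_or(-1.0f) << " "
            << relu_like.get_output_max().value_or(-1.0f) << "\n";   // 0 -1
}
```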
c10::intrusive_ptr unpack( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& stride, std::vector&& padding, std::vector&& dilation, int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max) { + const std::optional& output_min, + const std::optional& output_max) { auto packedWeight = weight.contiguous(MemoryFormat::ChannelsLast); return c10::make_intrusive( std::move(packedWeight), @@ -30,9 +30,9 @@ c10::intrusive_ptr unpack( c10::intrusive_ptr unpack( Tensor&& weight, - c10::optional&& bias, - const c10::optional& output_min, - const c10::optional& output_max) { + std::optional&& bias, + const std::optional& output_min, + const std::optional& output_max) { TORCH_CHECK(weight.dim() == 2); // Don't need to do `weight.t()` auto packedWeight = weight.view({weight.size(0), weight.size(1), 1, 1}) @@ -96,13 +96,13 @@ TORCH_LIBRARY(metal_prepack, m) { c10::intrusive_ptr conv2d_prepack( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& stride, std::vector&& padding, std::vector&& dilation, int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max) { + const std::optional& output_min, + const std::optional& output_max) { TORCH_CHECK(weight.dim() == 4); return c10::make_intrusive( std::move(weight), @@ -117,9 +117,9 @@ c10::intrusive_ptr conv2d_prepack( c10::intrusive_ptr linear_prepack( Tensor&& weight, - c10::optional&& bias, - const c10::optional& output_min, - const c10::optional& output_max) { + std::optional&& bias, + const std::optional& output_min, + const std::optional& output_max) { return c10::make_intrusive( std::move(weight), std::move(bias), output_min, output_max); } diff --git a/aten/src/ATen/native/metal/ops/MetalConvolution.h b/aten/src/ATen/native/metal/ops/MetalConvolution.h index e5a68e45cd929..77053448cbcb4 100644 --- a/aten/src/ATen/native/metal/ops/MetalConvolution.h +++ b/aten/src/ATen/native/metal/ops/MetalConvolution.h @@ -9,7 +9,7 @@ namespace metal { Tensor conv2d( const Tensor& input, const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, diff --git a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp index 7c641b3fadd89..5a89c01bc0394 100644 --- a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp +++ b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp @@ -22,13 +22,13 @@ namespace at { namespace native { // See Note [ATen preprocessor philosophy] std::tuple miopen_batch_norm( - const Tensor& input, const Tensor& weight, const c10::optional& bias_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, + const Tensor& input, const Tensor& weight, const std::optional& bias_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, bool training, double exponential_average_factor, double epsilon) { AT_ERROR("miopen_batch_norm: ATen not compiled with MIOpen support"); } std::tuple miopen_batch_norm_backward( - const Tensor& input, const Tensor& grad_output, const Tensor& weight, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, const c10::optional& save_mean_opt, const c10::optional& save_var_opt, + const Tensor& input, const Tensor& grad_output, const Tensor& weight, const std::optional& running_mean_opt, const c10::optional& running_var_opt, const c10::optional& save_mean_opt, const c10::optional& save_var_opt, double epsilon) { 
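Beyond the type rename, the MIOpen batch-norm hunk below makes a small behavioural change: `input` is dropped from the blanket `checkAllContiguous` call and is instead checked with `input->is_contiguous(input->suggest_memory_format())`, which accepts a tensor that is dense in its own (for example channels-last) layout rather than requiring default NCHW contiguity. The sketch below, which assumes a libtorch build, shows the difference between the two predicates.

```cpp
// Sketch (requires libtorch to build): a channels-last tensor fails the plain
// is_contiguous() check but passes when queried with its suggested memory
// format, which is what the relaxed TORCH_CHECK in the hunk below accepts.
#include <ATen/ATen.h>
#include <iostream>

int main() {
  at::Tensor nhwc = at::rand({2, 3, 4, 5}).contiguous(at::MemoryFormat::ChannelsLast);
  std::cout << nhwc.is_contiguous() << "\n";                              // 0
  std::cout << nhwc.is_contiguous(nhwc.suggest_memory_format()) << "\n";  // 1
}
```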
AT_ERROR("miopen_batch_norm_backward: ATen not compiled with MIOpen support"); } @@ -58,7 +58,7 @@ Tensor expandScale(const Tensor& t, int64_t dim) { } // namespace std::tuple miopen_batch_norm( - const Tensor& input_t, const Tensor& weight_t, const c10::optional& bias_t_opt, const c10::optional& running_mean_t_opt, const c10::optional& running_var_t_opt, + const Tensor& input_t, const Tensor& weight_t, const std::optional& bias_t_opt, const c10::optional& running_mean_t_opt, const c10::optional& running_var_t_opt, bool training, double exponential_average_factor, double epsilon) { // See [Note: hacky wrapper removal for optional tensor] @@ -83,7 +83,8 @@ std::tuple miopen_batch_norm( checkAllSameType(c, {input, weight}); } checkAllSameType(c, {weight, bias, running_mean, running_var}); - checkAllContiguous(c, {input, weight, bias, running_mean, running_var}); + checkAllContiguous(c, {weight, bias, running_mean, running_var}); + TORCH_CHECK(input->is_contiguous(input->suggest_memory_format())); checkDimRange(c, input, 2, 6 /* exclusive */); auto num_features = input->size(1); for (auto t : {weight, bias, running_mean, running_var}) { diff --git a/aten/src/ATen/native/miopen/Conv_miopen.cpp b/aten/src/ATen/native/miopen/Conv_miopen.cpp index 88f889c2cc1fa..71b4620ecfdf0 100644 --- a/aten/src/ATen/native/miopen/Conv_miopen.cpp +++ b/aten/src/ATen/native/miopen/Conv_miopen.cpp @@ -31,7 +31,7 @@ namespace at { namespace native { // See Note [ATen preprocessor philosophy] at::Tensor miopen_convolution( - const Tensor& input, const Tensor& weight, const c10::optional& bias_opt /* optional */, + const Tensor& input, const Tensor& weight, const std::optional& bias_opt /* optional */, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic) { AT_ERROR("miopen_convolution: ATen not compiled with MIOpen support"); @@ -64,7 +64,7 @@ std::tuple miopen_convolution_backward( } at::Tensor miopen_convolution_transpose( - const Tensor& input, const Tensor& weight, const c10::optional& bias_opt /* optional */, + const Tensor& input, const Tensor& weight, const std::optional& bias_opt /* optional */, IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic) { AT_ERROR("miopen_convolution_transpose: ATen not compiled with MIOpen support"); @@ -92,7 +92,7 @@ std::tuple miopen_convolution_transpose_backwa } at::Tensor miopen_depthwise_convolution( - const Tensor& input, const Tensor& weight, const c10::optional& bias_opt /* optional */, + const Tensor& input, const Tensor& weight, const std::optional& bias_opt /* optional */, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic) { AT_ERROR("miopen_depthwise_convolution: ATen not compiled with MIOpen support"); @@ -122,13 +122,13 @@ std::tuple miopen_depthwise_convolution_backwa at::Tensor miopen_convolution_add_relu( const at::Tensor& input, const at::Tensor& weight, const at::Tensor& z, - const c10::optional& alpha, const c10::optional& bias, IntArrayRef stride, + const std::optional& alpha, const c10::optional& bias, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, int64_t groups) { AT_ERROR("miopen_convolution_add_relu: ATen not compiled with MIOpen support"); } at::Tensor miopen_convolution_relu( - const at::Tensor& input, const at::Tensor& weight, const c10::optional& bias, + const at::Tensor& input, const at::Tensor& weight, const 
std::optional& bias, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, int64_t groups) { AT_ERROR("miopen_convolution_relu: ATen not compiled with MIOpen support"); } @@ -795,7 +795,7 @@ Tensor miopen_convolution_forward( } Tensor miopen_convolution( - const Tensor& input_t, const Tensor& weight_t, const c10::optional& bias_t_opt, + const Tensor& input_t, const Tensor& weight_t, const std::optional& bias_t_opt, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic) { @@ -896,7 +896,7 @@ Tensor miopen_depthwise_convolution_forward( } Tensor miopen_depthwise_convolution( - const Tensor& input_t, const Tensor& weight_t, const c10::optional& bias_t_opt, + const Tensor& input_t, const Tensor& weight_t, const std::optional& bias_t_opt, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic) { @@ -1463,7 +1463,7 @@ std::tuple miopen_depthwise_convolution_backwa } Tensor miopen_convolution_transpose( - const Tensor& input_t, const Tensor& weight_t, const c10::optional& bias_t_opt, + const Tensor& input_t, const Tensor& weight_t, const std::optional& bias_t_opt, IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic) { @@ -1552,8 +1552,8 @@ Tensor miopen_convolution_add_relu( const Tensor& input, const Tensor& weight, const Tensor& z, - const c10::optional& alpha, - const c10::optional& bias, + const std::optional& alpha, + const std::optional& bias, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, @@ -1607,7 +1607,7 @@ Tensor miopen_convolution_add_relu( Tensor miopen_convolution_relu( const Tensor& input, const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, diff --git a/aten/src/ATen/native/miopen/RNN_miopen.cpp b/aten/src/ATen/native/miopen/RNN_miopen.cpp index 7b2b2ab80e553..2cba1aa3aef14 100644 --- a/aten/src/ATen/native/miopen/RNN_miopen.cpp +++ b/aten/src/ATen/native/miopen/RNN_miopen.cpp @@ -29,18 +29,18 @@ namespace at { namespace native { std::tuple miopen_rnn( const Tensor& input_r, TensorList weight, int64_t weight_stride0, - const Tensor& hx, const c10::optional& cx_opt, + const Tensor& hx, const std::optional& cx_opt, int64_t fn_mode, int64_t fn_hidden_size, int64_t fn_num_layers, bool batch_first, double fn_dropout, bool fn_train, bool fn_bidirectional, - IntArrayRef fn_batch_sizes, const c10::optional& fn_dropout_state_opt + IntArrayRef fn_batch_sizes, const std::optional& fn_dropout_state_opt ) { AT_ERROR("miopen_rnn : ATen not compiled with MIOpen support."); } std::tuple> miopen_rnn_backward( - const Tensor& input, TensorList weight, int64_t weight_stride0, const Tensor& weight_buf, const Tensor& hx, const c10::optional& cx_opt, - const Tensor& output, const c10::optional& grad_output_r_opt, const c10::optional& grad_hy_r_opt, const c10::optional& grad_cy_r_opt, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, - double dropout, bool train, bool bidirectional, IntArrayRef batch_sizes, const c10::optional& dropout_state_opt, + const Tensor& input, TensorList weight, int64_t weight_stride0, const Tensor& weight_buf, const Tensor& hx, const std::optional& cx_opt, + const Tensor& output, const std::optional& grad_output_r_opt, const c10::optional& grad_hy_r_opt, const c10::optional& grad_cy_r_opt, int64_t mode, int64_t hidden_size, int64_t 
num_layers, bool batch_first, + double dropout, bool train, bool bidirectional, IntArrayRef batch_sizes, const std::optional& dropout_state_opt, const Tensor& reserve, std::array output_mask ) { AT_ERROR("miopen_rnn_backward: ATen not compiled with MIOpen support."); @@ -444,10 +444,10 @@ std::vector _output_size(const RNNDescriptorParams& rnn, const TensorDe std::tuple miopen_rnn( const Tensor& input_r, TensorList weight, int64_t weight_stride0, - const Tensor& hx, const c10::optional& cx_opt, + const Tensor& hx, const std::optional& cx_opt, int64_t fn_mode, int64_t fn_hidden_size, int64_t fn_num_layers, bool batch_first, double fn_dropout, bool fn_train, bool fn_bidirectional, - IntArrayRef fn_batch_sizes, const c10::optional& fn_dropout_state_opt + IntArrayRef fn_batch_sizes, const std::optional& fn_dropout_state_opt ) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned cx_maybe_owned = at::borrow_from_optional_tensor(cx_opt); @@ -758,9 +758,9 @@ std::vector miopen_rnn_backward_weight( } std::tuple> miopen_rnn_backward( - const Tensor& input, TensorList weight, int64_t weight_stride0, const Tensor& weight_buf, const Tensor& hx, const c10::optional& cx_opt, - const Tensor& output, const c10::optional& grad_output_r_opt, const c10::optional& grad_hy_r_opt, const c10::optional& grad_cy_r_opt, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, - double dropout, bool train, bool bidirectional, IntArrayRef batch_sizes, const c10::optional& dropout_state_opt, + const Tensor& input, TensorList weight, int64_t weight_stride0, const Tensor& weight_buf, const Tensor& hx, const std::optional& cx_opt, + const Tensor& output, const std::optional& grad_output_r_opt, const c10::optional& grad_hy_r_opt, const c10::optional& grad_cy_r_opt, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, + double dropout, bool train, bool bidirectional, IntArrayRef batch_sizes, const std::optional& dropout_state_opt, const Tensor& reserve, std::array output_mask ) { // See [Note: hacky wrapper removal for optional tensor] diff --git a/aten/src/ATen/native/mkldnn/Common.h b/aten/src/ATen/native/mkldnn/Common.h index 4e048ebce7597..baf823a9bcec7 100644 --- a/aten/src/ATen/native/mkldnn/Common.h +++ b/aten/src/ATen/native/mkldnn/Common.h @@ -13,7 +13,7 @@ namespace mkldnn { struct ContextConv final { ideep::tensor weight_packed_; - c10::optional at_bias_; + std::optional at_bias_; std::vector padding_; std::vector stride_; std::vector dilation_; @@ -24,7 +24,7 @@ struct ContextConv final { ContextConv( ideep::tensor&& weight_packed, - c10::optional at_bias, + std::optional at_bias, std::vector padding, std::vector stride, std::vector dilation, diff --git a/aten/src/ATen/native/mkldnn/Conv.cpp b/aten/src/ATen/native/mkldnn/Conv.cpp index 3e41e2f1071d0..09dca06e2b5ae 100644 --- a/aten/src/ATen/native/mkldnn/Conv.cpp +++ b/aten/src/ATen/native/mkldnn/Conv.cpp @@ -22,7 +22,7 @@ namespace at { namespace native { Tensor mkldnn_convolution( - const Tensor& input, const Tensor& weight, const c10::optional& bias_opt, + const Tensor& input, const Tensor& weight, const std::optional& bias_opt, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups) { TORCH_CHECK(false, "mkldnn_convolution_forward: ATen not compiled with MKLDNN support"); } @@ -48,7 +48,7 @@ static std::tuple mkldnn_convolution_backward( REGISTER_NO_CPU_DISPATCH(mkldnn_convolution_backward_stub); static Tensor mkldnn_convolution_transpose( - const Tensor& input, const Tensor& 
weight, const c10::optional& bias_opt, + const Tensor& input, const Tensor& weight, const std::optional& bias_opt, IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups) { TORCH_CHECK(false, "mkldnn_convolution_transpose: ATen not compiled with MKLDNN support"); } @@ -259,16 +259,16 @@ static void _mkldnn_convolution_out ( static Tensor _mkldnn_convolution( const Tensor& input_t, const Tensor& weight_t, - const c10::optional& bias_opt, + const std::optional& bias_opt, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool use_channels_last, c10::string_view attr = "none", - torch::List> scalars = - torch::List>(), - c10::optional algorithm = c10::nullopt) { + torch::List> scalars = + torch::List>(), + std::optional algorithm = c10::nullopt) { ideep::attr_t op_attr = ideep::attr_t(); if (attr != "none") { auto it = fusion_unary_attr_map().find(attr); @@ -324,7 +324,7 @@ static Tensor _mkldnn_convolution( Tensor mkldnn_convolution( const Tensor& input_t, const Tensor& weight_t, - const c10::optional& bias_opt, + const std::optional& bias_opt, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, @@ -345,14 +345,14 @@ namespace{ Tensor mkldnn_convolution_pointwise( const Tensor& input_t, const Tensor& weight_t, - const c10::optional& bias_opt, + const std::optional& bias_opt, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, c10::string_view attr, - torch::List> scalars, - c10::optional algorithm) { + torch::List> scalars, + std::optional algorithm) { c10::impl::ExcludeDispatchKeyGuard edkg(c10::autograd_dispatch_keyset); bool use_channels_last = weight_t.is_mkldnn() || mkldnn_conv_use_channels_last(input_t, weight_t); @@ -382,16 +382,16 @@ Tensor mkldnn_convolution_pointwise_binary( const Tensor& input_t, const Tensor& other_t, const Tensor& weight_t, - const c10::optional& bias_opt, + const std::optional& bias_opt, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, c10::string_view binary_attr, - c10::optional alpha, - c10::optional unary_attr, - torch::List> unary_scalars, - c10::optional unary_algorithm) { + std::optional alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm) { TORCH_CHECK( input_t.ndimension() == 4 || input_t.ndimension() == 5, "mkldnn_convolution_pointwise_binary: currently only support 2d and 3d") @@ -546,16 +546,16 @@ Tensor& mkldnn_convolution_pointwise_binary_( Tensor& other_t, const Tensor& input_t, const Tensor& weight_t, - const c10::optional& bias_opt, + const std::optional& bias_opt, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, c10::string_view binary_attr, - c10::optional alpha, - c10::optional unary_attr, - torch::List> unary_scalars, - c10::optional unary_algorithm) { + std::optional alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm) { // other_t += convolution(...), other_t = unary(other_t) TORCH_CHECK( input_t.ndimension() == 4 || input_t.ndimension() == 5, @@ -664,7 +664,7 @@ std::vector _original_deconv_weight_size( Tensor _mkldnn_convolution_transpose( const Tensor& input_t, const Tensor& weight_t, - const c10::optional& bias_opt, + const std::optional& bias_opt, IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, @@ -672,9 +672,9 @@ Tensor _mkldnn_convolution_transpose( int64_t groups, bool use_channels_last, c10::string_view attr = "none", - torch::List> scalars = - 
torch::List>(), - c10::optional algorithm = c10::nullopt) { + torch::List> scalars = + torch::List>(), + std::optional algorithm = c10::nullopt) { ideep::attr_t op_attr = ideep::attr_t(); if (attr != "none") { auto it = fusion_unary_attr_map().find(attr); @@ -760,15 +760,15 @@ Tensor _mkldnn_convolution_transpose( Tensor mkldnn_convolution_transpose_pointwise( const Tensor& input_t, const Tensor& weight_t, - const c10::optional& bias_opt, + const std::optional& bias_opt, IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, c10::string_view attr, - torch::List> scalars, - c10::optional algorithm) { + torch::List> scalars, + std::optional algorithm) { c10::impl::ExcludeDispatchKeyGuard edkg(c10::autograd_dispatch_keyset); bool use_channels_last = weight_t.is_mkldnn() || mkldnn_conv_use_channels_last(input_t, weight_t); @@ -791,15 +791,15 @@ Tensor mkldnn_convolution_transpose_pointwise( Tensor mkldnn_convolution_transpose_pointwise_meta( const Tensor& input_t, const Tensor& weight_t, - const c10::optional& bias_opt, + const std::optional& bias_opt, IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, c10::string_view attr, - torch::List> scalars, - c10::optional algorithm) { + torch::List> scalars, + std::optional algorithm) { std::vector weight_IOHW_sizes = _original_deconv_weight_size(weight_t, groups); int64_t dim = input_t.ndimension() - 2; @@ -941,7 +941,7 @@ namespace{ Tensor mkldnn_convolution_transpose( const Tensor& input, const Tensor& weight, - const c10::optional& bias_opt, + const std::optional& bias_opt, IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, diff --git a/aten/src/ATen/native/mkldnn/ConvPrepack.cpp b/aten/src/ATen/native/mkldnn/ConvPrepack.cpp index 4fb126f25cf09..cab4f1efa55eb 100644 --- a/aten/src/ATen/native/mkldnn/ConvPrepack.cpp +++ b/aten/src/ATen/native/mkldnn/ConvPrepack.cpp @@ -19,7 +19,7 @@ namespace convolution { c10::intrusive_ptr createConvPrePackOpContext( Tensor weight, - c10::optional bias, + std::optional bias, std::vector stride, std::vector padding, std::vector dilation, @@ -43,7 +43,7 @@ c10::intrusive_ptr createConvPrePackOpContext( ContextConv create( const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, @@ -98,7 +98,7 @@ static void _mkldnn_convolution_out( const ideep::tensor& x, ideep::tensor& y, const ideep::tensor& w, - const c10::optional& b, + const std::optional& b, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, @@ -147,7 +147,7 @@ static void mkldnn_convolution_out( const Tensor& input, ideep::tensor& mkldnn_output, const ideep::tensor& mkldnn_weight, - const c10::optional& bias_opt, + const std::optional& bias_opt, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, @@ -160,7 +160,7 @@ static void mkldnn_convolution_out( c10::impl::ExcludeDispatchKeyGuard edkg(c10::autograd_dispatch_keyset); const ideep::tensor mkldnn_input = itensor_from_tensor(input); - c10::optional mkldnn_bias{c10::nullopt}; + std::optional mkldnn_bias{c10::nullopt}; if (bias.defined()) { mkldnn_bias = itensor_from_tensor(bias); } diff --git a/aten/src/ATen/native/mkldnn/ConvPrepack.h b/aten/src/ATen/native/mkldnn/ConvPrepack.h index 03189c5f5e706..db858b9bb46d9 100644 --- a/aten/src/ATen/native/mkldnn/ConvPrepack.h +++ b/aten/src/ATen/native/mkldnn/ConvPrepack.h @@ -14,7 +14,7 @@ namespace convolution { 
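The ConvPrepack hunk above also shows the common shape of the optional-bias plumbing in these MKLDNN paths: the `ideep` bias slot starts out as an empty optional and is only filled when `bias.defined()` is true, so the downstream call branches on presence rather than on an undefined tensor. Below is a standalone sketch of that flow; the container type and helper are invented stand-ins, not ideep's API.

```cpp
// Standalone sketch of the conditional optional-bias pattern above. FakeItensor
// and run_conv are invented stand-ins for ideep::tensor and the real primitive.
#include <iostream>
#include <optional>
#include <vector>

using FakeItensor = std::vector<float>;

void run_conv(const FakeItensor& weight, const std::optional<FakeItensor>& bias) {
  std::cout << "conv: weight[" << weight.size() << "], "
            << (bias.has_value() ? "with bias" : "no bias") << "\n";
}

int main() {
  const bool bias_defined = true;  // stands in for bias.defined()

  std::optional<FakeItensor> mkldnn_bias{std::nullopt};
  if (bias_defined) {
    mkldnn_bias = FakeItensor{0.1f, 0.2f, 0.3f};
  }
  run_conv(FakeItensor(8, 1.0f), mkldnn_bias);
}
```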
c10::intrusive_ptr createConvPrePackOpContext( Tensor weight, - c10::optional bias, + std::optional bias, std::vector stride, std::vector padding, std::vector dilation, @@ -28,7 +28,7 @@ Tensor conv_run( ContextConv create( const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, diff --git a/aten/src/ATen/native/mkldnn/Linear.cpp b/aten/src/ATen/native/mkldnn/Linear.cpp index 71d033fca3b86..70434fde7e479 100644 --- a/aten/src/ATen/native/mkldnn/Linear.cpp +++ b/aten/src/ATen/native/mkldnn/Linear.cpp @@ -26,7 +26,7 @@ namespace native { Tensor mkldnn_linear( const Tensor& self, - const Tensor& weight, const c10::optional& bias_opt) { + const Tensor& weight, const std::optional& bias_opt) { TORCH_CHECK(false, "mkldnn_linear: ATen not compiled with MKLDNN support"); } Tensor mkldnn_linear_backward_input( @@ -58,7 +58,7 @@ namespace native { Tensor mkldnn_linear( const Tensor& self, - const Tensor& weight_t, const c10::optional& bias_opt) { + const Tensor& weight_t, const std::optional& bias_opt) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); const Tensor& bias = *bias_maybe_owned; @@ -183,10 +183,10 @@ std::tuple mkldnn_linear_backward( static Tensor mkldnn_linear_pointwise( const Tensor& input_t, const Tensor& weight_t, - const c10::optional& bias_opt, + const std::optional& bias_opt, c10::string_view attr, - torch::List> scalars, - c10::optional algorithm) { + torch::List> scalars, + std::optional algorithm) { auto input = input_t.contiguous(); auto input_size = input.sizes(); @@ -218,7 +218,7 @@ static Tensor mkldnn_linear_pointwise( const ideep::tensor mkldnn_input = itensor_view_from_dense(input_reshaped); - c10::optional mkldnn_bias{c10::nullopt}; + std::optional mkldnn_bias{c10::nullopt}; if (bias.defined()) { mkldnn_bias = itensor_from_tensor(bias); } @@ -258,7 +258,7 @@ static Tensor mkldnn_linear_pointwise_binary( const Tensor& input_t, const Tensor& other_t, const Tensor& weight_t, - const c10::optional& bias_opt, + const std::optional& bias_opt, c10::string_view attr) { c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); @@ -303,7 +303,7 @@ static Tensor mkldnn_linear_pointwise_binary( const ideep::tensor mkldnn_other = itensor_from_tensor(other_reshaped); const ideep::tensor mkldnn_input = itensor_view_from_dense(input_reshaped); - c10::optional mkldnn_bias{c10::nullopt}; + std::optional mkldnn_bias{c10::nullopt}; if (bias.defined()) { mkldnn_bias = itensor_from_tensor(bias); } @@ -339,7 +339,7 @@ static Tensor mkl_linear( const Tensor& self, const Tensor& mkl_weight_t, const Tensor& origin_weight_t, - const c10::optional& bias_opt, + const std::optional& bias_opt, const int64_t prepack_batch_size) { c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); @@ -427,7 +427,7 @@ static Tensor mkl_linear( const Tensor& self, const Tensor& mkl_weight_t, const Tensor& origin_weight_t, - const c10::optional& bias_opt, + const std::optional& bias_opt, const int64_t prepack_batch_size) { TORCH_CHECK(false, "mkl_linear: ATen not compiled with MKL support"); } diff --git a/aten/src/ATen/native/mkldnn/MKLDNNCommon.cpp b/aten/src/ATen/native/mkldnn/MKLDNNCommon.cpp index 061d154f3b40f..e6fdbb0656c07 100644 --- a/aten/src/ATen/native/mkldnn/MKLDNNCommon.cpp +++ b/aten/src/ATen/native/mkldnn/MKLDNNCommon.cpp @@ -61,7 +61,7 @@ ideep::tensor::data_type 
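// --- Editor's illustrative sketch (not part of the patch) ---------------------
// The hunks above and below follow one mechanical pattern: optional-tensor
// parameters change from c10::optional<Tensor> to std::optional<Tensor>, while
// the bodies keep unwrapping the optional bias through
// at::borrow_from_optional_tensor, as mkldnn_linear does above. Roughly (the
// function name linear_like is hypothetical, used only to show the shape of
// the pattern):

#include <ATen/ATen.h>
#include <optional>

at::Tensor linear_like(
    const at::Tensor& input,
    const at::Tensor& weight,
    const std::optional<at::Tensor>& bias_opt) {
  // See [Note: hacky wrapper removal for optional tensor]: borrow the bias if
  // present, otherwise obtain an undefined Tensor without copying anything.
  c10::MaybeOwned<at::Tensor> bias_maybe_owned =
      at::borrow_from_optional_tensor(bias_opt);
  const at::Tensor& bias = *bias_maybe_owned;
  return bias.defined() ? at::addmm(bias, input, weight.t())
                        : at::mm(input, weight.t());
}
// ------------------------------------------------------------------------------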
get_mkldnn_dtype(ScalarType type) { } } -Tensor new_with_itensor_mkldnn(ideep::tensor&& it, c10::optional dtype, c10::optional device) { +Tensor new_with_itensor_mkldnn(ideep::tensor&& it, std::optional dtype, c10::optional device) { // NOTE: int32_t dims from ideep::tensor but sizes needs int64_t // TODO: support int64_t dims in ideep::tensor to avoid extra conversion auto dims = it.get_dims(); diff --git a/aten/src/ATen/native/mkldnn/MKLDNNCommon.h b/aten/src/ATen/native/mkldnn/MKLDNNCommon.h index 5e9044ce908aa..f41c4ae075be5 100644 --- a/aten/src/ATen/native/mkldnn/MKLDNNCommon.h +++ b/aten/src/ATen/native/mkldnn/MKLDNNCommon.h @@ -29,7 +29,7 @@ static inline ideep::tensor::data_type get_mkldnn_dtype(const Tensor& t) { } // Construct aten MKL-DNN tensor given an ideep tensor -TORCH_API Tensor new_with_itensor_mkldnn(ideep::tensor&& it, c10::optional dtype, c10::optional device); +TORCH_API Tensor new_with_itensor_mkldnn(ideep::tensor&& it, std::optional dtype, c10::optional device); // Retrieve `ideep::tensor` from MKL-DNN tensor TORCH_API ideep::tensor& itensor_from_mkldnn(const Tensor& mkldnn_tensor); diff --git a/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp b/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp index b2901bc522be2..f01cb8da1241f 100644 --- a/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp +++ b/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp @@ -24,7 +24,7 @@ namespace at { namespace native { #if AT_MKLDNN_ENABLED() -Tensor mkldnn_to_dense(const Tensor& mkldnn_tensor, c10::optional dtype, c10::optional masked_grad) { +Tensor mkldnn_to_dense(const Tensor& mkldnn_tensor, std::optional dtype, c10::optional masked_grad) { TORCH_CHECK(mkldnn_tensor.scalar_type() == ScalarType::Float || mkldnn_tensor.scalar_type() == ScalarType::BFloat16 || mkldnn_tensor.scalar_type() == ScalarType::Half || @@ -73,7 +73,7 @@ Tensor mkldnn_to_dense(const Tensor& mkldnn_tensor, c10::optional dt return cpu_tensor.contiguous().resize_(dims, c10::MemoryFormat::Contiguous); } -Tensor dense_to_mkldnn(const Tensor& cpu_tensor, c10::optional dtype) { +Tensor dense_to_mkldnn(const Tensor& cpu_tensor, std::optional dtype) { TORCH_CHECK(cpu_tensor.device().is_cpu(), "dense_to_mkldnn expects CPU tensor input"); TORCH_CHECK(cpu_tensor.layout() == Layout::Strided, @@ -256,7 +256,7 @@ static Tensor mkldnn_reorder_conv_weight( static Tensor mkldnn_reorder_linear_weight( const Tensor& self, - c10::optional batch_size_opt) { + std::optional batch_size_opt) { mkldnn_check_low_precision(self.scalar_type(), "mkldnn_reorder_linear_weight"); auto out_features = self.size(0); auto in_features = self.size(1); @@ -525,11 +525,11 @@ TORCH_LIBRARY_IMPL(mkldnn, CPU, m) { #else -Tensor mkldnn_to_dense(const Tensor& mkldnn_tensor, c10::optional dtype, c10::optional masked_grad) { +Tensor mkldnn_to_dense(const Tensor& mkldnn_tensor, std::optional dtype, c10::optional masked_grad) { TORCH_CHECK(false, "MKL-DNN build is disabled"); } -Tensor dense_to_mkldnn(const Tensor& cpu_tensor, c10::optional dtype) { +Tensor dense_to_mkldnn(const Tensor& cpu_tensor, std::optional dtype) { TORCH_CHECK(false, "MKL-DNN build is disabled"); } diff --git a/aten/src/ATen/native/mkldnn/Normalization.cpp b/aten/src/ATen/native/mkldnn/Normalization.cpp index 0aced614a0ea3..e684a931f7752 100644 --- a/aten/src/ATen/native/mkldnn/Normalization.cpp +++ b/aten/src/ATen/native/mkldnn/Normalization.cpp @@ -21,7 +21,7 @@ namespace at { namespace native { std::tuple mkldnn_batch_norm( - const Tensor& self, const c10::optional& weight_opt, const 
c10::optional& bias_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, + const Tensor& self, const std::optional& weight_opt, const c10::optional& bias_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, bool train, double momentum, double eps) { @@ -30,7 +30,7 @@ std::tuple mkldnn_batch_norm( std::tuple mkldnn_batch_norm_backward( const Tensor& grad_output, - const Tensor& input, const c10::optional& weight_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, const c10::optional& save_mean_opt, const c10::optional& save_invstd_opt, + const Tensor& input, const std::optional& weight_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, const c10::optional& save_mean_opt, const c10::optional& save_invstd_opt, bool train, double eps, std::array grad_input_mask) { @@ -45,7 +45,7 @@ static std::tuple mkldnn_layer_norm_last_index_weight_bi } std::tuple _mkldnn_batch_norm_legit( - const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, + const Tensor& input, const std::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, bool train, double momentum, double eps) { @@ -54,7 +54,7 @@ std::tuple _mkldnn_batch_norm_legit( std::tuple _mkldnn_batch_norm_legit_no_stats( - const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, + const Tensor& input, const std::optional& weight_opt, const c10::optional& bias_opt, bool train, double momentum, double eps) { @@ -62,15 +62,15 @@ std::tuple _mkldnn_batch_norm_legit_no_stats( } std::tuple _batch_norm_with_update_mkldnn( - const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, + const Tensor& input, const std::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, double momentum, double eps) { TORCH_CHECK(false, "_batch_norm_with_update_mkldnn: ATen not compiled with MKLDNN support"); } std::tuple _new_batch_norm_backward_mkldnn( const Tensor& grad_output, const Tensor& input, const Tensor& weight, - const c10::optional& running_mean_opt, const c10::optional& running_var_opt, - const c10::optional& save_mean_opt, const c10::optional& save_var_opt, + const std::optional& running_mean_opt, const c10::optional& running_var_opt, + const std::optional& save_mean_opt, const c10::optional& save_var_opt, bool update, double eps, std::array grad_input_mask, const Tensor& reserve) { TORCH_CHECK(false, "_new_batch_norm_backward_mkldnn: ATen not compiled with MKLDNN support"); } @@ -131,7 +131,7 @@ std::tuple mkldnn_layer_norm_last_index_weight_bias_f32( std::tuple mkldnn_batch_norm( - const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, + const Tensor& input, const std::optional& weight_opt, const c10::optional& bias_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, bool train, double momentum, double eps) { @@ -209,7 +209,7 @@ std::tuple mkldnn_batch_norm( std::tuple _batch_norm_with_update_mkldnn( - const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, + const Tensor& input, const std::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, double momentum, double eps) { Tensor output, save_mean, save_var; std::tie(output, save_mean, 
save_var) = @@ -220,7 +220,7 @@ std::tuple _batch_norm_with_update_mkldnn( std::tuple _mkldnn_batch_norm_legit( - const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, + const Tensor& input, const std::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, bool train, double momentum, double eps) { @@ -229,7 +229,7 @@ std::tuple _mkldnn_batch_norm_legit( std::tuple _mkldnn_batch_norm_legit_no_stats( - const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, + const Tensor& input, const std::optional& weight_opt, const c10::optional& bias_opt, bool train, double momentum, double eps) { @@ -239,15 +239,15 @@ std::tuple _mkldnn_batch_norm_legit_no_stats( std::tuple _new_batch_norm_backward_mkldnn( const Tensor& grad_output, const Tensor& input, const Tensor& weight, - const c10::optional& running_mean_opt, const c10::optional& running_var_opt, - const c10::optional& save_mean_opt, const c10::optional& save_var_opt, + const std::optional& running_mean_opt, const c10::optional& running_var_opt, + const std::optional& save_mean_opt, const c10::optional& save_var_opt, bool update, double eps, std::array grad_input_mask, const Tensor& reserve) { return mkldnn_batch_norm_backward(grad_output, input, weight, running_mean_opt, running_var_opt, save_mean_opt, save_var_opt, update, eps, grad_input_mask); } std::tuple mkldnn_batch_norm_backward(const Tensor& grad_output, - const Tensor& input, const c10::optional& weight_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, const c10::optional& save_mean_opt, const c10::optional& save_invstd_opt, + const Tensor& input, const std::optional& weight_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, const c10::optional& save_mean_opt, const c10::optional& save_invstd_opt, bool train, double eps, std::array grad_input_mask) { diff --git a/aten/src/ATen/native/mkldnn/OpContext.cpp b/aten/src/ATen/native/mkldnn/OpContext.cpp index 3de67ceacf002..820f1273b0cb5 100644 --- a/aten/src/ATen/native/mkldnn/OpContext.cpp +++ b/aten/src/ATen/native/mkldnn/OpContext.cpp @@ -9,7 +9,7 @@ namespace mkldnn { c10::intrusive_ptr MkldnnConvOpContext::create_context( at::Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& padding, std::vector&& stride, std::vector&& dilation, diff --git a/aten/src/ATen/native/mkldnn/OpContext.h b/aten/src/ATen/native/mkldnn/OpContext.h index 21e8cc78a5134..5ae5344ccf509 100644 --- a/aten/src/ATen/native/mkldnn/OpContext.h +++ b/aten/src/ATen/native/mkldnn/OpContext.h @@ -17,7 +17,7 @@ const static std::map fusion_attr_map = { using SerializationTypeConvPrePack = std::tuple< Tensor, - c10::optional, + std::optional, std::vector, std::vector, std::vector, @@ -28,7 +28,7 @@ using SerializationTypeConvPrePack = std::tuple< class ConvOpContext : public torch::jit::CustomClassHolder { protected: Tensor orig_weight_; - c10::optional orig_bias_; + std::optional orig_bias_; std::vector stride_; std::vector padding_; std::vector dilation_; @@ -60,7 +60,7 @@ class MkldnnConvOpContext final : public ConvOpContext { public: MkldnnConvOpContext( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& padding, std::vector&& stride, std::vector&& dilation, @@ -83,7 +83,7 @@ class MkldnnConvOpContext final : public ConvOpContext { static c10::intrusive_ptr create_context( Tensor&& weight, - c10::optional&& bias, 
+ std::optional&& bias, std::vector&& padding, std::vector&& stride, std::vector&& dilation, diff --git a/aten/src/ATen/native/mkldnn/Pooling.cpp b/aten/src/ATen/native/mkldnn/Pooling.cpp index 7b59d7b85fe93..e1a5cfe5dff32 100644 --- a/aten/src/ATen/native/mkldnn/Pooling.cpp +++ b/aten/src/ATen/native/mkldnn/Pooling.cpp @@ -56,7 +56,7 @@ Tensor mkldnn_avg_pool2d( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { TORCH_CHECK(false, "mkldnn_avg_pool2d: ATen not compiled with MKLDNN support"); } @@ -66,7 +66,7 @@ Tensor& mkldnn_avg_pool2d_out(const Tensor& self, IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override, + std::optional divisor_override, Tensor& output) { TORCH_CHECK(false, "mkldnn_avg_pool2d_out: ATen not compiled with MKLDNN support"); } @@ -78,7 +78,7 @@ Tensor mkldnn_avg_pool3d( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { TORCH_CHECK(false, "mkldnn_avg_pool3d: ATen not compiled with MKLDNN support"); } @@ -88,7 +88,7 @@ Tensor& mkldnn_avg_pool3d_out(const Tensor& self, IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override, + std::optional divisor_override, Tensor& output) { TORCH_CHECK(false, "mkldnn_avg_pool3d_out: ATen not compiled with MKLDNN support"); } @@ -140,7 +140,7 @@ Tensor& mkldnn_avg_pool2d_backward_out(const Tensor & grad_output, IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override, + std::optional divisor_override, Tensor & grad_input) { TORCH_CHECK(false, "mkldnn_avg_pool2d_backward_out: ATen not compiled with MKLDNN support"); } @@ -153,7 +153,7 @@ Tensor mkldnn_avg_pool2d_backward( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { TORCH_CHECK(false, "mkldnn_avg_pool2d_backward: ATen not compiled with MKLDNN support"); } @@ -164,7 +164,7 @@ Tensor& mkldnn_avg_pool3d_backward_out(const Tensor & grad_output, IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override, + std::optional divisor_override, Tensor & grad_input) { TORCH_CHECK(false, "mkldnn_avg_pool3d_backward_out: ATen not compiled with MKLDNN support"); } @@ -177,7 +177,7 @@ Tensor mkldnn_avg_pool3d_backward( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { TORCH_CHECK(false, "mkldnn_avg_pool3d_backward: ATen not compiled with MKLDNN support"); } @@ -418,7 +418,7 @@ Tensor mkldnn_avg_pool2d( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { TORCH_CHECK(!divisor_override.has_value(), "mkldnn_avg_pool2d operator does not support divisor"); if (input.scalar_type() == ScalarType::BFloat16) { @@ -443,7 +443,7 @@ Tensor& mkldnn_avg_pool2d_out(const Tensor& input, IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override, + std::optional divisor_override, Tensor& output) { TORCH_CHECK(false, "mkldnn_avg_pool2d_out: in-place mkldnn operations are not supported yet"); } @@ -455,7 +455,7 @@ Tensor mkldnn_avg_pool3d( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { TORCH_CHECK(!divisor_override.has_value(), "mkldnn_avg_pool3d 
operator does not support divisor"); if (input.scalar_type() == ScalarType::BFloat16) { TORCH_CHECK(mkldnn_bf16_device_check(), @@ -479,7 +479,7 @@ Tensor& mkldnn_avg_pool3d_out(const Tensor& input, IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override, + std::optional divisor_override, Tensor& output) { TORCH_CHECK(false, "mkldnn_avg_pool3d_out: in-place mkldnn operations are not supported yet"); } @@ -579,7 +579,7 @@ Tensor mkldnn_avg_pool2d_backward( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { return _mkldnn_pooling_backward( grad_output, grad_output, @@ -600,7 +600,7 @@ Tensor& mkldnn_avg_pool2d_backward_out(const Tensor & grad_output, IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override, + std::optional divisor_override, Tensor & grad_input) { TORCH_CHECK(false, "mkldnn_avg_pool2d_backward_out: in-place mkldnn operations are not supported yet"); } @@ -613,7 +613,7 @@ Tensor mkldnn_avg_pool3d_backward( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { return _mkldnn_pooling_backward( grad_output, grad_output, @@ -634,7 +634,7 @@ Tensor& mkldnn_avg_pool3d_backward_out(const Tensor & grad_output, IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override, + std::optional divisor_override, Tensor & grad_input) { TORCH_CHECK(false, "mkldnn_avg_pool3d_backward_out: in-place mkldnn operations are not supported yet"); } diff --git a/aten/src/ATen/native/mkldnn/RNN.cpp b/aten/src/ATen/native/mkldnn/RNN.cpp index afea7f91e79ea..b35504bc19cce 100644 --- a/aten/src/ATen/native/mkldnn/RNN.cpp +++ b/aten/src/ATen/native/mkldnn/RNN.cpp @@ -55,9 +55,9 @@ std::tuple mkldnn_rnn_la const Tensor& output, const Tensor& hy_, const Tensor& cy_, - const c10::optional& grad_output_r_opt, - const c10::optional& grad_hy_r_opt, - const c10::optional& grad_cy_r_opt, + const std::optional& grad_output_r_opt, + const std::optional& grad_hy_r_opt, + const std::optional& grad_cy_r_opt, bool reverse, int64_t mode, int64_t hidden_size, @@ -306,9 +306,9 @@ std::tuple mkldnn_rnn_la const Tensor& output, const Tensor& hy_, const Tensor& cy_, - const c10::optional& grad_output_r_opt, - const c10::optional& grad_hy_r_opt, - const c10::optional& grad_cy_r_opt, + const std::optional& grad_output_r_opt, + const std::optional& grad_hy_r_opt, + const std::optional& grad_cy_r_opt, bool reverse, int64_t mode, int64_t hidden_size, diff --git a/aten/src/ATen/native/mkldnn/TensorFactories.cpp b/aten/src/ATen/native/mkldnn/TensorFactories.cpp index 65a22aa74ed53..81dc5d8880cfa 100644 --- a/aten/src/ATen/native/mkldnn/TensorFactories.cpp +++ b/aten/src/ATen/native/mkldnn/TensorFactories.cpp @@ -12,7 +12,7 @@ namespace at { namespace native { #if AT_MKLDNN_ENABLED() -Tensor empty_mkldnn(IntArrayRef sizes, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory, c10::optional optional_memory_format) { +Tensor empty_mkldnn(IntArrayRef sizes, std::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory, c10::optional optional_memory_format) { TORCH_CHECK( !optional_memory_format.has_value(), "'memory_format' argument is incompatible with mkldnn tensor"); @@ -26,7 +26,7 @@ Tensor empty_mkldnn(IntArrayRef sizes, c10::optional dtype, c10::opt #else -Tensor empty_mkldnn(IntArrayRef sizes, 
c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory, c10::optional optional_memory_format) { +Tensor empty_mkldnn(IntArrayRef sizes, std::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory, c10::optional optional_memory_format) { TORCH_CHECK(false, "empty_mkldnn: MKL-DNN build is disabled"); } diff --git a/aten/src/ATen/native/mkldnn/TensorShape.cpp b/aten/src/ATen/native/mkldnn/TensorShape.cpp index ac47648294242..d653d2588ba22 100644 --- a/aten/src/ATen/native/mkldnn/TensorShape.cpp +++ b/aten/src/ATen/native/mkldnn/TensorShape.cpp @@ -26,7 +26,7 @@ Tensor mkldnn_reshape(const Tensor& self, IntArrayRef size) { TORCH_CHECK(false, "mkldnn_reshape: ATen not compiled with MKLDNN support"); } -Tensor mkldnn_clone(const Tensor& self, c10::optional optional_memory_format) { +Tensor mkldnn_clone(const Tensor& self, std::optional optional_memory_format) { TORCH_CHECK(false, "mkldnn_clone: ATen not compiled with MKLDNN support"); } @@ -65,7 +65,7 @@ Tensor mkldnn_reshape(const Tensor& self, IntArrayRef size) { self.options().device_opt()); } -Tensor mkldnn_clone(const Tensor& self, c10::optional optional_memory_format) { +Tensor mkldnn_clone(const Tensor& self, std::optional optional_memory_format) { TORCH_CHECK( !optional_memory_format.has_value(), "unsupported memory format option ", diff --git a/aten/src/ATen/native/mkldnn/Utils.cpp b/aten/src/ATen/native/mkldnn/Utils.cpp index 400eb9165f347..6578b23ff9c92 100644 --- a/aten/src/ATen/native/mkldnn/Utils.cpp +++ b/aten/src/ATen/native/mkldnn/Utils.cpp @@ -79,14 +79,14 @@ void check_mkldnn_binary_fusion_inputs( #if AT_MKLDNN_ENABLED() #define ATTR_FUNC(NAME) \ - [](torch::List> scalars, \ - c10::optional algorithm) { \ + [](torch::List> scalars, \ + std::optional algorithm) { \ return ideep::attr_t::fuse_##NAME(); \ } AttrFunction attr_func_leaky_relu = - [](torch::List> scalars, - c10::optional algorithm) { + [](torch::List> scalars, + std::optional algorithm) { TORCH_CHECK( scalars.size() == 1 && scalars[0].get().toOptional().has_value(), @@ -97,8 +97,8 @@ AttrFunction attr_func_leaky_relu = }; AttrFunction attr_func_hardtanh = - [](torch::List> scalars, - c10::optional algorithm) { + [](torch::List> scalars, + std::optional algorithm) { TORCH_CHECK( scalars.size() == 2 && scalars[0].get().toOptional().has_value() && @@ -112,8 +112,8 @@ AttrFunction attr_func_hardtanh = return ideep::attr_t::fuse_clamp(lower_bound_value, upper_bound_value); }; -AttrFunction attr_func_gelu = [](torch::List> scalars, - c10::optional algorithm) { +AttrFunction attr_func_gelu = [](torch::List> scalars, + std::optional algorithm) { TORCH_CHECK( algorithm.has_value(), "gelu is expected to have one str input: algorithm"); @@ -131,8 +131,8 @@ AttrFunction attr_func_gelu = [](torch::List> scalars, }; AttrFunction attr_func_hardsigmoid = - [](torch::List> scalars, - c10::optional algorithm) { + [](torch::List> scalars, + std::optional algorithm) { ideep::attr_t attr; ideep::post_ops po; po.append_eltwise( diff --git a/aten/src/ATen/native/mkldnn/Utils.h b/aten/src/ATen/native/mkldnn/Utils.h index aa804d6bc1877..75f1b2c1b709a 100644 --- a/aten/src/ATen/native/mkldnn/Utils.h +++ b/aten/src/ATen/native/mkldnn/Utils.h @@ -73,8 +73,8 @@ static inline Tensor may_convert_to_default_contiguous_strides(const Tensor& inp #if AT_MKLDNN_ENABLED() using AttrFunction = std::function>, - c10::optional)>; + torch::List>, + std::optional)>; const std::map& fusion_unary_attr_map(); diff --git 
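// --- Editor's illustrative sketch (not part of the patch) ---------------------
// After this change the fusion-attribute helpers in mkldnn/Utils.h and
// Utils.cpp have roughly the following shape. This is a reduced sketch; the
// exact includes are an assumption, and the "relu" lambda simply mirrors what
// the ATTR_FUNC(NAME) macro above expands to:

#include <functional>
#include <optional>
#include <ATen/ATen.h>
#include <ATen/core/List.h>
#include <ideep.hpp>

using AttrFunction = std::function<ideep::attr_t(
    torch::List<std::optional<at::Scalar>>,
    std::optional<c10::string_view>)>;

// A unary post-op that needs neither scalars nor an algorithm string.
AttrFunction attr_func_relu_sketch =
    [](torch::List<std::optional<at::Scalar>> /*scalars*/,
       std::optional<c10::string_view> /*algorithm*/) {
      return ideep::attr_t::fuse_relu();
    };
// ------------------------------------------------------------------------------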
a/aten/src/ATen/native/mkldnn/xpu/Conv.cpp b/aten/src/ATen/native/mkldnn/xpu/Conv.cpp index 8ac19605b1c79..7f84704d30907 100644 --- a/aten/src/ATen/native/mkldnn/xpu/Conv.cpp +++ b/aten/src/ATen/native/mkldnn/xpu/Conv.cpp @@ -563,7 +563,7 @@ Tensor _convolution( Tensor convolution_overrideable( const Tensor& input_r, const Tensor& weight_r, - const c10::optional& bias_r_opt, + const std::optional& bias_r_opt, IntArrayRef stride_, IntArrayRef padding_, IntArrayRef dilation_, diff --git a/aten/src/ATen/native/mps/TensorFactory.cpp b/aten/src/ATen/native/mps/TensorFactory.cpp index 6fe145a6cc556..03ff521db1046 100644 --- a/aten/src/ATen/native/mps/TensorFactory.cpp +++ b/aten/src/ATen/native/mps/TensorFactory.cpp @@ -50,7 +50,7 @@ static inline void maybe_resize_storage_mps(TensorImpl* self, uint64_t new_size) inline TensorImpl* resize_impl_mps_( TensorImpl* self, IntArrayRef size, - c10::optional stride, + std::optional stride, bool device_guard = true) { if (self->sizes() == size && (!stride || self->strides() == stride)) { return self; @@ -72,11 +72,11 @@ inline TensorImpl* resize_impl_mps_( Tensor empty_mps( IntArrayRef size, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt, - c10::optional memory_format_opt) { + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt, + std::optional memory_format_opt) { return at::detail::empty_mps(size, dtype_opt, layout_opt, device_opt, pin_memory_opt, memory_format_opt); } @@ -84,10 +84,10 @@ Tensor empty_mps( Tensor empty_strided_mps( IntArrayRef size, IntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt) { + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt) { check_size_nonnegative(size); // empty memory formatempty auto t = at::native::empty_mps( @@ -103,7 +103,7 @@ Tensor empty_strided_mps( const Tensor& resize_mps_( const Tensor& self, IntArrayRef size, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { if (self.has_names()) { return resize_named_tensor_(self, size, optional_memory_format); } @@ -142,17 +142,17 @@ Tensor& set_storage_mps_(Tensor& result, Storage storage, int64_t storage_offset checkSetStorage(result, storage, storage_offset, size, stride); //std::cout << "set storage_mps " << storage_offset << " stride " << stride << std::endl; result.unsafeGetTensorImpl()->set_storage_offset(storage_offset); - c10::optional stride_opt = stride.data() != nullptr ? - c10::optional(stride) : c10::nullopt; + std::optional stride_opt = stride.data() != nullptr ? 
+ std::optional(stride) : c10::nullopt; at::native::resize_impl_mps_(result.unsafeGetTensorImpl(), size, stride_opt); return result; } Tensor _efficientzerotensor_mps(IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { auto device_ = device_or_default(device); auto allocator = at::native::ZeroTensorAllocator(device_); auto dtype_ = dtype_or_default(dtype); diff --git a/aten/src/ATen/native/mps/operations/Quantized.mm b/aten/src/ATen/native/mps/operations/Quantized.mm index 3c77ec67b42df..4d0f569ea062b 100644 --- a/aten/src/ATen/native/mps/operations/Quantized.mm +++ b/aten/src/ATen/native/mps/operations/Quantized.mm @@ -12,6 +12,8 @@ #include #include +// #define _CAPTURE_KERNEL 1 + namespace at::native { using namespace mps; @@ -82,6 +84,85 @@ kernel void int4pack_mm( INSTANTIATE_INT4MM(bfloat, 128); INSTANTIATE_INT4MM(bfloat, 256); #endif + +template +struct Vec4Type {}; + +template<> +struct Vec4Type { + using type = float4; +}; + +template<> +struct Vec4Type { + using type = half4; +}; + +#if __METAL_VERSION__ >= 310 +template<> +struct Vec4Type { + using type = bfloat4; +}; +#endif + +template +kernel void +int8pack_mm(constant T *A [[buffer(0)]], constant char *B [[buffer(1)]], + constant T *scales [[buffer(2)]], + device T *outputData [[buffer(3)]], + constant int3 &sizes [[buffer(4)]], + uint2 group_index [[threadgroup_position_in_grid]], + uint2 threadgroup_index [[thread_position_in_threadgroup]]) { + using vecT = typename Vec4Type::type; + const uint lda = sizes.y; + const uint ldc = sizes.z; + int out_idx = (group_index.x * blockSize + threadgroup_index.x) * 4; + int n = out_idx % sizes.z; + int m = out_idx / sizes.z; + // Offset pointers + A += m * lda; + B += n * lda; + outputData += m *ldc; + + float4 rc = 0; + for (unsigned k = threadgroup_index.y * 4; k < sizes.y; k += 4 * blockSize) { + threadgroup_barrier(mem_flags::mem_none); + auto a_val = float4(*reinterpret_cast(A + k)); + float4x4 b_val; + for (int i = 0; i < 4; ++i) { + b_val[i] = float4(*reinterpret_cast(B + i * lda + k)); + } + rc += transpose(b_val) * a_val; + } + + // Accumulate results acorss SIMD group? 
(8 threads using vec4) + threadgroup float4 tgp_memory[blockSize][blockSize]; + tgp_memory[threadgroup_index.x][threadgroup_index.y] = rc; + threadgroup_barrier(mem_flags::mem_threadgroup); + if (threadgroup_index.y == 0) { + for (int i = 1; i < blockSize; i++) { + rc += tgp_memory[threadgroup_index.x][i]; + } + *reinterpret_cast(outputData + n) = + vecT(rc * float4(*reinterpret_cast(scales + n))); + } +} + +#define INSTANTIATE_INT8MM(DTYPE) \ + template [[host_name("int8pack_mm_" #DTYPE)]] kernel void \ + int8pack_mm( \ + constant DTYPE * A [[buffer(0)]], constant char *B [[buffer(1)]], \ + constant DTYPE *scales [[buffer(2)]], \ + device DTYPE *outputData [[buffer(3)]], \ + constant int3 &sizes [[buffer(4)]], \ + uint2 group_index [[threadgroup_position_in_grid]], \ + uint2 threadgroup_index [[thread_position_in_threadgroup]]); + +INSTANTIATE_INT8MM(half); +INSTANTIATE_INT8MM(float); +#if __METAL_VERSION__ >= 310 +INSTANTIATE_INT8MM(bfloat); +#endif )METAL_QUANTIZED"); Tensor _weight_int4pack_mm_mps(const Tensor& A, const Tensor& B, int64_t qGroupSize, const Tensor& qScaleAndZeros) { @@ -114,8 +195,7 @@ Tensor _weight_int4pack_mm_mps(const Tensor& A, const Tensor& B, int64_t qGroupS auto C = at::empty({M, N}, A.options()); MPSStream* mpsStream = getCurrentMPSStream(); - std::array sizes = {static_cast(M), static_cast(K), static_cast(N)}; - static bool firstCapture = false; + std::array sizes = {static_cast(M), static_cast(K), static_cast(N), 0}; dispatch_sync_with_rethrow(mpsStream->queue(), ^() { @autoreleasepool { #if _CAPTURE_KERNEL @@ -163,7 +243,35 @@ Tensor _weight_int8pack_mm_mps(const Tensor& A, const Tensor& B, const Tensor& s TORCH_CHECK(scales.dim() == 1 && scales.size(0) == N, __func__, " : expect scales to be 1d tensor with size ", N); auto C = at::empty({M, N}, A.options()); - + TORCH_CHECK(N % 32 == 0 && K % 32 == 0); +#if 1 + MPSStream* mpsStream = getCurrentMPSStream(); + std::array sizes = {static_cast(M), static_cast(K), static_cast(N), 0}; + dispatch_sync_with_rethrow(mpsStream->queue(), ^() { + @autoreleasepool { +#if _CAPTURE_KERNEL + if (getMPSProfiler().isCaptureEnabled()) { + getMPSProfiler().startCapture(fmt::format("int8pack_mm_{}x{}x{}", M, N, K), mpsStream); + } +#endif + id computeEncoder = mpsStream->commandEncoder(); + const std::string kernel = fmt::format("int8pack_mm_{}", scalarToMetalTypeString(A)); + id quantizedPSO = lib.getPipelineStateForFunc(kernel); + [computeEncoder setComputePipelineState:quantizedPSO]; + mtl_setBuffer(computeEncoder, A, 0); + mtl_setBuffer(computeEncoder, B, 1); + mtl_setBuffer(computeEncoder, scales, 2); + mtl_setBuffer(computeEncoder, C, 3); + [computeEncoder setBytes:sizes.data() length:sizeof(uint32_t) * sizes.size() atIndex:4]; + [computeEncoder dispatchThreads:MTLSizeMake(M * N / 4, 8, 1) threadsPerThreadgroup:MTLSizeMake(8, 8, 1)]; +#if _CAPTURE_KERNEL + if (getMPSProfiler().isCapturing()) { + getMPSProfiler().stopCapture(mpsStream); + } +#endif + } + }); +#else struct CachedGraph : public MPSCachedGraph { CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {} MPSGraphTensor *ATensor = nil, *BTensor = nil, *scalesTensor = nil; @@ -193,6 +301,7 @@ Tensor _weight_int8pack_mm_mps(const Tensor& A, const Tensor& B, const Tensor& s dictionaryFromPlaceholders(APlaceholder, BPlaceholder, scalesPlaceholder), outputPlaceholder); } +#endif return C; } diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 119c0b8572301..1ea973f93261b 100644 --- 
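// --- Editor's illustrative sketch (not part of the patch) ---------------------
// A plain-C++ reference for what the new Metal int8pack_mm kernel and the
// _weight_int8pack_mm_mps host path above compute: a weight-only-quantized
// matmul C[m][n] = (sum_k A[m][k] * B[n][k]) * scales[n], with A in
// float/half/bfloat, B an int8 weight matrix stored row-major as [N, K], and
// one scale per output channel. The Metal kernel additionally vectorizes over
// four output columns and reduces partial sums across an 8x8 threadgroup.

#include <cstdint>

void int8pack_mm_reference(
    const float* A,        // [M, K] activations
    const int8_t* B,       // [N, K] quantized weights
    const float* scales,   // [N] per-output-channel scales
    float* C,              // [M, N] output
    int64_t M, int64_t K, int64_t N) {
  for (int64_t m = 0; m < M; ++m) {
    for (int64_t n = 0; n < N; ++n) {
      float acc = 0.0f;
      for (int64_t k = 0; k < K; ++k) {
        acc += A[m * K + k] * static_cast<float>(B[n * K + k]);
      }
      C[m * N + n] = acc * scales[n];
    }
  }
}
// ------------------------------------------------------------------------------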
a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -15527,6 +15527,7 @@ CPU: foobar autogen: _foobar.out +# Fused Optimizer CUDA kernels. - func: _fused_adam_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> () # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now). variants: function @@ -15581,12 +15582,6 @@ CUDA: _fused_sgd_kernel_cuda_ autogen: _fused_sgd.tensor_lr, _fused_sgd.tensor_lr_out -- func: _fused_adagrad_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] state_sums, Tensor(d!)[] state_steps, *, float lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> () - variants: function - dispatch: - CPU: _fused_adagrad_kernel_cpu_ - autogen: _fused_adagrad, _fused_adagrad.out - # This op is ONLY used by pytorch/XLA in functionalization, and should never show up in vanilla eager mode or in any pytorch tracing contexts. - func: _propagate_xla_data(Tensor input, Tensor output) -> () variants: function diff --git a/aten/src/ATen/native/nested/NestedTensorBackward.cpp b/aten/src/ATen/native/nested/NestedTensorBackward.cpp index e4465b792c21e..488dab9e37cb2 100644 --- a/aten/src/ATen/native/nested/NestedTensorBackward.cpp +++ b/aten/src/ATen/native/nested/NestedTensorBackward.cpp @@ -197,8 +197,8 @@ std::tuple layer_norm_backward_nested( IntArrayRef normalized_shape, const Tensor& mean, const Tensor& rstd, - const c10::optional& weight_opt /* optional */, - const c10::optional& bias_opt /*{ optional */, + const std::optional& weight_opt /* optional */, + const std::optional& bias_opt /*{ optional */, std::array grad_input_mask) { // For NestedTensors weight and bias are non nested. 
auto* nt_impl_grad = get_nested_tensor_impl(grad); diff --git a/aten/src/ATen/native/nested/NestedTensorFactories.cpp b/aten/src/ATen/native/nested/NestedTensorFactories.cpp index 45425ed63315c..40e5082832021 100644 --- a/aten/src/ATen/native/nested/NestedTensorFactories.cpp +++ b/aten/src/ATen/native/nested/NestedTensorFactories.cpp @@ -8,11 +8,11 @@ namespace native { static TensorOptions verify_empty_parameters( const at::Tensor& self, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional optional_memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional optional_memory_format) { TensorOptions options_ = TensorOptions() .dtype(dtype) .layout(layout) @@ -37,11 +37,11 @@ static TensorOptions verify_empty_parameters( Tensor empty_like_nested( const Tensor& self, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional optional_memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional optional_memory_format) { auto options = verify_empty_parameters( self, dtype, layout, device, pin_memory, optional_memory_format); auto self_nt = get_nested_tensor_impl(self); @@ -83,12 +83,12 @@ static inline Device ensure_has_index(Device device) { Tensor _to_copy_nested( const Tensor& self, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, bool non_blocking, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { TORCH_CHECK( !layout.has_value() || self.layout() == layout.value(), "to(options) doesn't support converting to a different layout, " @@ -132,7 +132,7 @@ Tensor& copy_nested_(Tensor& self, const Tensor& src, bool non_blocking) { Tensor clone_nested( const Tensor& self, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { auto memory_format = optional_memory_format.value_or(c10::MemoryFormat::Preserve); auto self_ptr = get_nested_tensor_impl(self); if (memory_format == c10::MemoryFormat::Preserve || diff --git a/aten/src/ATen/native/nested/NestedTensorMath.cpp b/aten/src/ATen/native/nested/NestedTensorMath.cpp index 7d3e826ef53e9..1974b4fe2cea0 100644 --- a/aten/src/ATen/native/nested/NestedTensorMath.cpp +++ b/aten/src/ATen/native/nested/NestedTensorMath.cpp @@ -113,10 +113,10 @@ bool NestedTensor_nested_tensor_from_mask_left_aligned(const Tensor& t, const Te Tensor _nested_tensor_from_tensor_list( TensorList list, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { for (const auto i : c10::irange(list.size())) { if (i > 0) { int64_t dim_i = list[i].dim(); @@ -146,8 +146,8 @@ Tensor _nested_tensor_from_tensor_list( std::tuple nested_layer_norm( const Tensor& input, IntArrayRef normalized_shape, - const c10::optional& weight_opt, - const c10::optional& bias_opt, + const std::optional& weight_opt, + const std::optional& bias_opt, double eps) { TORCH_CHECK(weight_opt && bias_opt, "NestedTensor layer_norm requires weight and bias"); const auto& weight = *weight_opt; @@ -356,7 +356,7 @@ Tensor NestedTensor_sum_dim_CPU( const Tensor& self, OptionalIntArrayRef opt_dims, bool keepdim, - 
c10::optional dtype) { + std::optional dtype) { // Only allow reductions across the last dim auto dims = opt_dims.value_or(IntArrayRef{}); TORCH_CHECK( @@ -479,7 +479,7 @@ Tensor select_nested(const Tensor& self, int64_t dim, int64_t index) { } -std::tuple native_dropout_nested(const Tensor& input, double p, c10::optional train) { +std::tuple native_dropout_nested(const Tensor& input, double p, std::optional train) { auto input_ptr = get_nested_tensor_impl(input); const Tensor& input_buffer = input_ptr-> get_unsafe_storage_as_tensor(), & sizemat = input_ptr->get_nested_sizes(), @@ -587,7 +587,7 @@ Tensor squeeze_dim_nested(const Tensor& self, IntArrayRef dims) { // if tensor.size(dim) != 1 torch.squeeze will return the result, we do the same here for (const auto d : c10::irange(ndim)) { if (mask.test(d)) { - c10::optional size_dim = self_ptr->opt_size(d); + std::optional size_dim = self_ptr->opt_size(d); if (!(size_dim.has_value() && *size_dim == 1)) { mask.reset(d); } @@ -925,7 +925,7 @@ Tensor reshape_as_nested(const Tensor& self, const Tensor& other) { // if an accessor is provided in the future, can replace this std::vector sizes; for (int64_t i = 0; i < other_ptr->dim(); i++) { - c10::optional opt_size = other_ptr->opt_size(i); + std::optional opt_size = other_ptr->opt_size(i); if (opt_size.has_value()) { sizes.push_back(*opt_size); } @@ -937,7 +937,7 @@ Tensor reshape_as_nested(const Tensor& self, const Tensor& other) { return self.reshape(sizes); } -Tensor& normal_nested_(Tensor& self, double mean, double std, c10::optional gen) { +Tensor& normal_nested_(Tensor& self, double mean, double std, std::optional gen) { const auto& self_buf = get_nested_tensor_impl(self)->get_buffer(); self_buf.normal_(mean, std, gen); return self; diff --git a/aten/src/ATen/native/nested/NestedTensorMatmul.cpp b/aten/src/ATen/native/nested/NestedTensorMatmul.cpp index 88e2a94570185..aa683ff854ef6 100644 --- a/aten/src/ATen/native/nested/NestedTensorMatmul.cpp +++ b/aten/src/ATen/native/nested/NestedTensorMatmul.cpp @@ -320,7 +320,7 @@ Tensor& matmul_out_nested( // if an accessor is provided in the future, can replace this std::vector sizes; for (int64_t i = 0; i < function_result_ptr->dim(); i++) { - c10::optional opt_size = function_result_ptr->opt_size(i); + std::optional opt_size = function_result_ptr->opt_size(i); if (opt_size.has_value()) { sizes.push_back(*opt_size); } else { diff --git a/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.cpp b/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.cpp index 96d13c366f7ac..6285f2ca1223e 100644 --- a/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.cpp +++ b/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.cpp @@ -59,7 +59,7 @@ inline void check_nested_tensor_matrix_constraints( Tensor nested_linear( const Tensor& input, const Tensor& weight, - const c10::optional& bias_opt) { + const std::optional& bias_opt) { check_nested_tensor_matrix_constraints(input, weight, c10::string_view{"Linear"}); auto* nt_input = get_nested_tensor_impl(input); const Tensor& input_buffer = nt_input->get_buffer(); @@ -93,7 +93,7 @@ Tensor NestedTensor_times_Tensor_plus_Tensor_addmm( const Tensor& mat2, const c10::Scalar& beta, const c10::Scalar& alpha, - c10::optional use_gelu) { + std::optional use_gelu) { // Interesting case: alpha * NT * T + beta * T const auto* nt_mat1 = get_nested_tensor_impl_or_null(mat1); TORCH_INTERNAL_ASSERT(nt_mat1 != nullptr); @@ -184,7 +184,7 @@ Tensor NestedTensor_softmax_dropout(const Tensor& self, 
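// --- Editor's illustrative sketch (not part of the patch) ---------------------
// Several of the NestedTensor hunks above rely on NestedTensorImpl::opt_size(d),
// which after this change returns std::optional<int64_t>: a value when
// dimension d is regular across all components, std::nullopt when it is
// ragged. Shape-reconstruction code such as reshape_as_nested therefore maps
// nullopt to -1 (the wildcard accepted by reshape). A reduced sketch of that
// loop, under those assumptions:

#include <cstdint>
#include <optional>
#include <vector>

std::vector<int64_t> sizes_with_wildcards(
    const std::vector<std::optional<int64_t>>& opt_sizes) {
  std::vector<int64_t> sizes;
  sizes.reserve(opt_sizes.size());
  for (const std::optional<int64_t>& s : opt_sizes) {
    sizes.push_back(s.has_value() ? *s : -1);  // -1 marks a ragged dimension
  }
  return sizes;
}
// ------------------------------------------------------------------------------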
const Tensor& query) { } Tensor NestedTensor_softmax_dropout_cuda(const Tensor& self, const Tensor& query) { - c10::optional attn_mask; + std::optional attn_mask; attn_mask = NestedTensor_to_mask(query, 2, self.size(2)); attn_mask = attn_mask->to(query.device(), /*non-blocking=*/true); @@ -211,7 +211,7 @@ Tensor NestedTensor_batch_offsets_from_size_tensor( } -Tensor NestedTensor_to_mask(const Tensor& nt, c10::optional mask_dim, c10::optional mask_dim_length) { +Tensor NestedTensor_to_mask(const Tensor& nt, std::optional mask_dim, c10::optional mask_dim_length) { auto* nt_impl = get_nested_tensor_impl(nt); TORCH_CHECK(nested_tensor_impl_is_contiguous(nt_impl), "to_mask only works on contiguous NestedTensors."); TORCH_CHECK( diff --git a/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.h b/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.h index cee721d7bc8f6..b0df6975304d2 100644 --- a/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.h +++ b/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.h @@ -36,7 +36,7 @@ Tensor NestedTensor_times_Tensor_plus_Tensor_addmm( const Tensor& mat2, const c10::Scalar& beta, const c10::Scalar& alpha, - c10::optional use_gelu = c10::nullopt); + std::optional use_gelu = c10::nullopt); Tensor NestedTensor_add_NestedTensor_in_place( const Tensor& self, @@ -50,7 +50,7 @@ Tensor NestedTensor_from_padded_tensor_cpu( const Tensor& padded, const NestedTensorImpl& nt); -Tensor NestedTensor_to_mask(const Tensor& nt, c10::optional mask_dim, c10::optional mask_dim_length); +Tensor NestedTensor_to_mask(const Tensor& nt, std::optional mask_dim, c10::optional mask_dim_length); template void remove_padding_kernelLauncher( diff --git a/aten/src/ATen/native/nested/NestedTensorUnaryOps.cpp b/aten/src/ATen/native/nested/NestedTensorUnaryOps.cpp index c41b6f15214aa..dc31b2c0de240 100644 --- a/aten/src/ATen/native/nested/NestedTensorUnaryOps.cpp +++ b/aten/src/ATen/native/nested/NestedTensorUnaryOps.cpp @@ -132,7 +132,7 @@ Tensor cos_nested(const Tensor& self) { return map_nt(self, at::cos); } -Tensor _pin_memory_nested(const Tensor& self, c10::optional device) { +Tensor _pin_memory_nested(const Tensor& self, std::optional device) { auto* nt_input = get_nested_tensor_impl(self); const auto& input_buffer = nt_input->get_unsafe_storage_as_tensor(); return wrap_buffer( diff --git a/aten/src/ATen/native/nested/NestedTensorUtils.cpp b/aten/src/ATen/native/nested/NestedTensorUtils.cpp index a5394404543f8..6539475cd1fdd 100644 --- a/aten/src/ATen/native/nested/NestedTensorUtils.cpp +++ b/aten/src/ATen/native/nested/NestedTensorUtils.cpp @@ -59,7 +59,7 @@ std::vector NestedTensor_get_max_size(const NestedTensorImpl& nt) { } int64_t get_consistent_last_dim_of_nested_tensor(const NestedTensorImpl& nt) { - c10::optional last_dim = nt.opt_size(-1); + std::optional last_dim = nt.opt_size(-1); TORCH_CHECK( last_dim != c10::nullopt, "Expected all tensors in nested tensor to have the same trailing dimension, instead last dimension equals: ", diff --git a/aten/src/ATen/native/nested/NestedTensorUtils.h b/aten/src/ATen/native/nested/NestedTensorUtils.h index 3b4f18f11b64b..572b0a827dd06 100644 --- a/aten/src/ATen/native/nested/NestedTensorUtils.h +++ b/aten/src/ATen/native/nested/NestedTensorUtils.h @@ -340,10 +340,10 @@ inline TensorNode get_nested_tensor_structure(at::Tensor tensor) { inline Tensor wrap_tensor_node( TensorNode tensor_node, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + 
std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { TORCH_CHECK( !tensor_node.is_leaf(), "Expected TensorNode to wrap a list of Tensors."); TensorOptions options_ = diff --git a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp index 0da0c3e361d1f..977ace14fb34d 100644 --- a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp +++ b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp @@ -234,7 +234,7 @@ _scaled_dot_product_flash_attention_nestedtensor_cuda( double dropout_p, bool is_causal, bool return_debug_mask, - c10::optional scale) { + std::optional scale) { Tensor query_buffer_reshaped, key_buffer_reshaped, value_buffer_reshaped, cumulative_sequence_length_q, cumulative_sequence_length_kv, output_shape; int64_t max_seqlen_batch_q{0}, max_seqlen_batch_kv{0}; @@ -285,11 +285,11 @@ _scaled_dot_product_efficient_attention_nestedtensor_cuda( const Tensor& query, const Tensor& key, const Tensor& value, - const c10::optional& attn_bias, + const std::optional& attn_bias, bool compute_log_sumexp, double dropout_p, bool is_causal, - c10::optional scale) { + std::optional scale) { Tensor query_buffer_reshaped, key_buffer_reshaped, value_buffer_reshaped, cumulative_sequence_length_q, cumulative_sequence_length_kv, output_shape; int64_t max_seqlen_batch_q{0}; @@ -344,7 +344,7 @@ std::tuple _scaled_dot_product_flash_attenti bool is_causal, const at::Tensor& philox_seed, const at::Tensor& philox_offset, - c10::optional scale){ + std::optional scale){ if (!grad_out_.defined()) { return std::make_tuple(Tensor{}, Tensor{}, Tensor{}); } diff --git a/aten/src/ATen/native/quantized/PackedParams.h b/aten/src/ATen/native/quantized/PackedParams.h index a442628573fec..d73bc0adbc4ef 100644 --- a/aten/src/ATen/native/quantized/PackedParams.h +++ b/aten/src/ATen/native/quantized/PackedParams.h @@ -111,11 +111,11 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder { return output; } - virtual std::tuple> unpack() = 0; + virtual std::tuple> unpack() = 0; - virtual c10::optional bias() = 0; + virtual std::optional bias() = 0; - virtual void set_bias(c10::optional /*bias*/) { + virtual void set_bias(std::optional /*bias*/) { throw std::runtime_error( "set_bias is not implemented for this packed " "parameter type"); @@ -136,7 +136,7 @@ struct ConvPackedParamsBase : public torch::jit::CustomClassHolder { const at::Tensor& input, bool reduce_range) = 0; - virtual std::tuple> unpack() = 0; + virtual std::tuple> unpack() = 0; virtual torch::List stride() const = 0; virtual torch::List padding() const = 0; diff --git a/aten/src/ATen/native/quantized/QTensor.cpp b/aten/src/ATen/native/quantized/QTensor.cpp index 9705de0a4a54d..a6817984c12d2 100644 --- a/aten/src/ATen/native/quantized/QTensor.cpp +++ b/aten/src/ATen/native/quantized/QTensor.cpp @@ -188,13 +188,13 @@ QScheme qscheme_quant(const Tensor& self) { Tensor quantized_clone( const Tensor& self, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { auto memory_format = optional_memory_format.value_or(MemoryFormat::Contiguous); // TODO: To support all features of MemoryFormat::Preserve we need to add // _empty_affine_quantized_strided function and use it similarly to - // Tensor clone(const Tensor& src, c10::optional + // Tensor clone(const Tensor& src, std::optional // optional_memory_format) if (self.is_non_overlapping_and_dense()) -> // 
_empty_affine_quantized_strided if (memory_format == MemoryFormat::Preserve) { diff --git a/aten/src/ATen/native/quantized/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/quantized/TensorAdvancedIndexing.cpp index 4f06b133771d9..11b005dc924c9 100644 --- a/aten/src/ATen/native/quantized/TensorAdvancedIndexing.cpp +++ b/aten/src/ATen/native/quantized/TensorAdvancedIndexing.cpp @@ -121,7 +121,7 @@ Tensor & masked_fill__quantized_cuda(Tensor& self, const Tensor & mask, const Te return masked_fill_impl_quantized_cuda(self, mask, value.item()); } -Tensor& _index_put_impl_quantized_cpu_(Tensor & self, const torch::List>& indices, const Tensor & value, const bool accumulate, const bool unsafe) { +Tensor& _index_put_impl_quantized_cpu_(Tensor & self, const torch::List>& indices, const Tensor & value, const bool accumulate, const bool unsafe) { TORCH_CHECK_INDEX(indices.size() <= (size_t)self.dim(), "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); TORCH_CHECK(!value.is_quantized(), "Value argument for quantized input_put should not be quantized"); TORCH_CHECK(self.qscheme() == c10::kPerTensorAffine, "index_put for quantized tensors is currently only supported for per tensor quantized tensors"); @@ -145,7 +145,7 @@ Tensor& _index_put_impl_quantized_cpu_(Tensor & self, const torch::List& index: indices) { + for (const std::optional& index: indices) { if (index.has_value()) { at::assert_no_overlap(self, *index); } @@ -157,7 +157,7 @@ Tensor& _index_put_impl_quantized_cpu_(Tensor & self, const torch::List>& indices, const Tensor & value, const bool accumulate, const bool unsafe) { +Tensor& _index_put_impl_quantized_cuda_(Tensor & self, const torch::List>& indices, const Tensor & value, const bool accumulate, const bool unsafe) { TORCH_CHECK_INDEX(indices.size() <= (size_t)self.dim(), "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); TORCH_CHECK(!value.is_quantized(), "Value argument for quantized input_put should not be quantized"); TORCH_CHECK(self.qscheme() == c10::kPerTensorAffine, "index_put for quantized tensors is currently only supported for per tensor quantized tensors"); @@ -183,7 +183,7 @@ Tensor& _index_put_impl_quantized_cuda_(Tensor & self, const torch::List& index: indices) { + for (const std::optional& index: indices) { if (index.has_value()) { at::assert_no_overlap(self, *index); } diff --git a/aten/src/ATen/native/quantized/TensorCompare.cpp b/aten/src/ATen/native/quantized/TensorCompare.cpp index def1622863e1d..2cc6ebcda603f 100644 --- a/aten/src/ATen/native/quantized/TensorCompare.cpp +++ b/aten/src/ATen/native/quantized/TensorCompare.cpp @@ -47,7 +47,7 @@ Tensor& min_quantized_unary_out(const Tensor& self, Tensor& out) { std::tuple sort_quantized_cpu_stable( const Tensor& self, - c10::optional stable, + std::optional stable, int64_t dim, bool descending) { auto [sort_int, sort_indicies] = diff --git a/aten/src/ATen/native/quantized/TensorFactories.cpp b/aten/src/ATen/native/quantized/TensorFactories.cpp index e79f657e0de95..54dcdc37c5b23 100644 --- a/aten/src/ATen/native/quantized/TensorFactories.cpp +++ b/aten/src/ATen/native/quantized/TensorFactories.cpp @@ -14,13 +14,13 @@ namespace native { // change to use quantizer Tensor empty_affine_quantized( IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, double scale, int64_t zero_point, - 
c10::optional optional_memory_format) { + std::optional optional_memory_format) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options_ = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -44,11 +44,11 @@ Tensor empty_per_channel_affine_quantized( const Tensor& scales, const Tensor& zero_points, int64_t axis, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional optional_memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional optional_memory_format) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options_ = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -70,11 +70,11 @@ Tensor empty_per_channel_affine_quantized( Tensor empty_unknown_quantized( IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional optional_memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional optional_memory_format) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options_ = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -93,10 +93,10 @@ Tensor empty_unknown_quantized( Tensor empty_strided_unknown_quantized( IntArrayRef size, IntArrayRef strided, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { TORCH_CHECK(false, "empty_strided not supported on quantized tensors yet see https://github.com/pytorch/pytorch/issues/74540") @@ -105,13 +105,13 @@ Tensor empty_strided_unknown_quantized( // Provide better error message if dtype is wrong Tensor empty_affine_quantized_other_backends_stub( IntArrayRef, - c10::optional, - c10::optional, - c10::optional, - c10::optional, + std::optional, + std::optional, + std::optional, + std::optional, double, int64_t, - c10::optional) { + std::optional) { TORCH_CHECK(false, "Creation of quantized tensor requires quantized dtype like torch.quint8"); } @@ -120,11 +120,11 @@ Tensor empty_per_channel_affine_quantized_other_backends_stub( const Tensor&, const Tensor&, int64_t, - c10::optional, - c10::optional, - c10::optional, - c10::optional, - c10::optional) { + std::optional, + std::optional, + std::optional, + std::optional, + std::optional) { TORCH_CHECK(false, "Creation of quantized tensor requires quantized dtype like torch.quint8"); } @@ -133,11 +133,11 @@ Tensor empty_per_channel_affine_quantized_other_backends_stub( Tensor empty_quantized( IntArrayRef size, const Tensor& qtensor, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional memory_format) { TensorOptions specified_options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); diff --git a/aten/src/ATen/native/quantized/cpu/AveragePool2d.cpp b/aten/src/ATen/native/quantized/cpu/AveragePool2d.cpp index 754c7d6bd529b..d7b53f8457868 100644 --- a/aten/src/ATen/native/quantized/cpu/AveragePool2d.cpp +++ b/aten/src/ATen/native/quantized/cpu/AveragePool2d.cpp @@ -47,7 +47,7 @@ static void 
avg_pool2d_out_frame( int padW, int padH, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { Tensor input_contig = input.contiguous(); auto input_data = input_contig.data_ptr(); auto output_data = output.data_ptr(); @@ -185,7 +185,7 @@ Tensor q_avg_pool2d( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) auto [kW, kH] = get_kernel(kernel_size); auto [dW, dH] = get_stride(stride, kW, kH); @@ -265,7 +265,7 @@ Tensor qnnpack_avg_pool2d( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { auto [kW, kH] = get_kernel(kernel_size); auto [dW, dH] = get_stride(stride, kW, kH); auto [padW, padH] = get_padding(padding); @@ -362,7 +362,7 @@ Tensor avg_pool2d_quantized_cpu( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { Tensor output; #ifdef USE_PYTORCH_QNNPACK if (at::globalContext().qEngine() == at::QEngine::QNNPACK && diff --git a/aten/src/ATen/native/quantized/cpu/AveragePool3d.cpp b/aten/src/ATen/native/quantized/cpu/AveragePool3d.cpp index 875ae28e46a96..b83e3e313cd08 100644 --- a/aten/src/ATen/native/quantized/cpu/AveragePool3d.cpp +++ b/aten/src/ATen/native/quantized/cpu/AveragePool3d.cpp @@ -100,7 +100,7 @@ Tensor q_avg_pool3d( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { auto [kW, kH, kD] = get_kernel(kernel_size); auto [dW, dH, dD] = get_stride(stride, kW, kH, kD); auto [padW, padH, padD] = get_padding(padding); @@ -165,7 +165,7 @@ Tensor avg_pool3d_quantized_cpu( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { Tensor output; AT_DISPATCH_QINT_TYPES(input.scalar_type(), "avg_pool3d_quantized_cpu", [&]() { output = q_avg_pool3d( diff --git a/aten/src/ATen/native/quantized/cpu/EmbeddingPackedParams.h b/aten/src/ATen/native/quantized/cpu/EmbeddingPackedParams.h index 140b716df2691..e6f47d611a19f 100644 --- a/aten/src/ATen/native/quantized/cpu/EmbeddingPackedParams.h +++ b/aten/src/ATen/native/quantized/cpu/EmbeddingPackedParams.h @@ -6,19 +6,19 @@ struct EmbeddingPackedParamsBase : public torch::jit::CustomClassHolder { virtual at::Tensor embeddingbag_byte( const at::Tensor& indices, - const c10::optional& offsets, + const std::optional& offsets, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset, bool is_embedding_op) = 0; virtual at::Tensor embeddingbag_4bit( const at::Tensor& indices, - const c10::optional& offsets, + const std::optional& offsets, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset, bool is_embedding_op) = 0; diff --git a/aten/src/ATen/native/quantized/cpu/LinearUnpackImpl.cpp b/aten/src/ATen/native/quantized/cpu/LinearUnpackImpl.cpp index 7bff3e3d4b443..df74b10d70f97 100644 --- a/aten/src/ATen/native/quantized/cpu/LinearUnpackImpl.cpp +++ b/aten/src/ATen/native/quantized/cpu/LinearUnpackImpl.cpp @@ 
-22,7 +22,7 @@ int register_linear_params(); #ifdef USE_FBGEMM -std::tuple> PackedLinearWeight::unpack() { +std::tuple> PackedLinearWeight::unpack() { auto packB = w.get(); int64_t N = static_cast(packB->numCols()); @@ -53,16 +53,16 @@ std::tuple> PackedLinearWeight::unpack() { // (QLinearUnpackWeightInt8): "); packB->unpack(weight_ptr_int8); - return std::tuple>( + return std::tuple>( weight_origin, bias_); } #endif // USE_FBGEMM #ifdef USE_PYTORCH_QNNPACK -std::tuple> PackedLinearWeightsQnnp:: +std::tuple> PackedLinearWeightsQnnp:: unpack() { if (orig_weight.defined()) { - return std::tuple>( + return std::tuple>( orig_weight, bias_); } else { // Unpacking requires reverting *make_zero_points_and_scales_tensor* @@ -110,14 +110,14 @@ std::tuple> PackedLinearWeightsQnnp:: weight_ptr_int8[i] = (int8_t)(weight_ptr_int8[i] - 128); } - return std::tuple>( + return std::tuple>( weight_origin, bias_); } } #endif // USE_PYTORCH_QNNPACK #ifdef USE_FBGEMM -std::tuple> PackedLinearWeightFp16:: +std::tuple> PackedLinearWeightFp16:: unpack() { auto& packed_weight_ptr = w; @@ -135,8 +135,8 @@ std::tuple> PackedLinearWeightFp16:: #endif // USE_FBGEMM #if AT_MKLDNN_ENABLED() -std::tuple> PackedLinearWeightsOnednn::unpack() { - return std::tuple>( +std::tuple> PackedLinearWeightsOnednn::unpack() { + return std::tuple>( orig_weight_, orig_bias_); } #endif // #if AT_MKLDNN_ENABLED() diff --git a/aten/src/ATen/native/quantized/cpu/Normalization.cpp b/aten/src/ATen/native/quantized/cpu/Normalization.cpp index 0f5fb9884a9c5..e92a9669cce04 100644 --- a/aten/src/ATen/native/quantized/cpu/Normalization.cpp +++ b/aten/src/ATen/native/quantized/cpu/Normalization.cpp @@ -54,8 +54,8 @@ void compute_fused_params( template Tensor q_batch_norm1d_impl( Tensor qx, - c10::optional mb_weight, - c10::optional mb_bias, + std::optional mb_weight, + std::optional mb_bias, Tensor mean, Tensor var, double eps, @@ -162,8 +162,8 @@ Tensor q_batch_norm1d_impl( template Tensor q_batch_norm2d_impl( Tensor qx, - c10::optional mb_weight, - c10::optional mb_bias, + std::optional mb_weight, + std::optional mb_bias, Tensor mean, Tensor var, double eps, @@ -256,8 +256,8 @@ Tensor q_batch_norm2d_impl( template Tensor q_batch_norm3d_impl( Tensor qx, - c10::optional mb_weight, - c10::optional mb_bias, + std::optional mb_weight, + std::optional mb_bias, Tensor mean, Tensor var, double eps, @@ -353,8 +353,8 @@ Tensor q_batch_norm3d_impl( template Tensor q_batch_norm_impl( Tensor qx, - c10::optional mb_weight, - c10::optional mb_bias, + std::optional mb_weight, + std::optional mb_bias, Tensor mean, Tensor var, double eps, @@ -380,7 +380,7 @@ Tensor q_batch_norm_impl( } // namespace Tensor quantized_batch_norm( - const Tensor& qx, const c10::optional& weight_opt /* optional */, const c10::optional& bias_opt /* optional */, + const Tensor& qx, const std::optional& weight_opt /* optional */, const c10::optional& bias_opt /* optional */, const Tensor& mean /* optional */, const Tensor& var /* optional */, double eps, diff --git a/aten/src/ATen/native/quantized/cpu/OnednnUtils.h b/aten/src/ATen/native/quantized/cpu/OnednnUtils.h index 8887bb83deb91..535ccaf9acba1 100644 --- a/aten/src/ATen/native/quantized/cpu/OnednnUtils.h +++ b/aten/src/ATen/native/quantized/cpu/OnednnUtils.h @@ -119,9 +119,9 @@ enum PostOps { struct PackedLinearWeightsOnednn : public LinearPackedParamsBase { PackedLinearWeightsOnednn( std::unique_ptr weight, - c10::optional bias, + std::optional bias, at::Tensor orig_weight, - c10::optional orig_bias) + std::optional orig_bias) : 
weight_(std::move(weight)), bias_(std::move(bias)), orig_weight_(std::move(orig_weight)), @@ -129,9 +129,9 @@ struct PackedLinearWeightsOnednn : public LinearPackedParamsBase { cache_initialized_flag = std::make_unique(); } std::unique_ptr weight_; - c10::optional bias_; + std::optional bias_; at::Tensor orig_weight_; - c10::optional orig_bias_; + std::optional orig_bias_; at::Tensor apply( at::Tensor input, @@ -156,15 +156,15 @@ struct PackedLinearWeightsOnednn : public LinearPackedParamsBase { double output_scale, int64_t output_zero_point); - std::tuple> unpack() override; + std::tuple> unpack() override; - c10::optional bias() override { + std::optional bias() override { return orig_bias_; } static c10::intrusive_ptr prepack( at::Tensor weight, - c10::optional bias); + std::optional bias); private: LinearPrimitiveCache prim_cache; @@ -189,9 +189,9 @@ template struct PackedConvWeightsOnednn : public ConvPackedParamsBase { PackedConvWeightsOnednn( std::unique_ptr weight, - c10::optional bias, + std::optional bias, at::Tensor orig_weight, - c10::optional orig_bias, + std::optional orig_bias, torch::List stride, torch::List padding, torch::List output_padding, @@ -212,9 +212,9 @@ struct PackedConvWeightsOnednn : public ConvPackedParamsBase { } std::unique_ptr weight_; - c10::optional bias_; + std::optional bias_; at::Tensor orig_weight_; - c10::optional orig_bias_; + std::optional orig_bias_; torch::List stride_; torch::List padding_; torch::List output_padding_; @@ -248,11 +248,11 @@ struct PackedConvWeightsOnednn : public ConvPackedParamsBase { double output_scale, int64_t output_zero_point); - std::tuple> unpack() override; + std::tuple> unpack() override; static c10::intrusive_ptr> prepack( at::Tensor weight, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List output_padding, @@ -292,7 +292,7 @@ struct PackedConvWeightsOnednn : public ConvPackedParamsBase { template at::Tensor apply_impl( const at::Tensor& input, - const c10::optional& accum, + const std::optional& accum, double output_scale, int64_t output_zero_point); @@ -316,7 +316,7 @@ static ideep::attr_t create_attr_by_post_op( int64_t input1_zero_point, const ideep::tensor::desc& input1_desc, const c10::string_view& unary_post_op, - const torch::List>& unary_post_op_args, + const torch::List>& unary_post_op_args, const c10::string_view& unary_post_op_algorithm) { using ideep::tensor; if (binary_post_op == "none") { @@ -470,7 +470,7 @@ at::Tensor _qconv_prepack_onednn( torch::List padding, torch::List dilation, int64_t groups, - c10::optional> input_shape=c10::nullopt); + std::optional> input_shape=c10::nullopt); static at::Tensor _quantized_convolution_onednn( at::Tensor act, // contains quantized values but not QTensor @@ -479,7 +479,7 @@ static at::Tensor _quantized_convolution_onednn( at::Tensor weight, // MKLDNN tensor with quantized values at::Tensor weight_scales, at::Tensor weight_zero_points, - c10::optional bias, // Bias is packed if not None + std::optional bias, // Bias is packed if not None torch::List stride, torch::List padding, torch::List dilation, @@ -487,14 +487,14 @@ static at::Tensor _quantized_convolution_onednn( int64_t groups, double output_scale, int64_t output_zero_point, - c10::optional accum=c10::nullopt, // accum to fused with conv add + std::optional accum=c10::nullopt, // accum to fused with conv add double accum_scale=1.0, int64_t accum_zero_point=0, bool fp32_output=false, - c10::optional binary_attr=c10::nullopt, - c10::optional 
binary_alpha=c10::nullopt, - c10::optional unary_attr=c10::nullopt, - torch::List> unary_scalars=torch::List>(), - c10::optional unary_algorithm=c10::nullopt); + std::optional binary_attr=c10::nullopt, + std::optional binary_alpha=c10::nullopt, + std::optional unary_attr=c10::nullopt, + torch::List> unary_scalars=torch::List>(), + std::optional unary_algorithm=c10::nullopt); #endif // #if AT_MKLDNN_ENABLED() diff --git a/aten/src/ATen/native/quantized/cpu/QnnpackUtils.h b/aten/src/ATen/native/quantized/cpu/QnnpackUtils.h index 88ff258be891f..b217c757740b3 100644 --- a/aten/src/ATen/native/quantized/cpu/QnnpackUtils.h +++ b/aten/src/ATen/native/quantized/cpu/QnnpackUtils.h @@ -38,7 +38,7 @@ struct PackedLinearWeightsQnnp : public LinearPackedParamsBase { std::unique_ptr w, at::Tensor orig_weight, at::Tensor bias, - c10::optional input_scale, + std::optional input_scale, at::Tensor w_scales, std::vector&& w_zps) : w(std::move(w)), @@ -57,7 +57,7 @@ struct PackedLinearWeightsQnnp : public LinearPackedParamsBase { at::Tensor orig_weight; at::Tensor bias_; bool per_channel_; - c10::optional input_scale; + std::optional input_scale; at::Tensor w_scales; std::vector w_zero_points; std::vector requantization_scales; @@ -76,15 +76,15 @@ struct PackedLinearWeightsQnnp : public LinearPackedParamsBase { at::Tensor apply_dynamic(at::Tensor input, bool reduce_range=false) override; at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range=false) override; - std::tuple> unpack() override; + std::tuple> unpack() override; - c10::optional bias() override { + std::optional bias() override { return bias_; } static c10::intrusive_ptr prepack( at::Tensor weight, - c10::optional bias); + std::optional bias); bool per_channel() const { return per_channel_; @@ -125,7 +125,7 @@ struct PackedConvWeightsQnnp : public ConvPackedParamsBase { torch::List dilation, int64_t groups, bool transpose, - c10::optional input_scale, + std::optional input_scale, std::vector kernel, at::Tensor w_scale, std::vector&& w_zps, @@ -302,7 +302,7 @@ struct PackedConvWeightsQnnp : public ConvPackedParamsBase { int64_t groups_; bool transpose_; bool is_per_channel_; - c10::optional input_scale; + std::optional input_scale; std::vector kernel_; at::Tensor w_scales; std::vector w_zero_points; @@ -323,11 +323,11 @@ struct PackedConvWeightsQnnp : public ConvPackedParamsBase { const at::Tensor& input, bool reduce_range=false) override; - std::tuple> unpack() override; + std::tuple> unpack() override; static c10::intrusive_ptr> prepack( at::Tensor weight, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List output_padding, @@ -438,7 +438,7 @@ Tensor qnnpack_avg_pool2d( IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override); + std::optional divisor_override); } // qnnp_avgpool_helper } // namespace native } // namespace at diff --git a/aten/src/ATen/native/quantized/cpu/QuantizedOps.h b/aten/src/ATen/native/quantized/cpu/QuantizedOps.h index 3ef8a3f4f4f42..9257f57b65dcd 100644 --- a/aten/src/ATen/native/quantized/cpu/QuantizedOps.h +++ b/aten/src/ATen/native/quantized/cpu/QuantizedOps.h @@ -129,7 +129,7 @@ using qavg_pool2d_fn = void (*)( int padW, int padH, bool count_include_pad, - c10::optional divisor_override); + std::optional divisor_override); using qavg_pool3d_fn = void (*)( const Tensor& qx, @@ -152,7 +152,7 @@ using qavg_pool3d_fn = void (*)( int padH, int padD, bool count_include_pad, - c10::optional divisor_override); + std::optional 
divisor_override); using qupsample_bilinear2d_fn = void (*)( Tensor& output, @@ -164,8 +164,8 @@ using qupsample_bilinear2d_fn = void (*)( int64_t nbatch, int64_t channels, bool align_corners, - c10::optional scales_h, - c10::optional scales_w); + std::optional scales_h, + std::optional scales_w); using qcat_nhwc_fn = Tensor (*)( const MaterializedITensorListRef& qxs, @@ -192,13 +192,13 @@ using qmean_inner_dim_fn = void (*)( const Tensor& /* X */, OptionalIntArrayRef /* opt_dim */, bool /* keepdim */, - c10::optional /* opt_dtype */, + std::optional /* opt_dtype */, Tensor& /* Y */); using qstd_inner_dim_fn = void (*)( const Tensor& /* X */, OptionalIntArrayRef /* dim */, - const c10::optional& /* correction */, + const std::optional& /* correction */, bool /* keepdim */, Tensor& /* Y */); diff --git a/aten/src/ATen/native/quantized/cpu/ReduceOps.cpp b/aten/src/ATen/native/quantized/cpu/ReduceOps.cpp index 0ad1a5ae013bc..113c57f2cc351 100644 --- a/aten/src/ATen/native/quantized/cpu/ReduceOps.cpp +++ b/aten/src/ATen/native/quantized/cpu/ReduceOps.cpp @@ -47,7 +47,7 @@ inline bool is_innnermost_dim( inline bool is_mean_inner_dim_fast_path( const Tensor& self, OptionalIntArrayRef opt_dim, - c10::optional opt_dtype) { + std::optional opt_dtype) { bool is_fast_path = is_innnermost_dim(self, opt_dim) && (!opt_dtype.has_value() || opt_dtype.value() == self.scalar_type()); @@ -131,7 +131,7 @@ Tensor& mean_out_quantized_cpu( const Tensor& self, OptionalIntArrayRef opt_dim, bool keepdim, - c10::optional opt_dtype, + std::optional opt_dtype, Tensor& result) { #ifdef USE_PYTORCH_QNNPACK if (at::globalContext().qEngine() == at::QEngine::QNNPACK && @@ -177,7 +177,7 @@ static Tensor& mean_out_quantized_cpu( const Tensor& self, DimnameList dim, bool keepdim, - c10::optional opt_dtype) { + std::optional opt_dtype) { return mean_out_quantized_cpu( self, dimnames_to_positions(self, dim), keepdim, opt_dtype, result); } @@ -186,7 +186,7 @@ static Tensor& mean_out_quantized_cpu( inline bool is_std_inner_dim_fast_path( const Tensor& self, OptionalIntArrayRef dim, - const c10::optional& correction) { + const std::optional& correction) { // Do not enter fast path if there are too few elements IntArrayRef dims = dim.has_value() ? 
dim.value() : IntArrayRef(); auto all_dims = std::vector(self.dim()); @@ -206,7 +206,7 @@ inline bool is_std_inner_dim_fast_path( Tensor& std_out_quantized_cpu( const Tensor& self, OptionalIntArrayRef dim, - const c10::optional& correction, + const std::optional& correction, bool keepdim, Tensor& result) { // Fast path @@ -230,7 +230,7 @@ Tensor& std_out_quantized_cpu( Tensor std_quantized_cpu( const Tensor& self, OptionalIntArrayRef dim, - const c10::optional& correction, + const std::optional& correction, bool keepdim) { Tensor result; std_out_quantized_cpu(self, dim, correction, keepdim, result); @@ -240,7 +240,7 @@ Tensor std_quantized_cpu( static Tensor std_quantized_cpu( const Tensor& self, DimnameList dim, - const c10::optional& correction, + const std::optional& correction, bool keepdim) { return std_quantized_cpu( self, dimnames_to_positions(self, dim), correction, keepdim); @@ -250,7 +250,7 @@ static Tensor& std_out_quantized_cpu( Tensor& result, const Tensor& self, DimnameList dim, - const c10::optional& correction, + const std::optional& correction, bool keepdim) { return std_out_quantized_cpu( self, dimnames_to_positions(self, dim), correction, keepdim, result); diff --git a/aten/src/ATen/native/quantized/cpu/TensorOperators.cpp b/aten/src/ATen/native/quantized/cpu/TensorOperators.cpp index 1ee305c64fc5f..388218c01ca02 100644 --- a/aten/src/ATen/native/quantized/cpu/TensorOperators.cpp +++ b/aten/src/ATen/native/quantized/cpu/TensorOperators.cpp @@ -81,7 +81,7 @@ AT_FORALL_OPERATORS(DEFINE_COMPARATOR) const Tensor& quantized_resize_cpu_( const Tensor& self, IntArrayRef size, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { // See Note [Writing Nondeterministic Operations] // Nondeterministic because if storage is resized, new elements are uninitialized globalContext().alertNotDeterministic("quantized_resize_cpu_"); diff --git a/aten/src/ATen/native/quantized/cpu/TensorShape.cpp b/aten/src/ATen/native/quantized/cpu/TensorShape.cpp index 58af539cb142f..4c810ef97b5bc 100644 --- a/aten/src/ATen/native/quantized/cpu/TensorShape.cpp +++ b/aten/src/ATen/native/quantized/cpu/TensorShape.cpp @@ -126,8 +126,8 @@ template Tensor qcat( const c10::List& qxs, int64_t dim, - c10::optional scale, - c10::optional zero_point) { + std::optional scale, + std::optional zero_point) { TORCH_CHECK(is_valid_quantization_scheme(qxs[0]), "Only per-tensor quantization is supported in 'cat'!") double _scale = scale.has_value() ? 
scale.value() : qxs.get(0).q_scale(); diff --git a/aten/src/ATen/native/quantized/cpu/UpSampleBilinear2d.cpp b/aten/src/ATen/native/quantized/cpu/UpSampleBilinear2d.cpp index f428745eaa86f..d4dfa7ff08c91 100644 --- a/aten/src/ATen/native/quantized/cpu/UpSampleBilinear2d.cpp +++ b/aten/src/ATen/native/quantized/cpu/UpSampleBilinear2d.cpp @@ -46,8 +46,8 @@ static void upsample_bilinear2d_out_frame( int64_t nbatch, int64_t channels, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { auto* idata = static_cast(input.const_data_ptr()); auto* odata = static_cast(output.data_ptr()); @@ -146,8 +146,8 @@ Tensor upsample_bilinear2d_quantized_cpu( const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { TORCH_CHECK( output_size.size() == 2, "It is expected output_size equals to 2, but got size ", @@ -223,7 +223,7 @@ static Tensor upsample_bilinear2d_quantized_cpu( const Tensor& input, at::OptionalIntArrayRef output_size, bool align_corners, - c10::optional> scale_factors) { + std::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_h = get_scale_value(scale_factors, 0); auto scale_w = get_scale_value(scale_factors, 1); diff --git a/aten/src/ATen/native/quantized/cpu/UpSampleNearest2d.cpp b/aten/src/ATen/native/quantized/cpu/UpSampleNearest2d.cpp index 1020aef797e50..191407bed66a8 100644 --- a/aten/src/ATen/native/quantized/cpu/UpSampleNearest2d.cpp +++ b/aten/src/ATen/native/quantized/cpu/UpSampleNearest2d.cpp @@ -36,8 +36,8 @@ static void upsample_nearest2d_out_frame( int64_t output_width, int64_t nbatch, int64_t channels, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { float height_scale = compute_scales_value(scales_h, input_height, output_height); float width_scale = compute_scales_value(scales_w, input_width, output_width); @@ -92,8 +92,8 @@ static void upsample_nearest2d_out_frame_nhwc( int64_t output_width, int64_t nbatch, int64_t channels, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { float height_scale = compute_scales_value(scales_h, input_height, output_height); float width_scale = compute_scales_value(scales_w, input_width, output_width); @@ -121,8 +121,8 @@ template Tensor _upsample_nearest2d_quantized_cpu( const Tensor& input, IntArrayRef output_size, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { TORCH_CHECK( output_size.size() == 2, "It is expected output_size equals to 2, but got size ", @@ -205,23 +205,23 @@ using at::native::upsample::get_scale_value; Tensor upsample_nearest2d_quantized_cpu( const Tensor& input, IntArrayRef osize, - c10::optional scale_h, - c10::optional scale_w) { + std::optional scale_h, + std::optional scale_w) { return _upsample_nearest2d_quantized_cpu(input, osize, scale_h, scale_w); } Tensor _upsample_nearest_exact2d_quantized_cpu( const Tensor& input, IntArrayRef osize, - c10::optional scale_h, - c10::optional scale_w) { + std::optional scale_h, + std::optional scale_w) { return _upsample_nearest2d_quantized_cpu(input, osize, scale_h, scale_w); } static Tensor upsample_nearest2d_quantized_cpu( const Tensor& input, at::OptionalIntArrayRef output_size, - c10::optional> scale_factors) { + std::optional> scale_factors) { auto osize 
= compute_output_size(input.sizes(), output_size, scale_factors); auto scale_h = get_scale_value(scale_factors, 0); auto scale_w = get_scale_value(scale_factors, 1); @@ -231,7 +231,7 @@ static Tensor upsample_nearest2d_quantized_cpu( static Tensor _upsample_nearest_exact2d_quantized_cpu( const Tensor& input, at::OptionalIntArrayRef output_size, - c10::optional> scale_factors) { + std::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_h = get_scale_value(scale_factors, 0); auto scale_w = get_scale_value(scale_factors, 1); diff --git a/aten/src/ATen/native/quantized/cpu/UpSampleNearest3d.cpp b/aten/src/ATen/native/quantized/cpu/UpSampleNearest3d.cpp index 91ddfefcd4d4e..d98883123f057 100644 --- a/aten/src/ATen/native/quantized/cpu/UpSampleNearest3d.cpp +++ b/aten/src/ATen/native/quantized/cpu/UpSampleNearest3d.cpp @@ -36,9 +36,9 @@ static void upsample_nearest3d_out_frame( int64_t output_width, int64_t nbatch, int64_t channels, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_d, + std::optional scales_h, + std::optional scales_w) { float depth_scale = compute_scales_value(scales_d, input_depth, output_depth); float height_scale = compute_scales_value(scales_h, input_height, output_height); float width_scale = compute_scales_value(scales_w, input_width, output_width); @@ -93,9 +93,9 @@ static void upsample_nearest3d_out_frame_nhwc( int64_t output_width, int64_t nbatch, int64_t channels, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_d, + std::optional scales_h, + std::optional scales_w) { float depth_scale = compute_scales_value(scales_d, input_depth, output_depth); float height_scale = compute_scales_value(scales_h, input_height, output_height); float width_scale = compute_scales_value(scales_w, input_width, output_width); @@ -133,9 +133,9 @@ template Tensor _upsample_nearest3d_quantized_cpu( const Tensor& input, IntArrayRef output_size, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_d, + std::optional scales_h, + std::optional scales_w) { TORCH_CHECK( output_size.size() == 3, "It is expected output_size equals to 3, but got size ", @@ -217,9 +217,9 @@ Tensor _upsample_nearest3d_quantized_cpu( Tensor upsample_nearest3d_quantized_cpu( const Tensor& input, IntArrayRef osize, - c10::optional scale_d, - c10::optional scale_h, - c10::optional scale_w) { + std::optional scale_d, + std::optional scale_h, + std::optional scale_w) { return _upsample_nearest3d_quantized_cpu( input, osize, scale_d, scale_h, scale_w); } @@ -227,9 +227,9 @@ Tensor upsample_nearest3d_quantized_cpu( Tensor _upsample_nearest_exact3d_quantized_cpu( const Tensor& input, IntArrayRef osize, - c10::optional scale_d, - c10::optional scale_h, - c10::optional scale_w) { + std::optional scale_d, + std::optional scale_h, + std::optional scale_w) { return _upsample_nearest3d_quantized_cpu( input, osize, scale_d, scale_h, scale_w); } diff --git a/aten/src/ATen/native/quantized/cpu/conv_serialization.h b/aten/src/ATen/native/quantized/cpu/conv_serialization.h index 9f452a1cc7213..85451fb57482a 100644 --- a/aten/src/ATen/native/quantized/cpu/conv_serialization.h +++ b/aten/src/ATen/native/quantized/cpu/conv_serialization.h @@ -73,7 +73,7 @@ using ConvParamsSerializationTypeV2 = std::tuple< // non-optional tensors std::vector, // optional tensors - std::vector>>; + std::vector>>; using 
ConvParamsSerializationTypeV3 = std::tuple< // version, int for versions 3 and up @@ -81,7 +81,7 @@ using ConvParamsSerializationTypeV3 = std::tuple< // configuration values std::vector, // optional tensors - std::vector>>; + std::vector>>; // Parses any historical conv packed params format into // the current format. @@ -119,7 +119,7 @@ ConvParamsSerializationTypeV3 parse_conv_serialized_state(c10::IValue v) { const auto& elements = v.toTupleRef().elements(); at::Tensor weight = elements[0].toTensor(); - c10::optional bias = elements[1].toOptional(); + std::optional bias = elements[1].toOptional(); torch::List stride_x_kSpatialDim = elements[2].toTensorList(); torch::List padding_x_kSpatialDim = elements[3].toTensorList(); torch::List dilation_x_kSpatialDim = elements[4].toTensorList(); @@ -150,7 +150,7 @@ ConvParamsSerializationTypeV3 parse_conv_serialized_state(c10::IValue v) { // transpose does not exist in v1, so we fill in a default value config_vals.push_back(0); - std::vector> tensors; + std::vector> tensors; tensors.emplace_back(); tensors.emplace_back(weight); tensors.emplace_back(bias); @@ -161,7 +161,7 @@ ConvParamsSerializationTypeV3 parse_conv_serialized_state(c10::IValue v) { // version 2 const auto& elements = v.toTupleRef().elements(); std::vector non_optional = elements[1].toTensorList().vec(); - std::vector> optional; + std::vector> optional; if (elements[2].isTensorList()) { for (const auto& elem : elements[2].toTensorList()) { @@ -187,7 +187,7 @@ ConvParamsSerializationTypeV3 parse_conv_serialized_state(c10::IValue v) { auto weight = non_optional[1]; auto bias = optional[0]; - std::vector> tensors; + std::vector> tensors; tensors.emplace_back(); tensors.emplace_back(weight); tensors.emplace_back(bias); @@ -213,7 +213,7 @@ ConvParamsSerializationTypeV2 serialize_conv( std::string version = "2"; std::vector non_optional; - std::vector> optional; + std::vector> optional; // create a packed int8_t tensor for conv params std::vector params_vec; @@ -267,7 +267,7 @@ ConvParamsSerializationTypeV3 serialize_conv( auto [weight, bias] = params->unpack(); - std::vector> tensors; + std::vector> tensors; tensors.emplace_back(); tensors.emplace_back(weight); tensors.emplace_back(bias); @@ -287,8 +287,8 @@ c10::intrusive_ptr> deserialize_conv( TORCH_INTERNAL_ASSERT(version == 3, "Unexpected serialized qconv version: ", version); TORCH_CHECK(tensors.size() == 3, "Wrong number of tensors", tensors.size()); - c10::optional weight = tensors[1]; - c10::optional bias = tensors[2]; + std::optional weight = tensors[1]; + std::optional bias = tensors[2]; TORCH_INTERNAL_ASSERT(weight, "Weight should always be present in serialized qconv."); torch::List stride, padding, output_padding, dilation; diff --git a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp index d942e2f161a26..d6ac157a116b5 100644 --- a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp +++ b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp @@ -433,7 +433,7 @@ TORCH_API int register_conv_params<3>(); TORCH_API int register_linear_params(); TORCH_API int register_linear_params() { - using SerializationType = std::tuple>; + using SerializationType = std::tuple>; static auto register_linear_params = torch::selective_class_( "quantized", TORCH_SELECTIVE_CLASS("LinearPackedParamsBase")) @@ -446,7 +446,7 @@ TORCH_API int register_linear_params() { -> c10::intrusive_ptr< LinearPackedParamsBase> { // __setstate__ at::Tensor weight; - c10::optional bias; + std::optional bias; 
weight = std::move(std::get<0>(state)); bias = std::move(std::get<1>(state)); diff --git a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h index bfaf5b93d667b..75b5047713bb0 100644 --- a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h +++ b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h @@ -23,7 +23,7 @@ struct TORCH_API PackedLinearWeight : public LinearPackedParamsBase { PackedLinearWeight( std::unique_ptr> w, - c10::optional bias, + std::optional bias, std::vector col_offsets, std::vector w_scale, std::vector w_zp, @@ -35,7 +35,7 @@ struct TORCH_API PackedLinearWeight : public LinearPackedParamsBase { w_zp(std::move(w_zp)), q_scheme(std::move(q_scheme)) {} std::unique_ptr> w; - c10::optional bias_; + std::optional bias_; std::vector col_offsets; std::vector w_scale; std::vector w_zp; @@ -79,15 +79,15 @@ struct TORCH_API PackedLinearWeight : public LinearPackedParamsBase { at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range = false) override; - std::tuple> unpack() override; + std::tuple> unpack() override; - c10::optional bias() override { + std::optional bias() override { return bias_; } static c10::intrusive_ptr prepack( at::Tensor weight, - c10::optional bias); + std::optional bias); private: template @@ -110,11 +110,11 @@ struct TORCH_API PackedLinearWeight : public LinearPackedParamsBase { struct TORCH_API PackedLinearWeightFp16 : public LinearPackedParamsBase { PackedLinearWeightFp16( std::unique_ptr w, - c10::optional bias) + std::optional bias) : w(std::move(w)), bias_(std::move(bias)) {} std::unique_ptr w; - c10::optional bias_; + std::optional bias_; at::Tensor apply( at::Tensor /*input*/, @@ -143,17 +143,17 @@ struct TORCH_API PackedLinearWeightFp16 : public LinearPackedParamsBase { at::Tensor& output, bool reduce_range = false) override; - std::tuple> unpack() override; + std::tuple> unpack() override; - c10::optional bias() override { + std::optional bias() override { return bias_; } static c10::intrusive_ptr prepack( at::Tensor weight, - c10::optional bias); + std::optional bias); - void set_bias(c10::optional bias) override; + void set_bias(std::optional bias) override; private: template @@ -164,7 +164,7 @@ template struct TORCH_API PackedConvWeight : public ConvPackedParamsBase { PackedConvWeight( std::unique_ptr> w, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List output_padding, @@ -191,7 +191,7 @@ struct TORCH_API PackedConvWeight : public ConvPackedParamsBase { q_scheme(q_scheme) {} std::unique_ptr> w; - c10::optional bias; + std::optional bias; torch::List stride_; torch::List padding_; torch::List output_padding_; @@ -218,11 +218,11 @@ struct TORCH_API PackedConvWeight : public ConvPackedParamsBase { const at::Tensor& input, bool reduce_range) override; - std::tuple> unpack() override; + std::tuple> unpack() override; static c10::intrusive_ptr> prepack( at::Tensor weight, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List output_padding, @@ -393,19 +393,19 @@ struct TORCH_API PackedEmbeddingBagWeight : public EmbeddingPackedParamsBase { at::Tensor embeddingbag_byte( const at::Tensor& indices, - const c10::optional& offsets, + const std::optional& offsets, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset, bool 
is_embedding_op) override; at::Tensor embeddingbag_4bit( const at::Tensor& indices, - const c10::optional& offsets, + const std::optional& offsets, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset, bool is_embedding_op) override; }; diff --git a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp index dc9063ecf46f1..11828f273bbc8 100644 --- a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp +++ b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp @@ -2023,7 +2023,7 @@ void _qavg_pool_nhwc_kernel( int padH, int padD, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { T* idata = static_cast(qx.data_ptr()); T* odata = static_cast(qy.data_ptr()); int strideC = 1; @@ -2135,7 +2135,7 @@ void qavg_pool2d_nhwc_kernel( int padW, int padH, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "avg_pool2d_nhwc", [&]() { _qavg_pool_nhwc_kernel( qx, @@ -2183,7 +2183,7 @@ void qavg_pool3d_nhwc_kernel( int padH, int padD, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "avg_pool3d_nhwc", [&]() { _qavg_pool_nhwc_kernel( qx, @@ -2288,8 +2288,8 @@ void qupsample_bilinear2d_nhwc_kernel( int64_t nbatch, int64_t channels, bool align_corners, - c10::optional scales_h, - c10::optional scales_w) { + std::optional scales_h, + std::optional scales_w) { AT_DISPATCH_QINT_TYPES(input.scalar_type(), "upsample_bilinear2d_nhwc", [&]() { auto* idata = static_cast(input.data_ptr()); auto* odata = static_cast(output.data_ptr()); @@ -2940,7 +2940,7 @@ void qmean_inner_dim_kernel( const Tensor& self, OptionalIntArrayRef opt_dim, bool keepdim, - c10::optional opt_dtype, + std::optional opt_dtype, Tensor& result) { // 'opt_dtype' should be none or equal to that of input ScalarType dtype = self.scalar_type(); @@ -2989,7 +2989,7 @@ void qmean_inner_dim_kernel( void qstd_inner_dim_kernel( const Tensor& self, OptionalIntArrayRef dim, - const c10::optional& correction_opt, + const std::optional& correction_opt, bool keepdim, Tensor& result) { ScalarType dtype = self.scalar_type(); diff --git a/aten/src/ATen/native/quantized/cpu/qconv.cpp b/aten/src/ATen/native/quantized/cpu/qconv.cpp index f915c014af143..82223d6d3314c 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv.cpp @@ -1152,7 +1152,7 @@ template template at::Tensor PackedConvWeightsOnednn::apply_impl( const at::Tensor& act, - const c10::optional& accum, + const std::optional& accum, double output_scale, int64_t output_zero_point) { std::string func_name = "quantized::conv"; @@ -1391,7 +1391,7 @@ static at::Tensor _quantized_convolution_onednn( at::Tensor weight, // MKLDNN tensor with quantized values at::Tensor weight_scales, at::Tensor weight_zero_points, - c10::optional bias, // Bias is not packed into MKLDNN tensor + std::optional bias, // Bias is not packed into MKLDNN tensor torch::List stride, torch::List padding, torch::List dilation, @@ -1399,15 +1399,15 @@ static at::Tensor _quantized_convolution_onednn( int64_t groups, double output_scale, int64_t output_zero_point, - c10::optional accum, // accum to 
fused with conv add + std::optional accum, // accum to fused with conv add double accum_scale, int64_t accum_zero_point, - c10::optional output_dtype, - c10::optional binary_attr, - c10::optional binary_alpha, - c10::optional unary_attr, - torch::List> unary_scalars, - c10::optional unary_algorithm) { + std::optional output_dtype, + std::optional binary_attr, + std::optional binary_alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm) { /*********************************/ /* Checks */ /*********************************/ @@ -1867,17 +1867,17 @@ class QConvoneDNN final { at::Tensor weight, // contains quantized values but not QTensor at::Tensor weight_scales, at::Tensor weight_zero_points, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List dilation, int64_t groups, double output_scale, int64_t output_zero_point, - c10::optional output_dtype, + std::optional output_dtype, c10::string_view attr, - torch::List> scalars, - c10::optional algorithm) { + torch::List> scalars, + std::optional algorithm) { #if AT_MKLDNN_ENABLED() if (act.dim() == 3 || act.dim() == 5) { // Conv1D/3D post op check @@ -1919,19 +1919,19 @@ class QConvoneDNN final { at::Tensor weight, // contains quantized values but not QTensor at::Tensor weight_scales, at::Tensor weight_zero_points, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List dilation, int64_t groups, double output_scale, int64_t output_zero_point, - c10::optional output_dtype, + std::optional output_dtype, c10::string_view binary_attr, - c10::optional alpha, - c10::optional unary_attr, - torch::List> unary_scalars, - c10::optional unary_algorithm) { + std::optional alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm) { #if AT_MKLDNN_ENABLED() // Conv2D post op check TORCH_CHECK( diff --git a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp index 46172f0c199f4..5f76890da2cae 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp @@ -28,7 +28,7 @@ c10::intrusive_ptr> PackedConvWeight< kSpatialDim>:: prepack( at::Tensor weight, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List output_padding, @@ -155,7 +155,7 @@ c10::intrusive_ptr> PackedConvWeight< } } - c10::optional bias_contig; + std::optional bias_contig; if (bias.has_value()) { at::Tensor bias_vec = bias.value(); TORCH_CHECK(bias_vec.dim() == 1, "bias should be a vector (1D Tensor)"); @@ -196,7 +196,7 @@ c10::intrusive_ptr> PackedConvWeightsQnnp< kSpatialDim>:: prepack( at::Tensor weight, - c10::optional bias_in, + std::optional bias_in, torch::List stride, torch::List padding, torch::List output_padding, @@ -313,7 +313,7 @@ c10::intrusive_ptr> PackedConvWeightsQnnp< 2>:: prepack( at::Tensor weight, - c10::optional bias_in, + std::optional bias_in, torch::List stride, torch::List padding, torch::List output_padding, @@ -328,7 +328,7 @@ c10::intrusive_ptr> PackedConvWeightsOnednn< kSpatialDim>:: prepack( at::Tensor weight, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List output_padding, @@ -458,7 +458,7 @@ c10::intrusive_ptr> PackedConvWeightsOnednn< packed_weight_p->set_zero_point(wgt_zero_points); std::unique_ptr weight_ptr(packed_weight_p); // Bias - c10::optional onednn_bias{c10::nullopt}; + std::optional 
onednn_bias{c10::nullopt}; if (bias.has_value()) { at::Tensor bias_vec = bias.value(); TORCH_CHECK(bias_vec.dim() == 1, "bias should be a vector (1D Tensor)"); @@ -468,7 +468,7 @@ c10::intrusive_ptr> PackedConvWeightsOnednn< auto bias_desc = ideep::tensor::desc(bias.value().sizes().vec(), dnnl::memory::data_type::f32); ideep::tensor packed_bias; packed_bias.init(bias_desc, bias.value().data_ptr()); - onednn_bias = c10::optional(packed_bias); + onednn_bias = std::optional(packed_bias); } auto ret_ptr = c10::make_intrusive>( PackedConvWeightsOnednn{ @@ -499,7 +499,7 @@ at::Tensor _qconv_prepack_onednn( torch::List padding, torch::List dilation, int64_t groups, - c10::optional> input_shape) { + std::optional> input_shape) { int kSpatialDim = weight.ndimension() - 2; TORCH_CHECK( weight.ndimension() == kSpatialDim + 2, @@ -624,7 +624,7 @@ class QConvPackWeightInt8 final { public: static c10::intrusive_ptr> run_conv( Tensor weight, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List dilation, @@ -640,7 +640,7 @@ class QConvPackWeightInt8 final { static c10::intrusive_ptr> run_deconv( Tensor weight, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List output_padding, @@ -653,7 +653,7 @@ class QConvPackWeightInt8 final { private: static c10::intrusive_ptr> _run( Tensor weight, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List output_padding, @@ -713,7 +713,7 @@ class QConv1dPackWeightInt8 final { public: static c10::intrusive_ptr> run_conv( Tensor weight, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List dilation, @@ -725,7 +725,7 @@ class QConv1dPackWeightInt8 final { static c10::intrusive_ptr> run_deconv( Tensor weight, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List output_padding, @@ -738,7 +738,7 @@ class QConv1dPackWeightInt8 final { private: static c10::intrusive_ptr> _run( Tensor weight, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List output_padding, @@ -814,7 +814,7 @@ class QConvPrepackOneDNN final { torch::List padding, torch::List dilation, int64_t groups, - c10::optional> input_shape) { + std::optional> input_shape) { #if AT_MKLDNN_ENABLED() return _qconv_prepack_onednn( weight, weight_scales, input_scale, input_zero_point, diff --git a/aten/src/ATen/native/quantized/cpu/qconv_unpack_impl.cpp b/aten/src/ATen/native/quantized/cpu/qconv_unpack_impl.cpp index 8af8d62f2f8a9..4f11cc2bc9393 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv_unpack_impl.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv_unpack_impl.cpp @@ -11,7 +11,7 @@ #ifdef USE_FBGEMM template -std::tuple> PackedConvWeight< +std::tuple> PackedConvWeight< kSpatialDim>::unpack() { auto* packed_weights_p = w.get(); // output channels @@ -90,19 +90,19 @@ std::tuple> PackedConvWeight< at::native::fbgemm_utils::TransposeConvTensorUnpackConversion< kSpatialDim>(unpacked_weights, groups); } - return std::tuple>( + return std::tuple>( unpacked_weights, bias); } -template std::tuple> PackedConvWeight< +template std::tuple> PackedConvWeight< 2>::unpack(); -template std::tuple> PackedConvWeight< +template std::tuple> PackedConvWeight< 3>::unpack(); #endif // USE_FBGEMM #ifdef USE_PYTORCH_QNNPACK template -std::tuple> PackedConvWeightsQnnp< +std::tuple> PackedConvWeightsQnnp< kSpatialDim>::unpack() { TORCH_CHECK( kSpatialDim == 2, @@ -112,25 +112,25 @@ 
std::tuple> PackedConvWeightsQnnp< orig_weight.defined(), "Cannot unpack weights. " "Call at::globalContext()::setReleaseOriginalWeights(false) before packing or loading to enable unpacking."); - return std::tuple>(orig_weight, bias); + return std::tuple>(orig_weight, bias); } -template std::tuple> PackedConvWeightsQnnp< +template std::tuple> PackedConvWeightsQnnp< 2>::unpack(); -template std::tuple> PackedConvWeightsQnnp< +template std::tuple> PackedConvWeightsQnnp< 3>::unpack(); #endif // USE_PYTORCH_QNNPACK #if AT_MKLDNN_ENABLED() template -std::tuple> PackedConvWeightsOnednn< +std::tuple> PackedConvWeightsOnednn< kSpatialDim>::unpack() { - return std::tuple>( + return std::tuple>( orig_weight_.clone(), orig_bias_); } -template std::tuple> PackedConvWeightsOnednn< +template std::tuple> PackedConvWeightsOnednn< 2>::unpack(); -template std::tuple> PackedConvWeightsOnednn< +template std::tuple> PackedConvWeightsOnednn< 3>::unpack(); #endif // #if AT_MKLDNN_ENABLED() diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp index 7e5083057a0ba..8b3f9b8afc8d2 100644 --- a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp +++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp @@ -38,8 +38,8 @@ at::Tensor& embedding_lookup_fallback_impl( const at::Tensor& weight, const at::Tensor& indices, const at::Tensor& offsets, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, at::Tensor& output, const int64_t block_size, const int64_t output_size, @@ -227,8 +227,8 @@ at::Tensor& embedding_bag_nbit_impl( const at::Tensor& indices, const at::Tensor& offsets, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset, bool is_embedding_op) { TORCH_CHECK(weight.dim() == 2); @@ -399,8 +399,8 @@ at::Tensor& embedding_bag_byte_impl( const at::Tensor& indices, const at::Tensor& offsets, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset, bool is_embedding_op) { TORCH_CHECK(weight.scalar_type() == at::kByte); @@ -558,10 +558,10 @@ at::Tensor& embedding_bag_byte_helper( at::Tensor& output, const at::Tensor& weight, const at::Tensor& indices, - const c10::optional& offsets_in, + const std::optional& offsets_in, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset, bool is_embedding_op) { c10::MaybeOwned offsets; @@ -656,10 +656,10 @@ at::Tensor& _embedding_bag_nbit_helper( const at::Tensor& weight, const int bit_width, const at::Tensor& indices, - const c10::optional& offsets_in, + const std::optional& offsets_in, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset, bool is_embedding_op) { c10::MaybeOwned offsets; @@ -760,10 +760,10 @@ at::Tensor& _embedding_bag_nbit_helper( at::Tensor 
PackedEmbeddingBagWeight::embeddingbag_byte( const at::Tensor& indices, - const c10::optional& offsets_in, + const std::optional& offsets_in, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset, bool is_embedding_op) { auto output = at::empty({0}, packed_w.options().dtype(at::kFloat)); @@ -781,10 +781,10 @@ at::Tensor PackedEmbeddingBagWeight::embeddingbag_byte( at::Tensor PackedEmbeddingBagWeight::embeddingbag_4bit( const at::Tensor& indices, - const c10::optional& offsets_in, + const std::optional& offsets_in, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset, bool is_embedding_op) { if (per_sample_weights_.has_value()) { @@ -819,12 +819,12 @@ Tensor& embedding_bag_byte_rowwise_offsets_out( Tensor& output, const Tensor& weight, const Tensor& indices, - const c10::optional& offsets_in, + const std::optional& offsets_in, const bool /* scale_grad_by_freq */, const int64_t /* mode */, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset) { return embedding_bag_byte_helper( output, @@ -842,12 +842,12 @@ Tensor& embedding_bag_4bit_rowwise_offsets_out( Tensor& output, const Tensor& weight, const Tensor& indices, - const c10::optional& offsets_in, + const std::optional& offsets_in, const bool /* scale_grad_by_freq */, const int64_t /* mode */, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset) { if (per_sample_weights_.has_value()) { @@ -877,12 +877,12 @@ static Tensor& embedding_bag_2bit_rowwise_offsets_out( Tensor& output, const Tensor& weight, const Tensor& indices, - const c10::optional& offsets_in, + const std::optional& offsets_in, const bool /* scale_grad_by_freq */, const int64_t /* mode */, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset) { if (per_sample_weights_.has_value()) { @@ -921,12 +921,12 @@ inline at::Tensor create_empty_from( Tensor embedding_bag_byte_rowwise_offsets( const Tensor& weight, const Tensor& indices, - const c10::optional& offsets_in, + const std::optional& offsets_in, const bool /* scale_grad_by_freq */, const int64_t /* mode */, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset) { auto output = create_empty_from(weight, at::kFloat); embedding_bag_byte_rowwise_offsets_out( @@ -946,12 +946,12 @@ Tensor embedding_bag_byte_rowwise_offsets( Tensor embedding_bag_4bit_rowwise_offsets( const Tensor& weight, const Tensor& indices, - const c10::optional& offsets_in, + const std::optional& offsets_in, const bool /* scale_grad_by_freq */, const int64_t /* mode */, bool 
pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset) { auto output = create_empty_from(weight, at::kFloat); embedding_bag_4bit_rowwise_offsets_out( @@ -971,12 +971,12 @@ Tensor embedding_bag_4bit_rowwise_offsets( Tensor embedding_bag_2bit_rowwise_offsets( const Tensor& weight, const Tensor& indices, - const c10::optional& offsets_in, + const std::optional& offsets_in, const bool /* scale_grad_by_freq */, const int64_t /* mode */, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset) { auto output = create_empty_from(weight, at::kFloat); embedding_bag_2bit_rowwise_offsets_out( @@ -996,12 +996,12 @@ Tensor embedding_bag_2bit_rowwise_offsets( Tensor embedding_bag_byte_rowwise_offsets_meta( const Tensor& weight, const Tensor& indices, - const c10::optional& offsets_in, + const std::optional& offsets_in, const bool /* scale_grad_by_freq */, const int64_t /* mode */, bool /* pruned_weights */, - const c10::optional& /* per_sample_weights_ */, - const c10::optional& /* compressed_indices_mapping */, + const std::optional& /* per_sample_weights_ */, + const std::optional& /* compressed_indices_mapping */, bool include_last_offset) { TORCH_CHECK( indices.dim() == 1 || indices.dim() == 2, @@ -1038,12 +1038,12 @@ class QEmbeddingBag final { static at::Tensor run( const c10::intrusive_ptr& packed_weight, const Tensor& indices, - const c10::optional& offsets, + const std::optional& offsets, const bool /* scale_grad_by_freq */, const int64_t /* mode */, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset) { if (bit_rate == 8) { return packed_weight->embeddingbag_byte( diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag.h b/aten/src/ATen/native/quantized/cpu/qembeddingbag.h index 86ed0f530f9c3..644d85fa357ee 100644 --- a/aten/src/ATen/native/quantized/cpu/qembeddingbag.h +++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag.h @@ -8,24 +8,24 @@ Tensor& embedding_bag_byte_rowwise_offsets_out( Tensor& output, const Tensor& weight, const Tensor& indices, - const c10::optional& offsets_in, + const std::optional& offsets_in, const bool /* scale_grad_by_freq */, const int64_t /* mode */, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset); Tensor& embedding_bag_4bit_rowwise_offsets_out( Tensor& output, const Tensor& weight, const Tensor& indices, - const c10::optional& offsets_in, + const std::optional& offsets_in, const bool /* scale_grad_by_freq */, const int64_t /* mode */, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset); Tensor& qembeddingbag_byte_unpack_out(Tensor& output, const Tensor& packed_weight); diff --git a/aten/src/ATen/native/quantized/cpu/qlinear.cpp 
b/aten/src/ATen/native/quantized/cpu/qlinear.cpp index df6df3c35201d..1c180173aab53 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear.cpp @@ -917,17 +917,17 @@ static at::Tensor linear_int8_with_onednn_weight( at::Tensor onednn_weight, // int8 tensor from MkldnnCPU at::Tensor weight_scales, at::Tensor weight_zero_points, - c10::optional bias, // plain tensor + std::optional bias, // plain tensor double output_scale, int64_t output_zero_point, - c10::optional output_dtype, - c10::optional other, // extra input for binary post-op + std::optional output_dtype, + std::optional other, // extra input for binary post-op double other_scale, int64_t other_zero_point, const c10::string_view& binary_post_op, // e.g. "none", "sum", "add" double binary_alpha, const c10::string_view& unary_post_op, // e.g. "none", "relu" - torch::List>& unary_post_op_args, + torch::List>& unary_post_op_args, c10::string_view& unary_post_op_algorithm) { using ideep::tensor; const int64_t dim = input.dim(); @@ -989,7 +989,7 @@ static at::Tensor linear_int8_with_onednn_weight( auto output_size = input.sizes().vec(); output_size[dim - 1] = N; - c10::optional onednn_bias{c10::nullopt}; + std::optional onednn_bias{c10::nullopt}; bool with_bias = bias.has_value(); at::Tensor bias_val_float; if (with_bias) { @@ -1194,15 +1194,15 @@ class QLinearOnednn final { Tensor onednn_weight, // int8 tensor from MkldnnCPU Tensor weight_scales, Tensor weight_zero_points, - c10::optional bias, + std::optional bias, double output_scale, int64_t output_zero_point, - c10::optional output_dtype, + std::optional output_dtype, c10::string_view post_op_name, - torch::List> post_op_args, + torch::List> post_op_args, c10::string_view post_op_algorithm) { #if AT_MKLDNN_ENABLED() - static c10::optional other = c10::nullopt; + static std::optional other = c10::nullopt; static const c10::string_view binary_post_op = "none"; return linear_int8_with_onednn_weight( act, act_scale, act_zero_point, @@ -1223,17 +1223,17 @@ class QLinearOnednn final { Tensor onednn_weight, // int8 tensor from MkldnnCPU Tensor weight_scales, Tensor weight_zero_points, - c10::optional bias, + std::optional bias, double output_scale, int64_t output_zero_point, - c10::optional output_dtype, + std::optional output_dtype, c10::string_view post_op_name, - torch::List> post_op_args, + torch::List> post_op_args, c10::string_view post_op_algorithm) { #if AT_MKLDNN_ENABLED() TORCH_CHECK(act_scale.numel() == 1 && act_zero_point.numel() == 1, "onednn int8 linear: act scale/zp size should be 1"); - static c10::optional other = c10::nullopt; + static std::optional other = c10::nullopt; static const c10::string_view binary_post_op = "none"; return linear_int8_with_onednn_weight( act, act_scale.item().toDouble(), act_zero_point.item().toLong(), @@ -1254,17 +1254,17 @@ class QLinearOnednn final { Tensor onednn_weight, // int8 tensor from MkldnnCPU Tensor weight_scales, Tensor weight_zero_points, - c10::optional bias, + std::optional bias, double output_scale, int64_t output_zero_point, - c10::optional output_dtype, - c10::optional other, // extra input for binary post-op + std::optional output_dtype, + std::optional other, // extra input for binary post-op double other_scale, int64_t other_zero_point, c10::string_view binary_post_op, // e.g. "none", "sum", "add" double binary_alpha, c10::string_view unary_post_op, // e.g. 
"none", "relu" - torch::List> unary_post_op_args, + torch::List> unary_post_op_args, c10::string_view unary_post_op_algorithm) { #if AT_MKLDNN_ENABLED() return linear_int8_with_onednn_weight( @@ -1286,17 +1286,17 @@ class QLinearOnednn final { Tensor onednn_weight, // int8 tensor from MkldnnCPU Tensor weight_scales, Tensor weight_zero_points, - c10::optional bias, + std::optional bias, double output_scale, int64_t output_zero_point, - c10::optional output_dtype, - c10::optional other, // extra input for binary post-op + std::optional output_dtype, + std::optional other, // extra input for binary post-op double other_scale, int64_t other_zero_point, c10::string_view binary_post_op, // e.g. "none", "sum", "add" double binary_alpha, c10::string_view unary_post_op, // e.g. "none", "relu" - torch::List> unary_post_op_args, + torch::List> unary_post_op_args, c10::string_view unary_post_op_algorithm) { #if AT_MKLDNN_ENABLED() TORCH_CHECK(act_scale.numel() == 1 && act_zero_point.numel() == 1, diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp index 935ad081bd908..111990ad4e277 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp @@ -483,7 +483,7 @@ at::Tensor& PackedLinearWeightFp16::apply_dynamic_relu_out( return apply_dynamic_impl(input, output); } -void PackedLinearWeightFp16::set_bias(c10::optional bias) { +void PackedLinearWeightFp16::set_bias(std::optional bias) { bias_ = std::move(bias); } diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp index a2fb34f90b289..d8427076b5afd 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp @@ -58,7 +58,7 @@ void calc_col_offsets_transpose( c10::intrusive_ptr PackedLinearWeight::prepack( at::Tensor weight, - c10::optional bias) { + std::optional bias) { TORCH_CHECK( weight.dim() == 2, "The weight tensor for quantized::linear_prepack (fbgemm) should" @@ -102,7 +102,7 @@ c10::intrusive_ptr PackedLinearWeight::prepack( /*col_offsets=*/col_offsets.data(), /*qtype=*/qtype); - c10::optional bias_contig; + std::optional bias_contig; if (bias.has_value()) { at::Tensor bias_vec = bias.value(); TORCH_CHECK(bias_vec.dim() == 1, "bias should be a vector (1D Tensor)"); @@ -132,7 +132,7 @@ c10::intrusive_ptr PackedLinearWeight::prepack( #ifdef USE_PYTORCH_QNNPACK c10::intrusive_ptr PackedLinearWeightsQnnp::prepack( at::Tensor weight, - c10::optional bias_in) { + std::optional bias_in) { TORCH_CHECK( weight.dim() == 2, "quantized::linear_prepack (qnnpack): Weight tensor rank should be == 2"); @@ -181,7 +181,7 @@ c10::intrusive_ptr PackedLinearWeightsQnnp::prepack( c10::intrusive_ptr PackedLinearWeightFp16::prepack( at::Tensor weight, - c10::optional bias) { + std::optional bias) { weight = at::_saturate_weight_to_fp16(weight); @@ -208,7 +208,7 @@ c10::intrusive_ptr PackedLinearWeightFp16::prepack( #if AT_MKLDNN_ENABLED() c10::intrusive_ptr PackedLinearWeightsOnednn::prepack( at::Tensor weight, - c10::optional bias) { + std::optional bias) { TORCH_CHECK( weight.dim() == 2, "The weight tensor for quantized::linear_prepack (onednn) should" @@ -257,7 +257,7 @@ c10::intrusive_ptr PackedLinearWeightsOnednn::prepack( packed_weight_p->set_zero_point(wgt_zero_points); std::unique_ptr weight_ptr(packed_weight_p); // Bias - c10::optional onednn_bias{c10::nullopt}; + std::optional 
onednn_bias{c10::nullopt}; if (bias.has_value()) { auto& b = bias.value(); auto bias_size = b.sizes().vec(); @@ -270,7 +270,7 @@ c10::intrusive_ptr PackedLinearWeightsOnednn::prepack( auto bias_desc = ideep::tensor::desc(bias_size, dnnl::memory::data_type::f32); ideep::tensor packed_bias; packed_bias.init(bias_desc, b.data_ptr()); - onednn_bias = c10::optional(packed_bias); + onednn_bias = std::optional(packed_bias); } auto ret_ptr = c10::make_intrusive( PackedLinearWeightsOnednn{ @@ -283,7 +283,7 @@ c10::intrusive_ptr PackedLinearWeightsOnednn::prepack( inline at::Tensor pack_weight_to_onednn_tensor( const at::Tensor& weight, - c10::optional>& input_shape) { + std::optional>& input_shape) { std::vector w_dims = weight.sizes().vec(); ideep::tensor wei = ideep::tensor({w_dims, dnnl::memory::data_type::s8}, weight.data_ptr()); wei.transpose_(0, 1); // oneDNN requires transposed weight @@ -319,7 +319,7 @@ class QLinearPackWeightInt8 final { public: static c10::intrusive_ptr run( at::Tensor weight, - c10::optional bias) { + std::optional bias) { auto& ctx = at::globalContext(); #ifdef USE_FBGEMM @@ -350,7 +350,7 @@ class QLinearPackWeightFp16 final { public: static c10::intrusive_ptr run( at::Tensor weight, - c10::optional bias) { + std::optional bias) { auto& ctx = at::globalContext(); #ifdef USE_FBGEMM // temporarily convert weight back to fp32, needs to be fixed @@ -387,7 +387,7 @@ class QLinearPackWeightFp16 final { class QLinearPackWeightInt8Legacy final { public: - static Tensor run(at::Tensor weight, c10::optional bias) { + static Tensor run(at::Tensor weight, std::optional bias) { TORCH_CHECK(false, "This model uses an outdated version of quantized.linear_prepack. " "Please re-export your model using the newer definitions in torch.jit.quantized"); @@ -396,7 +396,7 @@ class QLinearPackWeightInt8Legacy final { class QLinearPackWeightFp16Legacy final { public: - static Tensor run(at::Tensor weight, c10::optional bias) { + static Tensor run(at::Tensor weight, std::optional bias) { TORCH_CHECK(false, "This model uses an outdated version of quantized.linear_prepack_fp16. 
" "Please re-export your model using the newer definitions in torch.jit.quantized"); @@ -407,7 +407,7 @@ class QLinearPackWeightInt8Onednn final { public: static at::Tensor run( at::Tensor weight, // Not QTensor - c10::optional> input_shape) { + std::optional> input_shape) { #if AT_MKLDNN_ENABLED() return pack_weight_to_onednn_tensor(weight, input_shape); #else diff --git a/aten/src/ATen/native/quantized/cpu/qnormalization.cpp b/aten/src/ATen/native/quantized/cpu/qnormalization.cpp index b803bdd8aff7a..9de75e80bc4df 100644 --- a/aten/src/ATen/native/quantized/cpu/qnormalization.cpp +++ b/aten/src/ATen/native/quantized/cpu/qnormalization.cpp @@ -135,8 +135,8 @@ TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { m.impl(TORCH_SELECTIVE_NAME("quantized::layer_norm"), []( Tensor input, std::vector normalized_shape, // because IntArrayRef doesn't work - c10::optional weight, - c10::optional bias, + std::optional weight, + std::optional bias, double eps, double output_scale, int64_t output_zero_point) { @@ -149,8 +149,8 @@ TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { m.impl(TORCH_SELECTIVE_NAME("quantized::group_norm"), []( Tensor qx, int64_t num_groups, - c10::optional weight, - c10::optional bias, + std::optional weight, + std::optional bias, double eps, double output_scale, int64_t output_zero_point) { @@ -162,8 +162,8 @@ TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { }); m.impl(TORCH_SELECTIVE_NAME("quantized::instance_norm"), []( Tensor qx, - c10::optional weight, - c10::optional bias, + std::optional weight, + std::optional bias, double eps, double output_scale, int64_t output_zero_point) { diff --git a/aten/src/ATen/native/quantized/cpu/qsoftmax.cpp b/aten/src/ATen/native/quantized/cpu/qsoftmax.cpp index a2d3ed6305fc3..0d764aee90d09 100644 --- a/aten/src/ATen/native/quantized/cpu/qsoftmax.cpp +++ b/aten/src/ATen/native/quantized/cpu/qsoftmax.cpp @@ -44,8 +44,8 @@ Tensor qsoftmax_qnnpack(const Tensor& qx, const int64_t dim) { */ const int64_t last_dim = qx.dim() - 1; - c10::optional> permuted_dims = c10::nullopt; - c10::optional qx_contig = c10::nullopt; + std::optional> permuted_dims = c10::nullopt; + std::optional qx_contig = c10::nullopt; const at::Tensor* qx_contig_ptr = nullptr; if (qx.stride(dim) == 1) { diff --git a/aten/src/ATen/native/quantized/cuda/EmbeddingBag.cu b/aten/src/ATen/native/quantized/cuda/EmbeddingBag.cu index 3574bfe28f505..de3f1032dbcae 100644 --- a/aten/src/ATen/native/quantized/cuda/EmbeddingBag.cu +++ b/aten/src/ATen/native/quantized/cuda/EmbeddingBag.cu @@ -90,7 +90,7 @@ __global__ void embedding_bag_nbits_rowwise_offsets_kernel( const PackedTensorAccessor32 offsets, const bool /* pruned_weights */, const PackedTensorAccessor32 per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& compressed_indices_mapping, const bool include_last_offset, PackedTensorAccessor32 output) { static_assert(bits_per_dim == 4 || bits_per_dim == 8, "the current embedding_bag_nbits_rowwise_offsets_kernel only has been tested for 4 and 8 bits per dim"); @@ -192,8 +192,8 @@ at::Tensor& embedding_bag_byte_impl( const at::Tensor& indices, const at::Tensor& offsets, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset, bool is_embedding_op) { TORCH_CHECK(weight.is_cuda()); @@ -267,12 +267,12 @@ at::Tensor& embedding_bag_byte_impl( Tensor 
embedding_bag_byte_rowwise_offsets( const Tensor& weight, const Tensor& indices, - const c10::optional& offsets_in, + const std::optional& offsets_in, const bool /* scale_grad_by_freq */, const int64_t /* mode */, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset) { bool is_embedding_op = false; auto output = create_empty_from(weight, at::kFloat); @@ -375,8 +375,8 @@ at::Tensor& embedding_bag_4bit_impl( const at::Tensor& indices, const at::Tensor& offsets, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset) { TORCH_CHECK(weight.is_cuda()); TORCH_CHECK(indices.is_cuda()); @@ -449,12 +449,12 @@ at::Tensor& embedding_bag_4bit_impl( Tensor embedding_bag_4bit_rowwise_offsets( const Tensor& weight, const Tensor& indices, - const c10::optional& offsets_in, + const std::optional& offsets_in, const bool /* scale_grad_by_freq */, const int64_t /* mode */, bool pruned_weights, - const c10::optional& per_sample_weights_, - const c10::optional& compressed_indices_mapping, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, bool include_last_offset) { auto output = create_empty_from(weight, at::kFloat); diff --git a/aten/src/ATen/native/quantized/cudnn/BinaryOps.cpp b/aten/src/ATen/native/quantized/cudnn/BinaryOps.cpp index a225a86eeb903..07ccc19c48282 100644 --- a/aten/src/ATen/native/quantized/cudnn/BinaryOps.cpp +++ b/aten/src/ATen/native/quantized/cudnn/BinaryOps.cpp @@ -186,7 +186,7 @@ Tensor add(Tensor qa, Tensor qb, double output_scale, int64_t output_zero_point) // relu_op computes // relu( (qa_int8 + qb_int8 * ( qb_scale/qa_scale ) ) ) // output is a fp32 tensor - c10::optional relu_op; + std::optional relu_op; if (kReluFused) { // we use inplace operation here where the output is assigned to the input relu_op.emplace(cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) diff --git a/aten/src/ATen/native/quantized/cudnn/Conv.cpp b/aten/src/ATen/native/quantized/cudnn/Conv.cpp index bb97a69859cb4..606d769fe6eb4 100644 --- a/aten/src/ATen/native/quantized/cudnn/Conv.cpp +++ b/aten/src/ATen/native/quantized/cudnn/Conv.cpp @@ -70,8 +70,8 @@ void PackedConvWeightCudnn::apply_impl_helper(const at::Tensor& qua auto requantize_multiplier = act_scale * weight_scale / output_scale; at::Tensor requantize_multiplier_tensor = cudnn_utils::getRequantMultiplierTensor(requantize_multiplier, kSpatialDim + 2); - c10::optional bias_multiplier_tensor; - c10::optional broadcasted_bias; + std::optional bias_multiplier_tensor; + std::optional broadcasted_bias; if (bias_.has_value()) { // the input bias is a 1-D tensor whose size is the same as the size of the second dimension of quantized_output. // we need to add trailing dimensions in order to properly broadcast bias, otherwise broadcast_to will fail. 
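The hunks above are a mechanical rename of c10::optional/c10::nullopt to their standard-library spellings in signatures such as the optional bias, per_sample_weights_, and compressed_indices_mapping arguments. A minimal, self-contained sketch of that optional-argument calling pattern follows; FakeTensor here is a stand-in struct for illustration only, not ATen's at::Tensor.

// Sketch of the optional-argument pattern used throughout these hunks.
// FakeTensor is a placeholder type so the example compiles on its own.
#include <iostream>
#include <optional>

struct FakeTensor {
  double value = 0.0;
};

// Previously spelled c10::optional<FakeTensor>; std::optional is a drop-in
// replacement with the same has_value()/value()/operator-> API.
FakeTensor linear_like(const FakeTensor& input,
                       const FakeTensor& weight,
                       std::optional<FakeTensor> bias = std::nullopt) {
  FakeTensor out{input.value * weight.value};
  if (bias.has_value()) {  // same check the quantized linear kernels perform
    out.value += bias->value;
  }
  return out;
}

int main() {
  FakeTensor x{2.0}, w{3.0};
  std::cout << linear_like(x, w).value << "\n";                   // 6, no bias
  std::cout << linear_like(x, w, FakeTensor{1.0}).value << "\n";  // 7, with bias
  return 0;
}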
@@ -154,12 +154,12 @@ void PackedConvWeightCudnn::apply_impl_helper(const at::Tensor& qua .build(); // std::cout << "operator:" << conv_op.describe() << std::endl; - c10::optional bias_mult_op; - c10::optional sum_conv_bias_op; + std::optional bias_mult_op; + std::optional sum_conv_bias_op; if (bias_.has_value()) { // we can't directly assign bias_mult_op because operator= is deleted for cudnn_frontend::Operation; // alternatively, I think we can use std::unique_ptr and dynamically allocate these builder ops - // but here, we chose to do it statically. c10::optional::emplace() enables this approach + // but here, we chose to do it statically. std::optional::emplace() enables this approach // bias_mult_op computes bias_fp32 / (act_scale * w_scale) or bias_fp32 * (1 / (act_scale * w_scale)) // where bias_multiplier = (1 / (act_scale * w_scale)) @@ -188,7 +188,7 @@ void PackedConvWeightCudnn::apply_impl_helper(const at::Tensor& qua // relu_op computes relu(act_int8 * w_int8 + [bias_fp32/(act_scale * w_scale)] // or relu(act_int8 * w_int8) if bias is not present. // output is a fp32 tensor - c10::optional relu_op; + std::optional relu_op; std::shared_ptr tensor2requant_ptr = bias_.has_value() ? sum_conv_bias_op.value().getOutputTensor() : conv_op.getOutputTensor(); if (kReluFused) { // we use inplace operation here where the output is assigned to the input diff --git a/aten/src/ATen/native/quantized/cudnn/ConvPrepack.cpp b/aten/src/ATen/native/quantized/cudnn/ConvPrepack.cpp index 44d37f27bf6f6..b1bd94ee7a55c 100644 --- a/aten/src/ATen/native/quantized/cudnn/ConvPrepack.cpp +++ b/aten/src/ATen/native/quantized/cudnn/ConvPrepack.cpp @@ -27,7 +27,7 @@ c10::intrusive_ptr> PackedConvWeightCudnn< kSpatialDim>:: prepack( at::Tensor weight, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List output_padding, @@ -116,7 +116,7 @@ c10::intrusive_ptr> PackedConvWeightCudnn< 2>:: prepack( at::Tensor weight, - c10::optional bias_in, + std::optional bias_in, torch::List stride, torch::List padding, torch::List output_padding, @@ -133,7 +133,7 @@ class QConvPackWeightInt8Cudnn final { public: static c10::intrusive_ptr> run_conv( Tensor weight, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List dilation, @@ -150,7 +150,7 @@ class QConvPackWeightInt8Cudnn final { private: static c10::intrusive_ptr> _run( Tensor weight, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List output_padding, @@ -167,7 +167,7 @@ class QConv1dPackWeightInt8Cudnn final { public: static c10::intrusive_ptr> run_conv( Tensor weight, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List dilation, @@ -180,7 +180,7 @@ class QConv1dPackWeightInt8Cudnn final { private: static c10::intrusive_ptr> _run( Tensor weight, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List output_padding, diff --git a/aten/src/ATen/native/quantized/cudnn/ConvUnpackImpl.cpp b/aten/src/ATen/native/quantized/cudnn/ConvUnpackImpl.cpp index ce5ee36cad4f0..fbb4a1fe94111 100644 --- a/aten/src/ATen/native/quantized/cudnn/ConvUnpackImpl.cpp +++ b/aten/src/ATen/native/quantized/cudnn/ConvUnpackImpl.cpp @@ -11,12 +11,12 @@ #include template -std::tuple> PackedConvWeightCudnn< +std::tuple> PackedConvWeightCudnn< kSpatialDim>::unpack() { - return std::tuple>{maybe_padded_weight_, bias_}; + return std::tuple>{maybe_padded_weight_, bias_}; } -template 
std::tuple> PackedConvWeightCudnn< +template std::tuple> PackedConvWeightCudnn< 2>::unpack(); #endif // AT_CUDNN_ENABLED diff --git a/aten/src/ATen/native/quantized/cudnn/Linear.cpp b/aten/src/ATen/native/quantized/cudnn/Linear.cpp index f9333d6fbed7a..d3219592e25bb 100644 --- a/aten/src/ATen/native/quantized/cudnn/Linear.cpp +++ b/aten/src/ATen/native/quantized/cudnn/Linear.cpp @@ -98,8 +98,8 @@ void PackedLinearWeightCudnn::apply_impl_helper(const at::Tensor& quantized_outp auto weight_scale = orig_weight.q_scale(); auto requantize_multiplier = act_scale * weight_scale / output_scale; at::Tensor requantize_multiplier_tensor = cudnn_utils::getRequantMultiplierTensor(requantize_multiplier, quantized_output.dim()); - c10::optional bias_multiplier_tensor; - c10::optional broadcasted_bias; + std::optional bias_multiplier_tensor; + std::optional broadcasted_bias; if (bias_.has_value()) { // the input bias is a 1-D tensor whose size is the same as the size of the last dimension of quantized_output // we need to add trailing dimensions in order to properly broadcast bias, otherwise broadcast_to will fail. @@ -183,12 +183,12 @@ void PackedLinearWeightCudnn::apply_impl_helper(const at::Tensor& quantized_outp .build(); // std::cout << "operator:" << linear_op.describe() << std::endl; - c10::optional bias_mult_op; - c10::optional sum_linear_bias_op; + std::optional bias_mult_op; + std::optional sum_linear_bias_op; if (bias_.has_value()) { // we can't directly assign bias_mult_op because operator= is deleted for cudnn_frontend::Operation; // alternatively, I think we can use std::unique_ptr and dynamically allocate these builder ops - // but here, we chose to do it statically. c10::optional::emplace() enables this approach + // but here, we chose to do it statically. std::optional::emplace() enables this approach // bias_mult_op computes bias_fp32 / (act_scale * w_scale) or bias_fp32 * (1 / (act_scale * w_scale)) // where bias_multiplier = (1 / (act_scale * w_scale)) @@ -222,7 +222,7 @@ void PackedLinearWeightCudnn::apply_impl_helper(const at::Tensor& quantized_outp // relu_op computes relu(act_int8 * w_int8 + [bias_fp32/(act_scale * w_scale)] // or relu(act_int8 * w_int8) if bias is not present. // output is a fp32 tensor - c10::optional relu_op; + std::optional relu_op; std::shared_ptr tensor2requant_ptr = bias_.has_value() ? 
sum_linear_bias_op.value().getOutputTensor() : linear_op.getOutputTensor(); if (kReluFused) { // we use inplace operation here where the output is assigned to the input diff --git a/aten/src/ATen/native/quantized/cudnn/LinearPrepack.cpp b/aten/src/ATen/native/quantized/cudnn/LinearPrepack.cpp index abbb5922f3933..fd7c870e006d1 100644 --- a/aten/src/ATen/native/quantized/cudnn/LinearPrepack.cpp +++ b/aten/src/ATen/native/quantized/cudnn/LinearPrepack.cpp @@ -16,7 +16,7 @@ int register_linear_params(); c10::intrusive_ptr PackedLinearWeightCudnn::prepack( at::Tensor weight, - c10::optional bias) { + std::optional bias) { TORCH_CHECK(weight.qscheme() == c10::kPerTensorAffine, "Unsupported qscheme: ", toString(weight.qscheme())); const int output_channels = weight.size(0); const auto qtype = weight.qscheme(); @@ -42,7 +42,7 @@ class QLinearPackWeightInt8Cudnn final { public: static c10::intrusive_ptr run( at::Tensor weight, - c10::optional bias) { + std::optional bias) { return PackedLinearWeightCudnn::prepack(std::move(weight), std::move(bias)); } }; diff --git a/aten/src/ATen/native/quantized/cudnn/LinearUnpackImpl.cpp b/aten/src/ATen/native/quantized/cudnn/LinearUnpackImpl.cpp index 7200872480efd..40088052cd151 100644 --- a/aten/src/ATen/native/quantized/cudnn/LinearUnpackImpl.cpp +++ b/aten/src/ATen/native/quantized/cudnn/LinearUnpackImpl.cpp @@ -10,8 +10,8 @@ #include -std::tuple> PackedLinearWeightCudnn::unpack() { - return std::tuple>{orig_weight, bias_}; +std::tuple> PackedLinearWeightCudnn::unpack() { + return std::tuple>{orig_weight, bias_}; } #endif // AT_CUDNN_ENABLED diff --git a/aten/src/ATen/native/quantized/cudnn/utils.h b/aten/src/ATen/native/quantized/cudnn/utils.h index 18c891fcaa1c0..fbd10e2ec95e7 100644 --- a/aten/src/ATen/native/quantized/cudnn/utils.h +++ b/aten/src/ATen/native/quantized/cudnn/utils.h @@ -27,7 +27,7 @@ C10_DIAGNOSTIC_POP() struct PackedLinearWeightCudnn : public LinearPackedParamsBase { PackedLinearWeightCudnn( at::Tensor orig_weight, - c10::optional bias, + std::optional bias, c10::QScheme q_scheme) : orig_weight(std::move(orig_weight)), bias_(std::move(bias)), @@ -53,19 +53,19 @@ struct PackedLinearWeightCudnn : public LinearPackedParamsBase { "parameter type"); } - std::tuple> unpack() override; + std::tuple> unpack() override; - c10::optional bias() override { + std::optional bias() override { return bias_; } static c10::intrusive_ptr prepack( at::Tensor weight, - c10::optional bias); + std::optional bias); private: at::Tensor orig_weight; - c10::optional bias_; + std::optional bias_; c10::QScheme q_scheme; template @@ -85,7 +85,7 @@ template struct PackedConvWeightCudnn : public ConvPackedParamsBase { PackedConvWeightCudnn( at::Tensor orig_weight, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List output_padding, @@ -127,11 +127,11 @@ struct PackedConvWeightCudnn : public ConvPackedParamsBase { TORCH_CHECK(false, "apply_dynamic_relu is currently not reported"); } - std::tuple> unpack() override; + std::tuple> unpack() override; static c10::intrusive_ptr> prepack( at::Tensor weight, - c10::optional bias, + std::optional bias, torch::List stride, torch::List padding, torch::List output_padding, @@ -171,7 +171,7 @@ struct PackedConvWeightCudnn : public ConvPackedParamsBase { // convention "maybe"_padded_weight. 
// TODO: when and if cudnn enables padding in their operators, we can remove padding on our end and rename this to orig_weight_ at::Tensor maybe_padded_weight_; - c10::optional bias_; + std::optional bias_; torch::List stride_; torch::List padding_; torch::List output_padding_; diff --git a/aten/src/ATen/native/quantized/qconv_unpack.cpp b/aten/src/ATen/native/quantized/qconv_unpack.cpp index fe4007c712ce5..1fdc7745cfa2e 100644 --- a/aten/src/ATen/native/quantized/qconv_unpack.cpp +++ b/aten/src/ATen/native/quantized/qconv_unpack.cpp @@ -49,7 +49,7 @@ namespace { template class QConvUnpackWeightsInt8 final { public: - static std::tuple> run( + static std::tuple> run( const c10::intrusive_ptr>& packed_weight) { auto& ctx = at::globalContext(); @@ -85,17 +85,17 @@ class QConvUnpackWeightsInt8 final { class QConv1dUnpackWeightsInt8 final { public: - static std::tuple> run( + static std::tuple> run( const c10::intrusive_ptr>& packed_weight) { auto& ctx = at::globalContext(); at::Tensor weight; - c10::optional bias; + std::optional bias; #ifdef USE_FBGEMM if (ctx.qEngine() == at::QEngine::FBGEMM || ctx.qEngine() == at::QEngine::X86) { std::tie(weight, bias) = packed_weight->unpack(); weight = weight.squeeze_(quant_utils::kConv1dSqueezeDim + 2); - return std::tuple>(weight, bias); + return std::tuple>(weight, bias); } #endif @@ -104,7 +104,7 @@ class QConv1dUnpackWeightsInt8 final { std::tie(weight, bias) = packed_weight->unpack(); at::Tensor new_weight = weight.clone(); new_weight = new_weight.squeeze_(quant_utils::kConv1dSqueezeDim + 2); - return std::tuple>(new_weight, bias); + return std::tuple>(new_weight, bias); } #endif @@ -113,7 +113,7 @@ class QConv1dUnpackWeightsInt8 final { std::tie(weight, bias) = packed_weight->unpack(); at::Tensor new_weight = weight.clone(); new_weight.squeeze_(quant_utils::kConv1dSqueezeDim + 2); - return std::tuple>(new_weight, bias); + return std::tuple>(new_weight, bias); } #endif diff --git a/aten/src/ATen/native/quantized/qlinear_unpack.cpp b/aten/src/ATen/native/quantized/qlinear_unpack.cpp index 19c9890c82e38..85eab571df9e0 100644 --- a/aten/src/ATen/native/quantized/qlinear_unpack.cpp +++ b/aten/src/ATen/native/quantized/qlinear_unpack.cpp @@ -21,7 +21,7 @@ namespace { class QLinearUnpackWeightInt8 final { public: - static std::tuple> run( + static std::tuple> run( const c10::intrusive_ptr& packed_weight) { return packed_weight->unpack(); } @@ -29,7 +29,7 @@ class QLinearUnpackWeightInt8 final { class QLinearUnpackWeightFp16 final { public: - static std::tuple> run( + static std::tuple> run( const c10::intrusive_ptr& packed_weight) { auto& ctx = at::globalContext(); @@ -44,7 +44,7 @@ class QLinearUnpackWeightFp16 final { class QLinearUnpackWeightInt8Legacy final { public: - static std::tuple> run( + static std::tuple> run( const at::Tensor& packed_weight) { TORCH_CHECK(false, "quantized.linear_unpack(Tensor) is unsupported! Please " @@ -55,7 +55,7 @@ class QLinearUnpackWeightInt8Legacy final { class QLinearUnpackWeightFp16Legacy final { public: - static std::tuple> run( + static std::tuple> run( const at::Tensor& packed_weight) { TORCH_CHECK(false, "quantized.linear_unpack(Tensor) is unsupported! 
Please " diff --git a/aten/src/ATen/native/sparse/SoftMax.cpp b/aten/src/ATen/native/sparse/SoftMax.cpp index 883c2b9c4ea95..179db48beacca 100644 --- a/aten/src/ATen/native/sparse/SoftMax.cpp +++ b/aten/src/ATen/native/sparse/SoftMax.cpp @@ -615,7 +615,7 @@ static Tensor _sparse_softmax(const Tensor& input_, const int64_t dim_) { return result; } -Tensor _sparse_softmax(const Tensor& input_, const int64_t dim_, c10::optional dtype) { +Tensor _sparse_softmax(const Tensor& input_, const int64_t dim_, std::optional dtype) { auto result = [&]() { NoNamesGuard guard; if (input_.is_cuda() && input_.scalar_type() == ScalarType::Half && dtype == ScalarType::Float){ @@ -642,7 +642,7 @@ static Tensor _sparse_log_softmax(const Tensor& input_, const int64_t dim_) { return result; } -Tensor _sparse_log_softmax(const Tensor& input_, const int64_t dim_, c10::optional dtype) { +Tensor _sparse_log_softmax(const Tensor& input_, const int64_t dim_, std::optional dtype) { auto result = [&]() { NoNamesGuard guard; if (input_.is_cuda() && input_.scalar_type() == ScalarType::Half && dtype == ScalarType::Float){ diff --git a/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h b/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h index 8782031c49aa1..608f5291e607e 100644 --- a/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h +++ b/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h @@ -133,8 +133,8 @@ void _sparse_binary_op_intersection_kernel_impl( const Tensor& x_, const Tensor& y_, const std::vector& broadcasted_shape, - const c10::optional& x_hash_opt_ = c10::nullopt, - const c10::optional& y_hash_opt_ = c10::nullopt, + const std::optional& x_hash_opt_ = c10::nullopt, + const std::optional& y_hash_opt_ = c10::nullopt, const bool accumulate_matches = true, const bool distributive_with_sum = true ) { @@ -148,7 +148,7 @@ void _sparse_binary_op_intersection_kernel_impl( " to output ", res.scalar_type()); using KernelLauncher = KernelLauncher; - using OptTensor = c10::optional; + using OptTensor = std::optional; // If the op and sum are not distributive, coalesce is required. const auto coalesce_if_not_distributive = [distributive_with_sum](const Tensor& t, const OptTensor& t_hash_opt) -> auto { @@ -275,8 +275,11 @@ void _sparse_binary_op_intersection_kernel_impl( KernelLauncher::launch(iter, // NOTE: capture by value required by CUDA [=] FUNCAPI (index_t nnz_idx) -> int64_t { - const auto* RESTRICT ptr_indices_dim = ptr_indices ? ptr_indices + nnz_idx * indices_nnz_stride : nullptr; int64_t hash = 0; + if (!ptr_indices) { + return hash; + } + const auto* RESTRICT ptr_indices_dim = ptr_indices + nnz_idx * indices_nnz_stride; for (int64_t dim = 0; dim < sparse_dim; ++dim) { const auto dim_hash_coeff = hash_coeffs[dim]; const auto dim_index = ptr_indices_dim[dim * indices_dim_stride]; @@ -423,8 +426,8 @@ void _sparse_binary_op_intersection_kernel_out( Tensor& res, const Tensor& x, const Tensor& y, - const c10::optional& x_hash_opt = c10::nullopt, - const c10::optional& y_hash_opt = c10::nullopt, + const std::optional& x_hash_opt = c10::nullopt, + const std::optional& y_hash_opt = c10::nullopt, // If op distributes with the sum, the arguments are processed as is, // without the calls to coalesce(). const bool distributive_with_sum = true @@ -439,7 +442,7 @@ void _sparse_binary_op_intersection_kernel_out( x._indices().scalar_type() == y._indices().scalar_type(), NAME, "(): expects inputs' indices to be of the same dtype (i.e. 
long or int)"); - const auto check_hash_validity = [](const Tensor& t, const c10::optional& t_hash_opt) { + const auto check_hash_validity = [](const Tensor& t, const std::optional& t_hash_opt) { if (!t_hash_opt.has_value()) { return; } diff --git a/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionKernel.cpp b/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionKernel.cpp index 2db8c9e9404cc..94a1e3d622355 100644 --- a/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionKernel.cpp +++ b/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionKernel.cpp @@ -119,7 +119,7 @@ struct CPUValueSelectionIntersectionKernel { } }; -using OptTensor = c10::optional; +using OptTensor = std::optional; void mul_sparse_sparse_out_cpu_kernel( Tensor& result, diff --git a/aten/src/ATen/native/sparse/SparseCsrTensor.cpp b/aten/src/ATen/native/sparse/SparseCsrTensor.cpp index d1973c43e9ad7..59b048f5d147c 100644 --- a/aten/src/ATen/native/sparse/SparseCsrTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseCsrTensor.cpp @@ -363,10 +363,10 @@ Tensor sparse_compressed_tensor_with_dims( c10::IntArrayRef size, c10::IntArrayRef blocksize, ScalarType index_dtype, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // sparse_compressed_tensor_with_dims is a generalization of empty // that enables the specification of nnz, dense_dim, blocksize, and // index_dtype for sparse compressed tensors. @@ -435,10 +435,10 @@ Tensor _sparse_compressed_tensor_unsafe_symint( const Tensor& plain_indices, const Tensor& values, c10::SymIntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { if (!layout) { AT_ERROR("sparse_compressed_tensor_unsafe expected sparse compressed tensor layout but got none"); } @@ -458,10 +458,10 @@ Tensor _sparse_compressed_tensor_unsafe_template(const Tensor& compressed_indice const Tensor& plain_indices, const Tensor& values, IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { Layout layout_ = layout.value_or(required_layout); TORCH_CHECK(layout_ == required_layout, "sparse compressed layout must be ",required_layout, " but got ", layout_); if (at::globalContext().checkSparseTensorInvariants()) { @@ -478,10 +478,10 @@ Tensor _sparse_compressed_tensor_unsafe_template(const Tensor& compressed_indice const Tensor& plain_indices, \ const Tensor& values, \ IntArrayRef size, \ - c10::optional dtype, \ - c10::optional layout, \ - c10::optional device, \ - c10::optional pin_memory) { \ + std::optional dtype, \ + std::optional layout, \ + std::optional device, \ + std::optional pin_memory) { \ return _sparse_compressed_tensor_unsafe_template(compressed_indices, plain_indices, values, size, dtype, layout, device, pin_memory); \ } @@ -554,10 +554,10 @@ Tensor sparse_compressed_tensor( const Tensor& plain_indices, const Tensor& values, IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { if (!layout) { AT_ERROR("sparse_compressed_tensor expected sparse compressed tensor 
layout but got none"); @@ -583,10 +583,10 @@ Tensor sparse_compressed_tensor( const Tensor& compressed_indices, const Tensor& plain_indices, const Tensor& values, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { if (!layout) { AT_ERROR("sparse_compressed_tensor expected sparse compressed tensor layout but got none"); @@ -614,28 +614,28 @@ Tensor sparse_compressed_tensor( Tensor sparse_##KIND##_tensor(const Tensor& compressed_indices, \ const Tensor& plain_indices, \ const Tensor& values, \ - c10::optional dtype, \ - c10::optional layout, \ - c10::optional device, \ - c10::optional pin_memory) { \ + std::optional dtype, \ + std::optional layout, \ + std::optional device, \ + std::optional pin_memory) { \ if (layout) { \ TORCH_CHECK(layout.value() == REQUIRED_LAYOUT, "sparse " # KIND " layout must be ", REQUIRED_LAYOUT, " but got ", layout.value()); \ } \ - c10::optional layout_(REQUIRED_LAYOUT); \ + std::optional layout_(REQUIRED_LAYOUT); \ return at::native::sparse_compressed_tensor(compressed_indices, plain_indices, values, dtype, layout_, device, pin_memory); \ } \ Tensor sparse_##KIND##_tensor(const Tensor& compressed_indices, \ const Tensor& plain_indices, \ const Tensor& values, \ IntArrayRef size, \ - c10::optional dtype, \ - c10::optional layout, \ - c10::optional device, \ - c10::optional pin_memory) { \ + std::optional dtype, \ + std::optional layout, \ + std::optional device, \ + std::optional pin_memory) { \ if (layout) { \ TORCH_CHECK(layout.value() == REQUIRED_LAYOUT, "sparse " # KIND " layout must be ", REQUIRED_LAYOUT, " but got ", layout.value()); \ } \ - c10::optional layout_(REQUIRED_LAYOUT); \ + std::optional layout_(REQUIRED_LAYOUT); \ return at::native::sparse_compressed_tensor(compressed_indices, plain_indices, values, size, dtype, layout_, device, pin_memory); \ } @@ -650,11 +650,11 @@ SPARSE_COMPRESSED_TENSOR(bsc, kSparseBsc) // indices. The implementation below is kept for BC. 
Tensor empty_sparse_compressed( IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional optional_memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional optional_memory_format) { check_size_nonnegative(size); TORCH_CHECK(size.size() >= 2, "torch.empty: Only batched sparse compressed (non-block) tensors are supported, but got size ", size); @@ -699,7 +699,7 @@ Tensor empty_sparse_compressed( const Tensor& resize_sparse_csr_( const Tensor& self, IntArrayRef size, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { check_size_nonnegative(size); TORCH_CHECK(size.size() >= 2, "torch.resize_: Only batched sparse CSR matrices are supported, but got size ", size); TORCH_CHECK( @@ -836,7 +836,7 @@ const SparseCsrTensor& resize_as_sparse_compressed_( SparseCsrTensor clone_sparse_compressed( const SparseCsrTensor& self, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { TORCH_CHECK( !optional_memory_format.has_value(), "unsupported memory format option ", @@ -863,11 +863,11 @@ SparseCsrTensor clone_sparse_compressed( Tensor empty_like_sparse_csr( const Tensor& self, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional optional_memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional optional_memory_format) { TensorOptions options_ = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); TensorOptions options = self.options() diff --git a/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp b/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp index bff9842a2a3ab..ccac30d65a1a7 100644 --- a/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp @@ -326,7 +326,7 @@ Tensor& normal_sparse_csr_( Tensor& self, double mean, double std, - c10::optional gen) { + std::optional gen) { return unary_op_inplace(self, &Tensor::normal_, mean, std, gen); } @@ -1000,7 +1000,7 @@ struct Reduction...Op { inline scalar_t identity() const { return ...; } }; -Tensor _sparse_csr_..._cpu(const Tensor& input, IntArrayRef dims_to_sum, bool keepdim, c10::optional dtype) { +Tensor _sparse_csr_..._cpu(const Tensor& input, IntArrayRef dims_to_sum, bool keepdim, std::optional dtype) { ... result = reduce_sparse_csr_cpu_template(input_, dims_to_sum, keepdim, Reduction...Op()); ... 
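The sparse factory overloads in this file all accept the same group of optional arguments (dtype, layout, device, pin_memory) and resolve them against defaults before building TensorOptions. A hedged, self-contained sketch of that resolution pattern is below; the enums are stand-ins for ATen's ScalarType/Layout types, chosen only to keep the example compilable on its own.

// Sketch of resolving optional factory arguments with value_or(), as the
// sparse_*_tensor overloads do via TensorOptions. Not ATen code.
#include <iostream>
#include <optional>

enum class ScalarType { Float, Double };
enum class Layout { Strided, Sparse };

struct Options {
  ScalarType dtype;
  Layout layout;
  bool pin_memory;
};

Options resolve_options(std::optional<ScalarType> dtype,
                        std::optional<Layout> layout,
                        std::optional<bool> pin_memory) {
  return Options{
      dtype.value_or(ScalarType::Float),  // fall back to a default dtype
      layout.value_or(Layout::Sparse),    // sparse factories default to a sparse layout
      pin_memory.value_or(false)};        // pinning is opt-in
}

int main() {
  // Caller only requests pinned memory; dtype/layout fall back to defaults.
  Options o = resolve_options(std::nullopt, std::nullopt, true);
  std::cout << (o.dtype == ScalarType::Float) << " "
            << (o.layout == Layout::Sparse) << " "
            << o.pin_memory << "\n";  // prints: 1 1 1
  return 0;
}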
@@ -1336,7 +1336,7 @@ struct ReductionMulOp { } // namespace -Tensor _sparse_csr_sum_cpu(const Tensor& input, IntArrayRef dims_to_sum, bool keepdim, c10::optional dtype) { +Tensor _sparse_csr_sum_cpu(const Tensor& input, IntArrayRef dims_to_sum, bool keepdim, std::optional dtype) { ScalarType dtype_ = dtype.value_or(input.scalar_type()); Tensor input_ = at::sparse_csr::to_type(input, dtype_); Tensor result; @@ -1352,7 +1352,7 @@ Tensor _sparse_csr_sum_cpu(const Tensor& input, IntArrayRef dims_to_sum, bool ke return result; } -Tensor _sparse_csr_prod_cpu(const Tensor& input, IntArrayRef dims_to_reduce, bool keepdim, c10::optional dtype) { +Tensor _sparse_csr_prod_cpu(const Tensor& input, IntArrayRef dims_to_reduce, bool keepdim, std::optional dtype) { ScalarType dtype_ = dtype.value_or(input.scalar_type()); Tensor input_ = input.to(dtype_); Tensor result; diff --git a/aten/src/ATen/native/sparse/SparseFactories.cpp b/aten/src/ATen/native/sparse/SparseFactories.cpp index 6ee92320e12d1..38a59b40c808a 100644 --- a/aten/src/ATen/native/sparse/SparseFactories.cpp +++ b/aten/src/ATen/native/sparse/SparseFactories.cpp @@ -22,7 +22,7 @@ Tensor spdiags( const Tensor& diagonals, const Tensor& offsets, IntArrayRef shape, - c10::optional layout) { + std::optional layout) { auto diagonals_2d = diagonals.dim() == 1 ? diagonals.unsqueeze(0) : diagonals; TORCH_CHECK(diagonals_2d.dim() == 2, "Diagonals must be vector or matrix"); TORCH_CHECK(shape.size() == 2, "Output shape must be 2d"); diff --git a/aten/src/ATen/native/sparse/SparseStubs.h b/aten/src/ATen/native/sparse/SparseStubs.h index 2a3aef5c8bd92..af6df0785fe92 100644 --- a/aten/src/ATen/native/sparse/SparseStubs.h +++ b/aten/src/ATen/native/sparse/SparseStubs.h @@ -13,10 +13,10 @@ namespace native { using mul_sparse_sparse_out_fn = void (*)(Tensor& res, const Tensor& x, const Tensor& y); DECLARE_DISPATCH(mul_sparse_sparse_out_fn, mul_sparse_sparse_out_stub); -using sparse_mask_intersection_out_fn = void (*)(Tensor& res, const Tensor& x, const Tensor& y, const c10::optional& x_hash_opt); +using sparse_mask_intersection_out_fn = void (*)(Tensor& res, const Tensor& x, const Tensor& y, const std::optional& x_hash_opt); DECLARE_DISPATCH(sparse_mask_intersection_out_fn, sparse_mask_intersection_out_stub); -using sparse_mask_projection_out_fn = void (*)(Tensor& res, const Tensor& x, const Tensor& y, const c10::optional& x_hash_opt, bool accumulate_matches); +using sparse_mask_projection_out_fn = void (*)(Tensor& res, const Tensor& x, const Tensor& y, const std::optional& x_hash_opt, bool accumulate_matches); DECLARE_DISPATCH(sparse_mask_projection_out_fn, sparse_mask_projection_out_stub); using flatten_indices_fn = Tensor (*)(const Tensor& indices, IntArrayRef size); diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index add7f433731a2..e9f10d964b320 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -143,10 +143,10 @@ Tensor values_default(const Tensor& self) { /*** Helper methods ***/ static SparseTensor new_sparse( - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { AT_ASSERT(layout.has_value() && *layout == kSparse); DispatchKey dispatch_key; switch (device_or_default(device).type()) { @@ -170,10 +170,10 @@ SparseTensor new_with_dims_sparse( int64_t sparse_dim, int64_t dense_dim, ArrayRef 
size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { SparseTensor self = new_sparse(dtype, layout, device, pin_memory); get_sparse_impl(self)->resize_and_clear_(sparse_dim, dense_dim, size); return self; @@ -185,11 +185,11 @@ SparseTensor new_with_dims_and_tensor_sparse_symint( c10::SymIntArrayRef size, const Tensor& indices, const Tensor& values, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional is_coalesced) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional is_coalesced) { SparseTensor self = new_sparse(dtype, layout, device, pin_memory); auto impl = get_sparse_impl(self); impl->resize_(sparse_dim, dense_dim, size); @@ -228,11 +228,11 @@ SparseTensor new_with_dims_and_tensor_sparse_symint( /** Empty init **/ Tensor empty_sparse( IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional optional_memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional optional_memory_format) { TORCH_CHECK( !pin_memory.has_value() || !*pin_memory, "Only dense CPU tensors can be pinned"); @@ -242,10 +242,10 @@ Tensor empty_sparse( /* Shape init */ Tensor sparse_coo_tensor(IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -268,11 +268,11 @@ static inline Tensor expand_values_if_needed(const Tensor& values) { } // namespace Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values_, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional is_coalesced) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional is_coalesced) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); @@ -352,7 +352,7 @@ void _validate_sparse_coo_tensor_args( const Tensor& indices, const Tensor& values_, ArrayRef size, - c10::optional is_coalesced_) { + std::optional is_coalesced_) { Tensor values = expand_values_if_needed(values_); bool is_coalesced = is_coalesced_.value_or(false); @@ -425,11 +425,11 @@ void _validate_sparse_coo_tensor_args( // NB: Got rid of the sizes == NULL case Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values, IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional is_coalesced) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional is_coalesced) { // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); // arg checking @@ -449,11 +449,11 @@ Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values, IntArrayRe } Tensor 
_sparse_coo_tensor_unsafe(const Tensor& indices, const Tensor& values_, at::IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional is_coalesced) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional is_coalesced) { if (at::globalContext().checkSparseTensorInvariants()) { at::native::_validate_sparse_coo_tensor_args(indices, values_, size, is_coalesced); } @@ -467,11 +467,11 @@ Tensor _sparse_coo_tensor_unsafe(const Tensor& indices, const Tensor& values_, a // _validate_sparse_coo_tensor_args before using the tensor. // NB: Got rid of the size == NULL case Tensor _sparse_coo_tensor_unsafe_symint(const Tensor& indices, const Tensor& values_, c10::SymIntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional is_coalesced) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional is_coalesced) { // See [Note: hacky wrapper removal for TensorOptions] Tensor values = expand_values_if_needed(values_); @@ -495,7 +495,7 @@ Tensor _sparse_coo_tensor_unsafe_symint(const Tensor& indices, const Tensor& val SparseTensor clone_sparse( const SparseTensor& self, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { TORCH_CHECK( !optional_memory_format.has_value(), "unsupported memory format option ", @@ -687,7 +687,7 @@ SparseTensor _coalesce_sparse_cpu(const SparseTensor& self) { DEFINE_DISPATCH(sparse_mask_intersection_out_stub); DEFINE_DISPATCH(sparse_mask_projection_out_stub); -using OptTensor = c10::optional; +using OptTensor = std::optional; static std::tuple sparse_mask_like_prepare_sparse_inputs( const std::string& method_name, @@ -814,11 +814,11 @@ Tensor sparse_mask_projection(const Tensor& t, const Tensor& mask, bool accumula Tensor empty_like_sparse_coo( const Tensor& self, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional optional_memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional optional_memory_format) { TensorOptions options_ = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); TORCH_CHECK( diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index a3227df942c45..f058c68579f86 100644 --- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -220,7 +220,7 @@ static SparseTensor& coalesce_(SparseTensor& tensor) { // div(SparseTensor, Scalar) // -------------------------------------------------------------------- -SparseTensor& div_out_sparse_zerodim(const SparseTensor& t, const Tensor& value, c10::optional rounding_mode, SparseTensor& r) { +SparseTensor& div_out_sparse_zerodim(const SparseTensor& t, const Tensor& value, std::optional rounding_mode, SparseTensor& r) { TORCH_CHECK(value.dim() == 0, "Sparse division requires a scalar or ", "zero-dim dense tensor divisor (got shape ", value.sizes(), " for divisor)"); TORCH_CHECK(!value.is_sparse(), "Sparse division requires a scalar or ", @@ -274,7 +274,7 @@ static SparseTensor& div_out_sparse_scalar(const SparseTensor& t, Scalar value, return div_out_sparse_zerodim(t, wrapped_scalar_tensor(value), r); } -Tensor div_sparse(const Tensor& self, const Tensor& 
value, c10::optional rounding_mode) { +Tensor div_sparse(const Tensor& self, const Tensor& value, std::optional rounding_mode) { auto commonDtype = at::result_type(self, value); if (c10::isIntegralType(commonDtype, /*includeBool=*/true) && !rounding_mode.has_value()) { commonDtype = typeMetaToScalarType(at::get_default_dtype()); @@ -283,11 +283,11 @@ Tensor div_sparse(const Tensor& self, const Tensor& value, c10::optional rounding_mode) { +Tensor& div_sparse_(Tensor& self, const Tensor& value, std::optional rounding_mode) { return div_out_sparse_zerodim(self, value, std::move(rounding_mode), self); } -static SparseTensor& div_out_sparse_scalar(const SparseTensor& t, Scalar value, c10::optional rounding_mode, SparseTensor& r) { +static SparseTensor& div_out_sparse_scalar(const SparseTensor& t, Scalar value, std::optional rounding_mode, SparseTensor& r) { return div_out_sparse_zerodim(t, wrapped_scalar_tensor(value), std::move(rounding_mode), r); } diff --git a/aten/src/ATen/native/sparse/SparseUnaryOps.cpp b/aten/src/ATen/native/sparse/SparseUnaryOps.cpp index ce6e3d4eac11b..f5445ba4bd48d 100644 --- a/aten/src/ATen/native/sparse/SparseUnaryOps.cpp +++ b/aten/src/ATen/native/sparse/SparseUnaryOps.cpp @@ -257,16 +257,16 @@ Tensor& threshold_backward_sparse_out( } Tensor nan_to_num_sparse( - const Tensor &self, c10::optional nan, - c10::optional posinf, c10::optional neginf) { + const Tensor &self, std::optional nan, + std::optional posinf, c10::optional neginf) { return coalesced_unary_ufunc( self, [&](const Tensor &t) { return at::nan_to_num(t, nan, posinf, neginf); }); } Tensor& nan_to_num_sparse_out( - const Tensor &self, c10::optional nan, - c10::optional posinf, c10::optional neginf, + const Tensor &self, std::optional nan, + std::optional posinf, c10::optional neginf, Tensor &out) { return coalesced_unary_ufunc_out( self, out, [&](const Tensor &t, Tensor &out) { @@ -274,8 +274,8 @@ Tensor& nan_to_num_sparse_out( }); } Tensor& nan_to_num_sparse_( - Tensor &self, c10::optional nan, - c10::optional posinf, c10::optional neginf) { + Tensor &self, std::optional nan, + std::optional posinf, c10::optional neginf) { TORCH_CHECK(self.is_coalesced(), "nan_to_num_ requires coalesced input"); return nan_to_num_sparse_out(self, nan, posinf, neginf, self); } diff --git a/aten/src/ATen/native/sparse/cuda/SparseCsrTensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCsrTensorMath.cu index 75474e77ea848..1ee5a8b9d2c01 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCsrTensorMath.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCsrTensorMath.cu @@ -389,7 +389,7 @@ struct Reduction...Op { }; -Tensor _sparse_csr_..._cuda(const Tensor& input, IntArrayRef dims_to_sum, bool keepdim, c10::optional dtype) { +Tensor _sparse_csr_..._cuda(const Tensor& input, IntArrayRef dims_to_sum, bool keepdim, std::optional dtype) { ... result = reduce_sparse_csr_cuda_template(input_, dims_to_sum, keepdim, Reduction...Op()); ... 
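The nan_to_num_sparse* signatures above carry three optional replacement values (nan, posinf, neginf, each an optional double in the real code). A small self-contained sketch of how such optional replacements resolve per element is shown below; nan_to_num_scalar is an illustrative helper, not the ATen implementation, which applies the same logic elementwise over the coalesced values tensor.

// Sketch of NaN/Inf replacement driven by optional arguments, mirroring the
// defaults of torch.nan_to_num: NaN -> 0, +Inf -> max finite, -Inf -> lowest.
#include <cmath>
#include <iostream>
#include <limits>
#include <optional>

double nan_to_num_scalar(double x,
                         std::optional<double> nan = std::nullopt,
                         std::optional<double> posinf = std::nullopt,
                         std::optional<double> neginf = std::nullopt) {
  if (std::isnan(x)) {
    return nan.value_or(0.0);  // NaN defaults to 0
  }
  if (std::isinf(x)) {
    if (x > 0) {
      return posinf.value_or(std::numeric_limits<double>::max());
    }
    return neginf.value_or(std::numeric_limits<double>::lowest());
  }
  return x;  // finite values pass through unchanged
}

int main() {
  const double nan = std::numeric_limits<double>::quiet_NaN();
  const double inf = std::numeric_limits<double>::infinity();
  std::cout << nan_to_num_scalar(nan) << "\n";           // 0
  std::cout << nan_to_num_scalar(inf, {}, 1e6) << "\n";  // 1e6 (caller override)
  std::cout << nan_to_num_scalar(-inf) << "\n";          // lowest finite double
  std::cout << nan_to_num_scalar(3.5) << "\n";           // 3.5
  return 0;
}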
@@ -708,7 +708,7 @@ struct ReductionMulOp { } // namespace -Tensor _sparse_csr_sum_cuda(const Tensor& input, IntArrayRef dims_to_sum, bool keepdim, c10::optional dtype) { +Tensor _sparse_csr_sum_cuda(const Tensor& input, IntArrayRef dims_to_sum, bool keepdim, std::optional dtype) { ScalarType dtype_ = dtype.value_or(input.scalar_type()); Tensor input_ = at::sparse_csr::to_type(input, dtype_); Tensor result; @@ -724,7 +724,7 @@ Tensor _sparse_csr_sum_cuda(const Tensor& input, IntArrayRef dims_to_sum, bool k return result; } -Tensor _sparse_csr_prod_cuda(const Tensor& input, IntArrayRef dims_to_reduce, bool keepdim, c10::optional dtype) { +Tensor _sparse_csr_prod_cuda(const Tensor& input, IntArrayRef dims_to_reduce, bool keepdim, std::optional dtype) { ScalarType dtype_ = dtype.value_or(input.scalar_type()); Tensor input_ = input.to(dtype_); Tensor result; diff --git a/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredLinear.cu b/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredLinear.cu index 47ee1568beb1e..01aa11dbdecb5 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredLinear.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredLinear.cu @@ -600,9 +600,9 @@ Tensor two_four_sgemm_dispatch_layouts_bias_activation( // number of checks throughout the code. Tensor _sparse_semi_structured_linear( const Tensor& input, const Tensor& weight, - const Tensor& meta, const c10::optional& bias_opt, - const c10::optional activation_opt, - const c10::optional out_dtype_opt) { + const Tensor& meta, const std::optional& bias_opt, + const std::optional activation_opt, + const std::optional out_dtype_opt) { TORCH_WARN_ONCE("_sparse_semi_structured_linear is deprecated and will be " "removed in a future PyTorch release. Please use " "_sparse_semi_structured_mm/_sparse_semi_structured_addmm " diff --git a/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredOps.cu b/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredOps.cu index 8c05acc66bc92..abd6cf9739c63 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredOps.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredOps.cu @@ -522,8 +522,8 @@ void spgemm_cutlass_dispatch_layouts_tensor_c( // aten._sparse_semi_structured_addmm operators. Tensor sparse_semi_structured_mad_op( const Tensor& mat1, const Tensor& mat1_meta, const Tensor& mat2, - const c10::optional& input_opt, const Scalar& alpha, - const Scalar& beta, const c10::optional out_dtype_opt) { + const std::optional& input_opt, const Scalar& alpha, + const Scalar& beta, const std::optional out_dtype_opt) { #if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080) AT_ERROR(__func__, " : CUTLASS not supported"); return Tensor{}; @@ -787,9 +787,9 @@ Tensor sparse_semi_structured_mad_op( // Implementation of aten._sparse_semi_structured_mm operator. 
Tensor _sparse_semi_structured_mm( const Tensor& mat1, const Tensor& mat1_meta, const Tensor& mat2, - const c10::optional out_dtype_opt) { + const std::optional out_dtype_opt) { return sparse_semi_structured_mad_op(mat1, mat1_meta, mat2, - c10::optional(), 1, 0, + std::optional(), 1, 0, out_dtype_opt); } @@ -797,7 +797,7 @@ Tensor _sparse_semi_structured_mm( Tensor _sparse_semi_structured_addmm( const Tensor& input, const Tensor& mat1, const Tensor& mat1_meta, const Tensor& mat2, const Scalar& alpha, const Scalar& beta, - const c10::optional out_dtype_opt) { + const std::optional out_dtype_opt) { return sparse_semi_structured_mad_op(mat1, mat1_meta, mat2, input, alpha, beta, out_dtype_opt); } diff --git a/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredTile.cu b/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredTile.cu index fd5a04fa61039..b5382b5b08486 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredTile.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredTile.cu @@ -207,7 +207,7 @@ std::tuple sparse_semi_structured_tile_t std::string algorithm) { using KT = KernelTypes; - c10::optional device_guard; + std::optional device_guard; if (!input.is_meta()) { device_guard.emplace(input.device()); } diff --git a/aten/src/ATen/native/sparse/cuda/SparseSemiSturcturedApply.cu b/aten/src/ATen/native/sparse/cuda/SparseSemiSturcturedApply.cu index 023e8f73930fd..2fbbaa0290703 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseSemiSturcturedApply.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseSemiSturcturedApply.cu @@ -34,7 +34,7 @@ std::tuple _sparse_semi_structured_apply_typed(Tensor input, Ten if (input.stride(1) != 1) { input = input.contiguous(); } - c10::optional device_guard; + std::optional device_guard; if (!kIsMeta) { device_guard.emplace(input.device()); } diff --git a/aten/src/ATen/native/sparse/cuda/cuSPARSELtOps.cpp b/aten/src/ATen/native/sparse/cuda/cuSPARSELtOps.cpp index c66fbf8f2a93d..384fa2422b247 100644 --- a/aten/src/ATen/native/sparse/cuda/cuSPARSELtOps.cpp +++ b/aten/src/ATen/native/sparse/cuda/cuSPARSELtOps.cpp @@ -101,9 +101,9 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) std::tuple _cslt_sparse_mm_impl( const Tensor& compressed_A, const Tensor& dense_B, - const c10::optional& bias_opt, - const c10::optional& alpha_opt, - const c10::optional out_dtype_opt, + const std::optional& bias_opt, + const std::optional& alpha_opt, + const std::optional out_dtype_opt, bool transpose_result, int alg_id, bool search_alg_id @@ -343,9 +343,9 @@ std::tuple _cslt_sparse_mm_impl( at::Tensor _cslt_sparse_mm( const Tensor& compressed_A, const Tensor& dense_B, - const c10::optional& bias_opt, - const c10::optional& alpha_opt, - const c10::optional out_dtype_opt, + const std::optional& bias_opt, + const std::optional& alpha_opt, + const std::optional out_dtype_opt, bool transpose_result, int64_t alg_id ) @@ -365,9 +365,9 @@ at::Tensor _cslt_sparse_mm( int64_t _cslt_sparse_mm_search( const Tensor& compressed_A, const Tensor& dense_B, - const c10::optional& bias_opt, - const c10::optional& alpha_opt, - const c10::optional out_dtype_opt, + const std::optional& bias_opt, + const std::optional& alpha_opt, + const std::optional out_dtype_opt, bool transpose_result ) { @@ -398,9 +398,9 @@ at::Tensor _cslt_compress(const Tensor& sparse_input){ at::Tensor _cslt_sparse_mm( const Tensor& compressed_A, const Tensor& dense_B, - const c10::optional& bias_opt, - const c10::optional& alpha_opt, - const c10::optional out_dtype, + const std::optional& bias_opt, + 
const std::optional& alpha_opt, + const std::optional out_dtype, bool transpose_result, int64_t alg_id) { @@ -410,9 +410,9 @@ at::Tensor _cslt_sparse_mm( int64_t _cslt_sparse_mm_search( const Tensor& compressed_A, const Tensor& dense_B, - const c10::optional& bias_opt, - const c10::optional& alpha_opt, - const c10::optional out_dtype, + const std::optional& bias_opt, + const std::optional& alpha_opt, + const std::optional out_dtype, bool transpose_result ) { diff --git a/aten/src/ATen/native/transformers/attention.cpp b/aten/src/ATen/native/transformers/attention.cpp index e26de29537954..ede02ab1352f0 100644 --- a/aten/src/ATen/native/transformers/attention.cpp +++ b/aten/src/ATen/native/transformers/attention.cpp @@ -106,9 +106,9 @@ Tensor bmm_nt(const Tensor& a, const Tensor& b) { Tensor masked_softmax( Tensor& attn_scores, - c10::optional attn_mask, + std::optional attn_mask, const Tensor& query, - c10::optional mask_type) { + std::optional mask_type) { if (query.is_nested() && !attn_mask) { return at::_nested_tensor_softmax_with_shape(attn_scores, query); } @@ -267,10 +267,10 @@ std::tuple native_multi_head_attention_cpu( const Tensor& qkv_bias, const Tensor& proj_weight, const Tensor& proj_bias, - const c10::optional& mask, + const std::optional& mask, bool need_weights, bool average_attn_weights, - const c10::optional mask_type) { + const std::optional mask_type) { // query shape: [B, T, D] // qkv_weight shape: [3 * D, D] @@ -423,7 +423,7 @@ std::tuple native_multi_head_attention_cpu( } int64_t _fused_sdp_choice_cpp(const Tensor& query_, const Tensor& key, const Tensor& value, - const c10::optional& attn_mask_, double dropout_p, bool is_causal, c10::optional scale){ + const std::optional& attn_mask_, double dropout_p, bool is_causal, c10::optional scale){ sdp::sdp_params kernel_params{query_, key, value, attn_mask_, dropout_p, is_causal}; auto backend = sdp::select_sdp_backend_cpp(kernel_params); if (backend == sdp::SDPBackend::error) { @@ -445,10 +445,10 @@ int64_t _fused_sdp_choice_meta( const Tensor& query_, const Tensor& key, const Tensor& value, - const c10::optional& attn_mask_, + const std::optional& attn_mask_, double dropout_p, bool is_causal, - c10::optional scale) { + std::optional scale) { auto query_key_set = query_.key_set(); #if defined(USE_ROCM) bool has_rocm = query_key_set.has(c10::DispatchKey::HIP); @@ -479,10 +479,10 @@ inline void validate_sdpa_input( const Tensor& query_, const Tensor& key, const Tensor& value, - const c10::optional& attn_mask_, + const std::optional& attn_mask_, double dropout_p, bool is_causal, - c10::optional scale) { + std::optional scale) { TORCH_CHECK( query_.dtype() == key.dtype() && query_.dtype() == value.dtype(), "Expected query, key, and value to have the same dtype, but got query.dtype: ", @@ -512,7 +512,7 @@ inline void validate_sdpa_input( // the math and memory efficient attn_mask implementation // Args: // attn_mask: attn_mask of shape (B, L, S) or (L, S) or (B, N_heads, L, S) -c10::optional convert_boolean_attn_mask(const c10::optional& attn_mask, caffe2::TypeMeta dtype) { +std::optional convert_boolean_attn_mask(const c10::optional& attn_mask, caffe2::TypeMeta dtype) { // Pass through if(!attn_mask.has_value()){ return c10::nullopt; @@ -598,7 +598,7 @@ at::Tensor post_process_flash_output( } int64_t handle_private_use(const Tensor& query_, const Tensor& key, const Tensor& value, - const c10::optional& attn_mask_, double dropout_p, bool is_causal, c10::optional scale){ + const std::optional& attn_mask_, double dropout_p, bool 
is_causal, c10::optional scale){ int64_t choice_int = static_cast(sdp::SDPBackend::math); try { choice_int = _fused_sdp_choice_stub(query_.device().type(), @@ -643,10 +643,10 @@ Tensor scaled_dot_product_attention( const Tensor& query_, const Tensor& key, const Tensor& value, - const c10::optional& attn_mask_, + const std::optional& attn_mask_, double dropout_p, bool is_causal, - c10::optional scale) { + std::optional scale) { validate_sdpa_input(query_, key, value, attn_mask_, dropout_p, is_causal, scale); int64_t choice_int = static_cast(sdp::SDPBackend::math); if (query_.device().type() == DeviceType::CUDA @@ -662,7 +662,7 @@ Tensor scaled_dot_product_attention( } } sdp::SDPBackend backend = static_cast(choice_int); - c10::optional attn_mask = convert_boolean_attn_mask(attn_mask_, query_.dtype()); + std::optional attn_mask = convert_boolean_attn_mask(attn_mask_, query_.dtype()); switch (backend) { case sdp::SDPBackend::cudnn_attention: { bool compute_logsumexp = @@ -719,8 +719,8 @@ Tensor scaled_dot_product_attention( std::tuple _scaled_dot_product_attention_math( const Tensor& query_, const Tensor& key, const Tensor& value, - const c10::optional& attn_mask_, double dropout_p, bool is_causal, - const c10::optional& dropout_mask, c10::optional scale) { + const std::optional& attn_mask_, double dropout_p, bool is_causal, + const std::optional& dropout_mask, c10::optional scale) { C10_LOG_API_USAGE_ONCE("torch.sdpa.math_fallback"); if (query_.is_nested() || key.is_nested() || value.is_nested()) { TORCH_CHECK( @@ -779,8 +779,8 @@ _scaled_dot_product_flash_attention_cpu( const Tensor& value, double dropout_p, bool is_causal, - const c10::optional& attn_mask, - c10::optional scale) { + const std::optional& attn_mask, + std::optional scale) { const auto dtype = query.scalar_type(); int64_t batchSize = query.size(0); int64_t qSize = query.size(2); @@ -827,8 +827,8 @@ _scaled_dot_product_flash_attention_cpu_backward( const Tensor& logsumexp, double dropout_p, bool is_causal, - const c10::optional& attn_mask, - c10::optional scale) { + const std::optional& attn_mask, + std::optional scale) { if (!grad_out.defined()) { return std::make_tuple(Tensor{}, Tensor{}, Tensor{}); } @@ -864,7 +864,7 @@ Tensor triton_multi_head_attention( const Tensor& qkv_bias, const Tensor& proj_weight, const Tensor& proj_bias, - const c10::optional& mask) { + const std::optional& mask) { // query shape: [B, T, D] // qkv_weight shape: [3 * D, D] TORCH_CHECK(!mask, "Only causal mask is supported for Triton."); diff --git a/aten/src/ATen/native/transformers/attention.h b/aten/src/ATen/native/transformers/attention.h index 2d2740a92e7dc..0e4a52f445442 100644 --- a/aten/src/ATen/native/transformers/attention.h +++ b/aten/src/ATen/native/transformers/attention.h @@ -9,16 +9,16 @@ namespace at { namespace native { using fused_sdp_choice_fn = int64_t (*)(const Tensor& query_, const Tensor& key, const Tensor& value, - const c10::optional& attn_mask_, double dropout_p, bool is_causal, c10::optional scale); + const std::optional& attn_mask_, double dropout_p, bool is_causal, c10::optional scale); DECLARE_DISPATCH(fused_sdp_choice_fn, _fused_sdp_choice_stub); TORCH_API Tensor bmm_nt(const Tensor& a, const Tensor& b); TORCH_API Tensor masked_softmax( Tensor& attn_scores, - c10::optional attn_mask, + std::optional attn_mask, const Tensor& query, - c10::optional mask_type = {}); + std::optional mask_type = {}); using transform_bias_rescale_qkv_fn = void(*)( at::ScalarType type, @@ -53,8 +53,8 @@ using flash_attention_fn = void (*)( 
const Tensor& output, const Tensor& logsumexp, const Tensor& query, const Tensor& key, const Tensor& value, double dropout_p, bool is_causal, - c10::optional attn_mask, - c10::optional scale); + std::optional attn_mask, + std::optional scale); using flash_attention_backward_fn = void (*)( const Tensor& grad_q, const Tensor& grad_k, @@ -62,8 +62,8 @@ using flash_attention_backward_fn = void (*)( const Tensor& query, const Tensor& key, const Tensor& value, const Tensor& out, const Tensor& logsumexp, double dropout_p, bool is_causal, - c10::optional attn_mask, - c10::optional scale); + std::optional attn_mask, + std::optional scale); DECLARE_DISPATCH(flash_attention_fn, flash_attention_kernel); DECLARE_DISPATCH(flash_attention_backward_fn, flash_attention_backward_kernel); diff --git a/aten/src/ATen/native/transformers/cuda/attention.cu b/aten/src/ATen/native/transformers/cuda/attention.cu index dcf451feead7b..e55560791a085 100644 --- a/aten/src/ATen/native/transformers/cuda/attention.cu +++ b/aten/src/ATen/native/transformers/cuda/attention.cu @@ -479,10 +479,10 @@ std::tuple native_multi_head_attention_cuda( const Tensor& qkv_bias, const Tensor& proj_weight, const Tensor& proj_bias, - const c10::optional& mask, + const std::optional& mask, bool need_weights, bool average_attn_weights, - const c10::optional mask_type) { + const std::optional mask_type) { // query shape: [B, T, D] // qkv_weight shape: [3 * D, D] @@ -681,7 +681,7 @@ std::tuple scale) { + std::optional scale) { // Used for tracking usage statistics C10_LOG_API_USAGE_ONCE("torch.sdpa.flash_attention"); // Query (Batch x Num_heads x Q_seq_len x Dim_per_head) @@ -733,7 +733,7 @@ std::tuple scale) { + std::optional scale) { // Used for tracking usage statistics C10_LOG_API_USAGE_ONCE("torch.sdpa.flash_attention_cudnn"); // Query (Batch x Num_heads x Q_seq_len x Dim_per_head) @@ -780,11 +780,11 @@ std::tuple _scaled_dot_product_efficient_attenti const Tensor& query, const Tensor& key, const Tensor& value, - const c10::optional& attn_bias, + const std::optional& attn_bias, bool compute_log_sumexp, double dropout_p, bool is_causal, - c10::optional scale) { + std::optional scale) { // Used for tracking usage statistics C10_LOG_API_USAGE_ONCE("torch.sdpa.mem_efficient_attention"); // Query -> Query(Batch x Q_seq_len x Num_heads x Dim_per_head) @@ -817,7 +817,7 @@ std::tuple _scaled_dot_product_efficient_attenti } int64_t _fused_sdp_choice_cuda(const Tensor& query_, const Tensor& key, const Tensor& value, - const c10::optional& attn_mask_, double dropout_p, bool is_causal, c10::optional scale){ + const std::optional& attn_mask_, double dropout_p, bool is_causal, c10::optional scale){ sdp::sdp_params kernel_params{query_, key, value, attn_mask_, dropout_p, is_causal}; auto backend = select_sdp_backend(kernel_params); if (backend == sdp::SDPBackend::error) { @@ -834,23 +834,23 @@ _flash_attention_forward( const Tensor& query, const Tensor& key, const Tensor& value, - const c10::optional& cumulative_sequence_length_q, - const c10::optional& cumulative_sequence_length_k, + const std::optional& cumulative_sequence_length_q, + const std::optional& cumulative_sequence_length_k, int64_t max_seqlen_batch_q, int64_t max_seqlen_batch_k, double dropout_p, bool is_causal, bool return_debug_mask, - c10::optional scale) { + std::optional scale) { #if defined(USE_FLASH_ATTENTION) const auto softmax_scale = sdp::calculate_scale(query, scale).as_float_unchecked(); - c10::optional out = c10::nullopt; + std::optional out = c10::nullopt; // This can be used 
when your sequence length k is not the full extent // of the tensor. This is useful for kv cache scenarios but for now // we will not support in this PR. - c10::optional seqused_k = c10::nullopt; - c10::optional alibi_slopes = c10::nullopt; + std::optional seqused_k = c10::nullopt; + std::optional alibi_slopes = c10::nullopt; // We are going to have two paths: // 1. The standard MHA path for dense tensors @@ -937,23 +937,23 @@ std::tuple _efficient_ const at::Tensor& query, // [b, seqlen, num_heads, K] const at::Tensor& key, // [b, seqlen, num_heads, K] const at::Tensor& value, // [b, seqlen, num_heads, Kv] - const c10::optional& bias, // [b, num_heads, seqlen, seqlen] + const std::optional& bias, // [b, num_heads, seqlen, seqlen] // (Mode 1MHK only) [b+1]: cu_seqlens_q[b] contains the // position of the first query token for batch $b - const c10::optional& seqstart_q, + const std::optional& seqstart_q, // (Mode 1MHK only) [b+1]: cu_seqlen_k[b] contains the // position of the first key token for batch $b - const c10::optional& seqstart_k, + const std::optional& seqstart_k, // (Mode 1MHK only) Maximum sequence length across batches - const c10::optional max_seqlen_q_, - const c10::optional max_seqlen_k_, + const std::optional max_seqlen_q_, + const std::optional max_seqlen_k_, double dropout_p, // attention matrix dropout probability int64_t custom_mask_type, bool compute_logsumexp, - c10::optional scale, - const c10::optional& causal_diagonal, - const c10::optional& seqlen_k, - const c10::optional window_size) { + std::optional scale, + const std::optional& causal_diagonal, + const std::optional& seqlen_k, + const std::optional window_size) { #if defined(USE_MEM_EFF_ATTENTION) // TODO In theory it is possible to compile with _CUDA_ARCH < 5.0 and run on a // machine that is >= 5.0. 
In practice, this is not a problem but since diff --git a/aten/src/ATen/native/transformers/cuda/attention_backward.cu b/aten/src/ATen/native/transformers/cuda/attention_backward.cu index 0405b6d73329f..78c2d54fdc8a6 100644 --- a/aten/src/ATen/native/transformers/cuda/attention_backward.cu +++ b/aten/src/ATen/native/transformers/cuda/attention_backward.cu @@ -66,22 +66,22 @@ std::tuple _flash_attention_backward( bool is_causal, const Tensor& philox_seed, const Tensor& philox_offset, - c10::optional scale) { + std::optional scale) { #if defined(USE_FLASH_ATTENTION) const auto softmax_scale = sdp::calculate_scale(query, scale).as_float_unchecked(); // CUDA code assumes that dout is contiguous auto contiguous_grad_out = grad_out.contiguous(); auto contiguous_out = out.contiguous(); - c10::optional dq{c10::nullopt}; - c10::optional dk{c10::nullopt}; - c10::optional dv{c10::nullopt}; + std::optional dq{c10::nullopt}; + std::optional dk{c10::nullopt}; + std::optional dv{c10::nullopt}; // The kernel computes irregardless we will drop for this functions return Tensor grad_softmax; // Currently unused args: - c10::optional alibi_slopes{c10::nullopt}; + std::optional alibi_slopes{c10::nullopt}; bool determinisitic{false}; auto& ctx = at::globalContext(); @@ -167,7 +167,7 @@ std::tuple _scaled_dot_product_cudnn_attention_backward_ bool is_causal, const Tensor& philox_seed, const Tensor& philox_offset, - c10::optional scale) { + std::optional scale) { const int64_t batch_size = query.size(0); const int64_t num_heads = query.size(1); const int64_t head_dim = query.size(3); @@ -205,14 +205,14 @@ _efficient_attention_backward( const at::Tensor& query, const at::Tensor& key, const at::Tensor& value, - const c10::optional& kernel_bias, // additive attention bias + const std::optional& kernel_bias, // additive attention bias const at::Tensor& out, // (Mode 1MHK only) [b+1]: cu_seqlens_q[b] contains the // position of the first query token for batch $b - const c10::optional& cu_seqlens_q_dummy, + const std::optional& cu_seqlens_q_dummy, // (Mode 1MHK only) [b+1]: cu_seqlens_k[b] contains the // position of the first key token for batch $b - const c10::optional& cu_seqlens_k_dummy, + const std::optional& cu_seqlens_k_dummy, // (Mode 1MHK only) Maximum sequence length across batches int64_t max_seqlen_q, // (Mode 1MHK only) Maximum sequence length across batches @@ -223,9 +223,9 @@ _efficient_attention_backward( const at::Tensor& philox_offset, // offset into random number sequence int64_t custom_mask_type, const bool bias_requires_grad, - const c10::optional scale, - c10::optional num_splits_key, - const c10::optional window_size) { + const std::optional scale, + std::optional num_splits_key, + const std::optional window_size) { #if defined(USE_MEM_EFF_ATTENTION) if (!grad_out_.defined()) { return std::make_tuple(Tensor{}, Tensor{}, Tensor{}, Tensor{}); @@ -233,8 +233,8 @@ _efficient_attention_backward( // This path is used when we directly call _efficient_attention_forward // from python. // This is needed because SaveVariable automatically converts - // c10::optional to undefined tensor - c10::optional bias, cu_seqlens_q, cu_seqlens_k; + // std::optional to undefined tensor + std::optional bias, cu_seqlens_q, cu_seqlens_k; bias = kernel_bias.has_value() && !kernel_bias->defined() ? c10::nullopt : kernel_bias; cu_seqlens_q = cu_seqlens_q_dummy.has_value() && !cu_seqlens_q_dummy->defined() ? c10::nullopt : cu_seqlens_q_dummy; cu_seqlens_k = cu_seqlens_k_dummy.has_value() && !cu_seqlens_k_dummy->defined() ? 
c10::nullopt : cu_seqlens_k_dummy; @@ -603,7 +603,7 @@ std::tuple _scaled_dot_product_flash_attenti bool is_causal, const at::Tensor& philox_seed, const at::Tensor& philox_offset, - c10::optional scale){ + std::optional scale){ if (!grad_out_.defined()) { return std::make_tuple(Tensor{}, Tensor{}, Tensor{}); } @@ -653,7 +653,7 @@ std::tuple _scaled_dot_product_e double dropout_p, std::array grad_input_mask, bool causal, - c10::optional scale) { + std::optional scale) { if (!grad_out_.defined()) { return std::make_tuple(Tensor{}, Tensor{}, Tensor{}, Tensor{}); @@ -667,8 +667,8 @@ std::tuple _scaled_dot_product_e Tensor grad_q, grad_k, grad_v, grad_bias; // This is needed because SaveVariable automatically converts - // c10::optional to undefined tensor - c10::optional kernel_bias; + // std::optional to undefined tensor + std::optional kernel_bias; if (attn_bias.defined()) { kernel_bias = attn_bias; } diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/flash_api.cpp b/aten/src/ATen/native/transformers/cuda/flash_attn/flash_api.cpp index 8f6f7a9f357dc..5c7db42368931 100644 --- a/aten/src/ATen/native/transformers/cuda/flash_attn/flash_api.cpp +++ b/aten/src/ATen/native/transformers/cuda/flash_attn/flash_api.cpp @@ -322,7 +322,7 @@ void set_params_splitkv(Flash_fwd_params ¶ms, const int batch_size, } } -void set_params_alibi(Flash_fwd_params ¶ms, c10::optional &alibi_slopes_, int batch_size, int num_heads){ +void set_params_alibi(Flash_fwd_params ¶ms, std::optional &alibi_slopes_, int batch_size, int num_heads){ #ifdef FLASHATTENTION_DISABLE_ALIBI TORCH_CHECK(!alibi_slopes_.has_value(), "This flash attention build does not support alibi."); params.alibi_slopes_ptr = nullptr; @@ -346,15 +346,15 @@ std::tuple &out_, // batch_size x seqlen_q x num_heads x head_size - c10::optional &alibi_slopes_, // num_heads or batch_size x num_heads + std::optional &out_, // batch_size x seqlen_q x num_heads x head_size + std::optional &alibi_slopes_, // num_heads or batch_size x num_heads const float p_dropout, const float softmax_scale, bool is_causal, int window_size_left, int window_size_right, const bool return_softmax, - c10::optional gen_) { + std::optional gen_) { auto dprops = at::cuda::getCurrentDeviceProperties(); // bool is_sm75 = dprops->major == 7 && dprops->minor == 5; @@ -532,11 +532,11 @@ std::tuple &out_, // total_q x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i + std::optional &out_, // total_q x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i const at::Tensor &cu_seqlens_q, // b+1 const at::Tensor &cu_seqlens_k, // b+1 - c10::optional &seqused_k, // b. If given, only this many elements of each batch element's keys are used. - c10::optional &alibi_slopes_, // num_heads or b x num_heads + std::optional &seqused_k, // b. If given, only this many elements of each batch element's keys are used. 
+ std::optional &alibi_slopes_, // num_heads or b x num_heads int max_seqlen_q, const int max_seqlen_k, const float p_dropout, @@ -546,7 +546,7 @@ mha_varlen_fwd(const at::Tensor &q, // total_q x num_heads x head_size, total_q int window_size_left, int window_size_right, const bool return_softmax, - c10::optional gen_) { + std::optional gen_) { auto dprops = at::cuda::getCurrentDeviceProperties(); // bool is_sm75 = dprops->major == 7 && dprops->minor == 5; @@ -765,10 +765,10 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si const at::Tensor &v, // batch_size x seqlen_k x num_heads_k x head_size const at::Tensor &out, // batch_size x seqlen_q x num_heads x head_size const at::Tensor &softmax_lse, // b x h x seqlen_q - c10::optional &dq_, // batch_size x seqlen_q x num_heads x head_size - c10::optional &dk_, // batch_size x seqlen_k x num_heads_k x head_size - c10::optional &dv_, // batch_size x seqlen_k x num_heads_k x head_size - c10::optional &alibi_slopes_, // num_heads or batch_size x num_heads + std::optional &dq_, // batch_size x seqlen_q x num_heads x head_size + std::optional &dk_, // batch_size x seqlen_k x num_heads_k x head_size + std::optional &dv_, // batch_size x seqlen_k x num_heads_k x head_size + std::optional &alibi_slopes_, // num_heads or batch_size x num_heads const float p_dropout, // probability to drop const float softmax_scale, const bool is_causal, @@ -976,12 +976,12 @@ mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size const at::Tensor &v, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i const at::Tensor &out, // total_q x num_heads x head_size const at::Tensor &softmax_lse, // b x h x s softmax logsumexp - c10::optional &dq_, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i - c10::optional &dk_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i - c10::optional &dv_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + std::optional &dq_, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i + std::optional &dk_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + std::optional &dv_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i const at::Tensor &cu_seqlens_q, // b+1 const at::Tensor &cu_seqlens_k, // b+1 - c10::optional &alibi_slopes_, // num_heads or b x num_heads + std::optional &alibi_slopes_, // num_heads or b x num_heads const int max_seqlen_q, const int max_seqlen_k, // max sequence length to choose the kernel const float p_dropout, // probability to drop @@ -1208,15 +1208,15 @@ std::tuple mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size const at::Tensor &kcache, // batch_size_c x seqlen_k x num_heads_k x head_size or num_blocks x page_block_size x num_heads_k x head_size if there's a block_table. const at::Tensor &vcache, // batch_size_c x seqlen_k x num_heads_k x head_size or num_blocks x page_block_size x num_heads_k x head_size if there's a block_table. 
- c10::optional &k_, // batch_size x seqlen_knew x num_heads_k x head_size - c10::optional &v_, // batch_size x seqlen_knew x num_heads_k x head_size - c10::optional &seqlens_k_, // batch_size - c10::optional &rotary_cos_, // seqlen_ro x (rotary_dim / 2) - c10::optional &rotary_sin_, // seqlen_ro x (rotary_dim / 2) - c10::optional &cache_batch_idx_, // indices to index into the KV cache - c10::optional &block_table_, // batch_size x max_num_blocks_per_seq - c10::optional &alibi_slopes_, // num_heads or batch_size x num_heads - c10::optional &out_, // batch_size x seqlen_q x num_heads x head_size + std::optional &k_, // batch_size x seqlen_knew x num_heads_k x head_size + std::optional &v_, // batch_size x seqlen_knew x num_heads_k x head_size + std::optional &seqlens_k_, // batch_size + std::optional &rotary_cos_, // seqlen_ro x (rotary_dim / 2) + std::optional &rotary_sin_, // seqlen_ro x (rotary_dim / 2) + std::optional &cache_batch_idx_, // indices to index into the KV cache + std::optional &block_table_, // batch_size x max_num_blocks_per_seq + std::optional &alibi_slopes_, // num_heads or batch_size x num_heads + std::optional &out_, // batch_size x seqlen_q x num_heads x head_size const float softmax_scale, bool is_causal, int window_size_left, diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/flash_api.h b/aten/src/ATen/native/transformers/cuda/flash_attn/flash_api.h index 2745b28dca29b..a3aa8aaa7adff 100644 --- a/aten/src/ATen/native/transformers/cuda/flash_attn/flash_api.h +++ b/aten/src/ATen/native/transformers/cuda/flash_attn/flash_api.h @@ -11,25 +11,25 @@ std::tuple &out_, // batch_size x seqlen_q x num_heads x head_size - c10::optional &alibi_slopes_, // num_heads or batch_size x num_heads + std::optional &out_, // batch_size x seqlen_q x num_heads x head_size + std::optional &alibi_slopes_, // num_heads or batch_size x num_heads const float p_dropout, const float softmax_scale, bool is_causal, int window_size_left, int window_size_right, const bool return_softmax, - c10::optional gen_); + std::optional gen_); std::tuple mha_varlen_fwd(const at::Tensor &q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i const at::Tensor &k, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i const at::Tensor &v, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i - c10::optional &out_, // total_q x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i + std::optional &out_, // total_q x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i const at::Tensor &cu_seqlens_q, // b+1 const at::Tensor &cu_seqlens_k, // b+1 - c10::optional &seqused_k, // b. If given, only this many elements of each batch element's keys are used. - c10::optional &alibi_slopes_, // num_heads or b x num_heads + std::optional &seqused_k, // b. If given, only this many elements of each batch element's keys are used. 
+ std::optional &alibi_slopes_, // num_heads or b x num_heads int max_seqlen_q, const int max_seqlen_k, const float p_dropout, @@ -39,7 +39,7 @@ mha_varlen_fwd(const at::Tensor &q, // total_q x num_heads x head_size, total_q int window_size_left, int window_size_right, const bool return_softmax, - c10::optional gen_); + std::optional gen_); std::tuple @@ -49,10 +49,10 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si const at::Tensor &v, // batch_size x seqlen_k x num_heads_k x head_size const at::Tensor &out, // batch_size x seqlen_q x num_heads x head_size const at::Tensor &softmax_lse, // b x h x seqlen_q - c10::optional &dq_, // batch_size x seqlen_q x num_heads x head_size - c10::optional &dk_, // batch_size x seqlen_k x num_heads_k x head_size - c10::optional &dv_, // batch_size x seqlen_k x num_heads_k x head_size - c10::optional &alibi_slopes_, // num_heads or batch_size x num_heads + std::optional &dq_, // batch_size x seqlen_q x num_heads x head_size + std::optional &dk_, // batch_size x seqlen_k x num_heads_k x head_size + std::optional &dv_, // batch_size x seqlen_k x num_heads_k x head_size + std::optional &alibi_slopes_, // num_heads or batch_size x num_heads const float p_dropout, // probability to drop const float softmax_scale, const bool is_causal, @@ -69,12 +69,12 @@ mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size const at::Tensor &v, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i const at::Tensor &out, // total_q x num_heads x head_size const at::Tensor &softmax_lse, // b x h x s softmax logsumexp - c10::optional &dq_, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i - c10::optional &dk_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i - c10::optional &dv_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + std::optional &dq_, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i + std::optional &dk_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + std::optional &dv_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i const at::Tensor &cu_seqlens_q, // b+1 const at::Tensor &cu_seqlens_k, // b+1 - c10::optional &alibi_slopes_, // num_heads or b x num_heads + std::optional &alibi_slopes_, // num_heads or b x num_heads const int max_seqlen_q, const int max_seqlen_k, // max sequence length to choose the kernel const float p_dropout, // probability to drop diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp index 0debd93bf1d4f..d25c168fcbe88 100644 --- a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp +++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp @@ -491,6 +491,18 @@ bool check_runtime_enabled_cudnn(sdp_params const& params, bool debug) { return true; } +bool check_runtime_disabled_cudnn(sdp_params const& params, bool debug) { + // We check the global context to see if user has explicitly turned of cudnn + // sdp kernels + if (!at::globalContext().userEnabledCuDNNSDP()) { + if (debug) { + TORCH_WARN("CuDNN attention has been runtime disabled."); + } + return false; + } + return true; +} + bool check_cudnn_requires_grad(sdp_params const& params, bool debug) { // Check that the input is causal if (input_requires_grad(params)) { @@ -511,6 +523,7 @@ bool can_use_cudnn_attention(const sdp_params& params, bool debug) { constexpr auto general_constraints = array_of( check_runtime_enabled_cudnn, + 
check_runtime_disabled_cudnn, check_cudnn_hardware_support, check_all_tensors_on_device, check_cudnn_tensor_shapes, diff --git a/aten/src/ATen/native/transformers/sdp_utils_cpp.h b/aten/src/ATen/native/transformers/sdp_utils_cpp.h index 6e15a27fae542..7c56a1f617dbc 100644 --- a/aten/src/ATen/native/transformers/sdp_utils_cpp.h +++ b/aten/src/ATen/native/transformers/sdp_utils_cpp.h @@ -44,7 +44,7 @@ struct sdp_params { at::Tensor query; at::Tensor key; at::Tensor value; - c10::optional attn_mask; + std::optional attn_mask; double dropout; bool is_causal; }; @@ -53,7 +53,7 @@ SDPBackend select_sdp_backend_cpp(sdp_params const& kernel_params); inline c10::SymFloat calculate_scale( const at::Tensor& query, - c10::optional scale) { + std::optional scale) { const auto softmax_scale = scale.has_value() ? scale.value() : (c10::SymFloat(1.0) / (c10::SymFloat(query.sym_size(-1)).sqrt())); diff --git a/aten/src/ATen/native/transformers/transformer.cpp b/aten/src/ATen/native/transformers/transformer.cpp index 4f64c95b204b2..b551100555675 100644 --- a/aten/src/ATen/native/transformers/transformer.cpp +++ b/aten/src/ATen/native/transformers/transformer.cpp @@ -27,7 +27,7 @@ Tensor linear_for_ffn( const Tensor& bias, const Tensor& mat1, const Tensor& mat2, - c10::optional use_gelu) { + std::optional use_gelu) { if (mat1.is_nested()) { return NestedTensor_times_Tensor_plus_Tensor_addmm( bias, mat1, mat2.t(), 1, 1, use_gelu); @@ -91,8 +91,8 @@ Tensor transformer_encoder_layer_forward( const Tensor& ffn_bias_1, const Tensor& ffn_weight_2, const Tensor& ffn_bias_2, - const c10::optional& mask, - const c10::optional mask_type) { + const std::optional& mask, + const std::optional mask_type) { { const Tensor& check_for_empty = src.is_nested() ? get_nested_tensor_impl(src)->get_buffer() : src; if (check_for_empty.numel() == 0) { diff --git a/aten/src/ATen/native/utils/Factory.cpp b/aten/src/ATen/native/utils/Factory.cpp index ea6be4e017552..28ef6477e3335 100644 --- a/aten/src/ATen/native/utils/Factory.cpp +++ b/aten/src/ATen/native/utils/Factory.cpp @@ -12,7 +12,7 @@ Tensor empty_with_tail_padding( const IntArrayRef size, const caffe2::TypeMeta dtype, const c10::MemoryFormat memory_format, - c10::optional maybe_names) { + std::optional maybe_names) { auto* const allocator_ptr = c10::GetDefaultMobileCPUAllocator(); const int64_t nelements = c10::multiply_integers(size); size_t size_bytes = nelements * dtype.itemsize(); diff --git a/aten/src/ATen/native/utils/Factory.h b/aten/src/ATen/native/utils/Factory.h index bd153aaa67529..b0302417cdce0 100644 --- a/aten/src/ATen/native/utils/Factory.h +++ b/aten/src/ATen/native/utils/Factory.h @@ -17,7 +17,7 @@ at::Tensor empty_with_tail_padding( IntArrayRef size, const caffe2::TypeMeta dtype, c10::MemoryFormat memory_format, - c10::optional maybe_names); + std::optional maybe_names); } // namespace mobile } // namespace native diff --git a/aten/src/ATen/native/vulkan/ops/Batchnorm.cpp b/aten/src/ATen/native/vulkan/ops/Batchnorm.cpp index 3f583ddc3c4ae..e12e69c4ebec2 100644 --- a/aten/src/ATen/native/vulkan/ops/Batchnorm.cpp +++ b/aten/src/ATen/native/vulkan/ops/Batchnorm.cpp @@ -73,10 +73,10 @@ using namespace api::utils; Tensor batch_norm( const at::Tensor& input_arg, - const c10::optional& weight_opt /* optional */, - const c10::optional& bias_opt /* optional */, - const c10::optional& running_mean_opt /* optional */, - const c10::optional& running_var_opt /* optional */, + const std::optional& weight_opt /* optional */, + const std::optional& bias_opt /* optional */, + 
const std::optional& running_mean_opt /* optional */, + const std::optional& running_var_opt /* optional */, bool training, double /* momentum, not used in eval mode */, double eps, @@ -104,10 +104,10 @@ TORCH_LIBRARY_IMPL(aten, Vulkan, m) { } // namespace BatchNormPackedContext::BatchNormPackedContext( - const c10::optional& weight_opt, - const c10::optional& bias_opt, - const c10::optional& running_mean_opt, - const c10::optional& running_var_opt, + const std::optional& weight_opt, + const std::optional& bias_opt, + const std::optional& running_mean_opt, + const std::optional& running_var_opt, double eps) : unpacked_{c10::AnyType::get()} { packed_.reserve(ListArgs::kNumArgs); @@ -181,10 +181,10 @@ BatchNormPackedContext BatchNormPackedContext::pack( } c10::intrusive_ptr create_batchnorm_context( - c10::optional&& weight_opt, - c10::optional&& bias_opt, - c10::optional&& running_mean_opt, - c10::optional&& running_var_opt, + std::optional&& weight_opt, + std::optional&& bias_opt, + std::optional&& running_mean_opt, + std::optional&& running_var_opt, bool training, double /* momentum */, double eps, diff --git a/aten/src/ATen/native/vulkan/ops/Batchnorm.h b/aten/src/ATen/native/vulkan/ops/Batchnorm.h index 6afaeb6f243b3..4108b0d4e3201 100644 --- a/aten/src/ATen/native/vulkan/ops/Batchnorm.h +++ b/aten/src/ATen/native/vulkan/ops/Batchnorm.h @@ -18,10 +18,10 @@ class BatchNormPackedContext final : virtual public VulkanPackedContext, public: BatchNormPackedContext( - const c10::optional& weight_opt, - const c10::optional& bias_opt, - const c10::optional& running_mean_opt, - const c10::optional& running_var_opt, + const std::optional& weight_opt, + const std::optional& bias_opt, + const std::optional& running_mean_opt, + const std::optional& running_var_opt, double eps); /* @@ -47,10 +47,10 @@ class BatchNormPackedContext final : virtual public VulkanPackedContext, }; c10::intrusive_ptr create_batchnorm_context( - c10::optional&& weight_opt, - c10::optional&& bias_opt, - c10::optional&& running_mean_opt, - c10::optional&& running_var_opt, + std::optional&& weight_opt, + std::optional&& bias_opt, + std::optional&& running_mean_opt, + std::optional&& running_var_opt, bool training, double /* momentum */, double eps, diff --git a/aten/src/ATen/native/vulkan/ops/BinaryOp.cpp b/aten/src/ATen/native/vulkan/ops/BinaryOp.cpp index c08363a17f8eb..e1445f40ac5f8 100644 --- a/aten/src/ATen/native/vulkan/ops/BinaryOp.cpp +++ b/aten/src/ATen/native/vulkan/ops/BinaryOp.cpp @@ -15,7 +15,7 @@ using namespace api::utils; Tensor binary_op_scalar( const Tensor& self_arg, const Scalar& other, - const c10::optional& alpha_arg, + const std::optional& alpha_arg, const api::ShaderInfo& shader_descriptor) { api::Context* const context = api::context(); @@ -102,7 +102,7 @@ Tensor binary_op_preprocess_other_arg(const Tensor& other_arg) { Tensor& binary_op_scalar_( Tensor& self_arg, const Scalar& other, - const c10::optional& alpha_arg, + const std::optional& alpha_arg, const api::ShaderInfo& shader_descriptor) { TORCH_CHECK( self_arg.is_vulkan(), @@ -152,7 +152,7 @@ Tensor& binary_op_scalar_( Tensor binary_op_tensor( const Tensor& self_arg, const Tensor& other_arg, - const c10::optional& alpha_arg, + const std::optional& alpha_arg, const api::ShaderInfo& shader_descriptor) { utils::is_broadcastable(self_arg, other_arg); api::Context* const context = api::context(); @@ -313,7 +313,7 @@ Tensor quantized_binary_op_tensor( Tensor& binary_op_tensor_( Tensor& self_arg, const Tensor& other_arg, - const c10::optional& alpha_arg, 
+ const std::optional& alpha_arg, const api::ShaderInfo& shader_descriptor) { TORCH_CHECK( get_dim(self_arg) >= get_dim(other_arg) && @@ -389,12 +389,12 @@ Tensor add_scalar( const Scalar& other, const Scalar& alpha) { return binary_op_scalar( - self_arg, other, c10::optional(alpha), VK_KERNEL(add_scalar)); + self_arg, other, std::optional(alpha), VK_KERNEL(add_scalar)); } Tensor& add_scalar_(Tensor& self, const Scalar& other, const Scalar& alpha) { return binary_op_scalar_( - self, other, c10::optional(alpha), VK_KERNEL(add_scalar_inplace)); + self, other, std::optional(alpha), VK_KERNEL(add_scalar_inplace)); } Tensor quantized_add( @@ -438,7 +438,7 @@ Tensor add_tensor( const Tensor& other_arg, const Scalar& alpha) { return binary_op_tensor( - self_arg, other_arg, c10::optional(alpha), VK_KERNEL(add)); + self_arg, other_arg, std::optional(alpha), VK_KERNEL(add)); } Tensor& add_tensor_( @@ -446,7 +446,7 @@ Tensor& add_tensor_( const Tensor& other_arg, const Scalar& alpha) { return binary_op_tensor_( - self, other_arg, c10::optional(alpha), VK_KERNEL(add_inplace)); + self, other_arg, std::optional(alpha), VK_KERNEL(add_inplace)); } Tensor sub_scalar( @@ -456,7 +456,7 @@ Tensor sub_scalar( return binary_op_scalar( self_arg, other, - c10::optional(-1 * alpha.to()), + std::optional(-1 * alpha.to()), VK_KERNEL(add_scalar)); } @@ -464,7 +464,7 @@ Tensor& sub_scalar_(Tensor& self, const Scalar& other, const Scalar& alpha) { return binary_op_scalar_( self, other, - c10::optional(-1 * alpha.to()), + std::optional(-1 * alpha.to()), VK_KERNEL(add_scalar_inplace)); } @@ -473,7 +473,7 @@ Tensor sub_tensor( const Tensor& other_arg, const Scalar& alpha) { return binary_op_tensor( - self_arg, other_arg, c10::optional(alpha), VK_KERNEL(sub)); + self_arg, other_arg, std::optional(alpha), VK_KERNEL(sub)); } Tensor& sub_tensor_( @@ -481,34 +481,34 @@ Tensor& sub_tensor_( const Tensor& other_arg, const Scalar& alpha) { return binary_op_tensor_( - self, other_arg, c10::optional(alpha), VK_KERNEL(sub_inplace)); + self, other_arg, std::optional(alpha), VK_KERNEL(sub_inplace)); } Tensor mul_scalar(const Tensor& self_arg, const Scalar& other) { return binary_op_scalar( - self_arg, other, c10::optional(), VK_KERNEL(mul_scalar)); + self_arg, other, std::optional(), VK_KERNEL(mul_scalar)); } Tensor& mul_scalar_(Tensor& self, const Scalar& other) { return binary_op_scalar_( - self, other, c10::optional(), VK_KERNEL(mul_scalar_inplace)); + self, other, std::optional(), VK_KERNEL(mul_scalar_inplace)); } Tensor mul_tensor(const Tensor& self_arg, const Tensor& other_arg) { return binary_op_tensor( - self_arg, other_arg, c10::optional(), VK_KERNEL(mul)); + self_arg, other_arg, std::optional(), VK_KERNEL(mul)); } Tensor& mul_tensor_(Tensor& self, const Tensor& other_arg) { return binary_op_tensor_( - self, other_arg, c10::optional(), VK_KERNEL(mul_inplace)); + self, other_arg, std::optional(), VK_KERNEL(mul_inplace)); } Tensor div_scalar(const Tensor& self_arg, const Scalar& other) { return binary_op_scalar( self_arg, 1.0 / other.to(), - c10::optional(), + std::optional(), VK_KERNEL(mul_scalar)); } @@ -516,45 +516,45 @@ Tensor& div_scalar_(Tensor& self, const Scalar& other) { return binary_op_scalar_( self, 1.0 / other.to(), - c10::optional(), + std::optional(), VK_KERNEL(mul_scalar_inplace)); } Tensor div_tensor(const Tensor& self_arg, const Tensor& other_arg) { return binary_op_tensor( - self_arg, other_arg, c10::optional(), VK_KERNEL(div)); + self_arg, other_arg, std::optional(), VK_KERNEL(div)); } Tensor& 
div_tensor_(Tensor& self, const Tensor& other_arg) { return binary_op_tensor_( - self, other_arg, c10::optional(), VK_KERNEL(div_inplace)); + self, other_arg, std::optional(), VK_KERNEL(div_inplace)); } Tensor pow(const Tensor& self, const Tensor& other) { - return binary_op_tensor(self, other, c10::optional(), VK_KERNEL(pow)); + return binary_op_tensor(self, other, std::optional(), VK_KERNEL(pow)); } Tensor& pow_(Tensor& self, const Tensor& other) { return binary_op_tensor_( - self, other, c10::optional(), VK_KERNEL(pow_inplace)); + self, other, std::optional(), VK_KERNEL(pow_inplace)); } Tensor pow_tensor_scalar(const Tensor& self, const Scalar& other) { return binary_op_scalar( - self, other, c10::optional(), VK_KERNEL(pow_tensor_scalar)); + self, other, std::optional(), VK_KERNEL(pow_tensor_scalar)); } Tensor& pow_tensor_scalar_(Tensor& self, const Scalar& other) { return binary_op_scalar_( self, other, - c10::optional(), + std::optional(), VK_KERNEL(pow_tensor_scalar_inplace)); } Tensor pow_scalar_tensor(const Scalar& self, const Tensor& other) { return binary_op_scalar( - other, self, c10::optional(), VK_KERNEL(pow_scalar_tensor)); + other, self, std::optional(), VK_KERNEL(pow_scalar_tensor)); } Tensor floor_divide_scalar(const Tensor& self, const Scalar& other) { @@ -563,7 +563,7 @@ Tensor floor_divide_scalar(const Tensor& self, const Scalar& other) { return binary_op_scalar( self, 1.0 / other.to(), - c10::optional(), + std::optional(), VK_KERNEL(floor_mul_scalar)); } @@ -573,20 +573,20 @@ Tensor& floor_divide_scalar_(Tensor& self, const Scalar& other) { return binary_op_scalar_( self, 1.0 / other.to(), - c10::optional(), + std::optional(), VK_KERNEL(floor_mul_scalar_inplace)); } Tensor floor_divide_tensor(const Tensor& self, const Tensor& other) { return binary_op_tensor( - self, other, c10::optional(), VK_KERNEL(floor_divide)); + self, other, std::optional(), VK_KERNEL(floor_divide)); } Tensor& floor_divide_tensor_(Tensor& self, const Tensor& other_arg) { return binary_op_tensor_( self, other_arg, - c10::optional(), + std::optional(), VK_KERNEL(floor_divide_inplace)); } diff --git a/aten/src/ATen/native/vulkan/ops/Clamp.cpp b/aten/src/ATen/native/vulkan/ops/Clamp.cpp index 3cc4dd3d3c4bc..e336b01323666 100644 --- a/aten/src/ATen/native/vulkan/ops/Clamp.cpp +++ b/aten/src/ATen/native/vulkan/ops/Clamp.cpp @@ -11,8 +11,8 @@ using namespace api::utils; Tensor _clamp( const Tensor& self_arg, - const c10::optional& min, - const c10::optional& max, + const std::optional& min, + const std::optional& max, const api::ShaderInfo& shader_descriptor) { TORCH_CHECK(min || max, "At least one of 'min' or 'max' must not be None"); @@ -96,15 +96,15 @@ Tensor _clamp( Tensor clamp( const Tensor& self_arg, - const c10::optional& min, - const c10::optional& max) { + const std::optional& min, + const std::optional& max) { return _clamp(self_arg, min, max, VK_KERNEL(clamp)); } Tensor& _clamp_( Tensor& self_arg, - const c10::optional& min, - const c10::optional& max, + const std::optional& min, + const std::optional& max, const api::ShaderInfo& shader_descriptor) { TORCH_CHECK(min || max, "At least one of 'min' or 'max' must not be None"); @@ -186,8 +186,8 @@ Tensor threshold( Tensor& clamp_( Tensor& self, - const c10::optional& min, - const c10::optional& max) { + const std::optional& min, + const std::optional& max) { return _clamp_(self, min, max, VK_KERNEL(clamp_)); } diff --git a/aten/src/ATen/native/vulkan/ops/Clone.cpp b/aten/src/ATen/native/vulkan/ops/Clone.cpp index 2601d785ddb52..3e9e611717257 
100644 --- a/aten/src/ATen/native/vulkan/ops/Clone.cpp +++ b/aten/src/ATen/native/vulkan/ops/Clone.cpp @@ -16,7 +16,7 @@ namespace { Tensor clone( const Tensor& src, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { auto memory_format = optional_memory_format.value_or(MemoryFormat::Preserve); TORCH_CHECK( (c10::MemoryFormat::Preserve == memory_format) || diff --git a/aten/src/ATen/native/vulkan/ops/Common.h b/aten/src/ATen/native/vulkan/ops/Common.h index 83cb45b163a2a..c74483f793c52 100644 --- a/aten/src/ATen/native/vulkan/ops/Common.h +++ b/aten/src/ATen/native/vulkan/ops/Common.h @@ -76,18 +76,18 @@ uint32_t get_dim(const vTensor& v_in) { return get_dim(v_in.sizes()); } -inline c10::optional get_optional_tensor( +inline std::optional get_optional_tensor( const c10::impl::GenericList& gen_list, const uint32_t idx) { return gen_list.get(idx).isTensor() ? gen_list.get(idx).toTensor() - : c10::optional(); + : std::optional(); } -inline c10::optional get_optional_scalar( +inline std::optional get_optional_scalar( const c10::impl::GenericList& gen_list, const uint32_t idx) { return gen_list.get(idx).isScalar() ? gen_list.get(idx).toScalar() - : c10::optional(); + : std::optional(); } inline float roundevenf(float v) { diff --git a/aten/src/ATen/native/vulkan/ops/Convolution.cpp b/aten/src/ATen/native/vulkan/ops/Convolution.cpp index 01dccac003011..f210c253800b1 100644 --- a/aten/src/ATen/native/vulkan/ops/Convolution.cpp +++ b/aten/src/ATen/native/vulkan/ops/Convolution.cpp @@ -245,7 +245,7 @@ at::Tensor rearrange_weights_2d(const Tensor& weight_in, bool tconv) { * taking each texel and arranging them along the x axis. */ at::Tensor rearrange_bias( - const c10::optional& bias_in, + const std::optional& bias_in, const at::Tensor& weight_in, bool tconv) { // If optional is empty, just return zeros @@ -543,7 +543,7 @@ vTensor pack_weights( } vTensor pack_biases( - const c10::optional& bias, + const std::optional& bias, const Tensor& weight, const bool transposed, const bool quantized) { @@ -629,7 +629,7 @@ bool weight_valid(const Tensor& weight, const bool quantized) { } bool bias_valid( - const c10::optional& bias, + const std::optional& bias, const Tensor& weight, const bool transposed, const bool quantized) { @@ -656,7 +656,7 @@ bool bias_valid( bool available( const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const IntArrayRef stride, const IntArrayRef padding, const IntArrayRef dilation, @@ -664,8 +664,8 @@ bool available( const bool quantized, const IntArrayRef /* output_padding */, const int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max) { + const std::optional& output_min, + const std::optional& output_max) { if (!weight_valid(weight, quantized)) { return false; } @@ -765,7 +765,7 @@ static inline std::vector get_conv_transpose_output_size( Tensor convolution( const Tensor& input, const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const IntArrayRef stride, const IntArrayRef padding, const IntArrayRef dilation, @@ -790,7 +790,7 @@ Tensor convolution( Tensor quantized_convolution( const Tensor& input, const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const IntArrayRef stride, const IntArrayRef padding, const IntArrayRef dilation, @@ -865,7 +865,7 @@ vTensor pack_weights_using_width_packing(const Tensor& weight_arg) { Tensor run_conv1d_context_impl( const Tensor& input_arg, const Tensor& weight_arg, - const c10::optional& bias_arg_opt, 
+ const std::optional& bias_arg_opt, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, @@ -962,7 +962,7 @@ Tensor run_conv1d_context_impl( Conv2dPackedContext::Conv2dPackedContext( const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const IntArrayRef stride_arg, const IntArrayRef padding_arg, const IntArrayRef dilation_arg, @@ -970,8 +970,8 @@ Conv2dPackedContext::Conv2dPackedContext( const bool quantized, const IntArrayRef output_padding_arg, const int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max) + const std::optional& output_min, + const std::optional& output_max) : unpacked_{c10::AnyType::get()} { const auto stride = expand_param_if_needed(stride_arg, "stride", 2); const auto padding = expand_param_if_needed(padding_arg, "padding", 2); @@ -1058,13 +1058,13 @@ Conv2dPackedContext Conv2dPackedContext::pack(c10::impl::GenericList unpacked) { c10::intrusive_ptr create_conv2d_context( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& stride, std::vector&& padding, std::vector&& dilation, const int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max) { + const std::optional& output_min, + const std::optional& output_max) { return c10::make_intrusive(Conv2dPackedContext( weight, bias, @@ -1081,14 +1081,14 @@ c10::intrusive_ptr create_conv2d_context( c10::intrusive_ptr create_tconv2d_context( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& stride, std::vector&& padding, std::vector&& output_padding, std::vector&& dilation, const int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max) { + const std::optional& output_min, + const std::optional& output_max) { return c10::make_intrusive(Conv2dPackedContext( weight, bias, @@ -1105,13 +1105,13 @@ c10::intrusive_ptr create_tconv2d_context( c10::intrusive_ptr create_qconv2d_context( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& stride, std::vector&& padding, std::vector&& dilation, const int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max) { + const std::optional& output_min, + const std::optional& output_max) { return c10::make_intrusive(Conv2dPackedContext( weight, bias, @@ -1128,14 +1128,14 @@ c10::intrusive_ptr create_qconv2d_context( c10::intrusive_ptr create_qtconv2d_context( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& stride, std::vector&& padding, std::vector&& output_padding, std::vector&& dilation, const int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max) { + const std::optional& output_min, + const std::optional& output_max) { return c10::make_intrusive(Conv2dPackedContext( weight, bias, @@ -1294,7 +1294,7 @@ Tensor run_qconv2d_context( Tensor quantized_conv2d( const Tensor& input, const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, @@ -1321,15 +1321,15 @@ Conv2dOpContext::Conv2dOpContext(Conv2dPackedContext conv_context) Conv2dOpContext Conv2dOpContext::create( const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const IntArrayRef stride_arg, const IntArrayRef padding_arg, const IntArrayRef dilation_arg, const bool transposed, const IntArrayRef output_padding_arg, const int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max) { + const std::optional& output_min, + const 
std::optional& output_max) { return Conv2dOpContext{Conv2dPackedContext( weight, bias, @@ -1367,13 +1367,13 @@ Conv2dOpContext::State Conv2dOpContext::unpack() const { c10::intrusive_ptr conv2d_clamp_prepack( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& stride, std::vector&& padding, std::vector&& dilation, const int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max) { + const std::optional& output_min, + const std::optional& output_max) { return c10::make_intrusive(Conv2dOpContext::create( std::move(weight), std::move(bias), @@ -1395,7 +1395,7 @@ Tensor conv2d_clamp_run( Conv1dPackedContext::Conv1dPackedContext( const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const IntArrayRef stride_arg, const IntArrayRef padding_arg, const IntArrayRef dilation_arg, @@ -1435,7 +1435,7 @@ Conv1dPackedContext Conv1dPackedContext::pack(c10::impl::GenericList unpacked) { c10::intrusive_ptr create_conv1d_context( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& stride, std::vector&& padding, std::vector&& dilation, @@ -1447,7 +1447,7 @@ c10::intrusive_ptr create_conv1d_context( Tensor convolution1d( const Tensor& input, const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const IntArrayRef stride, const IntArrayRef padding, const IntArrayRef dilation, @@ -1464,7 +1464,7 @@ Tensor run_conv1d_context( const c10::intrusive_ptr& context) { const Tensor weight = context->get_val(Conv1dPackedContext::Packed::Weight).toTensor(); - const c10::optional& bias_opt = + const std::optional& bias_opt = context->get_val(Conv1dPackedContext::Packed::Bias).toTensor(); const auto stride = context->get_val(Conv1dPackedContext::Packed::Stride).toIntVector(); diff --git a/aten/src/ATen/native/vulkan/ops/Convolution.h b/aten/src/ATen/native/vulkan/ops/Convolution.h index 1d51190b8cab5..84ace9526bbfc 100644 --- a/aten/src/ATen/native/vulkan/ops/Convolution.h +++ b/aten/src/ATen/native/vulkan/ops/Convolution.h @@ -21,7 +21,7 @@ namespace conv2d { Tensor rearrange_weights_dw(const Tensor& weight_in); Tensor rearrange_weights_2d(const Tensor& weight_in, bool tconv); Tensor rearrange_bias( - const c10::optional& bias_in, + const std::optional& bias_in, const at::Tensor& weight_in, bool tconv); @@ -60,7 +60,7 @@ class Conv2dPackedContext final : virtual public VulkanPackedContext, public: Conv2dPackedContext( const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const IntArrayRef stride_arg, const IntArrayRef padding_arg, const IntArrayRef dilation_arg, @@ -68,8 +68,8 @@ class Conv2dPackedContext final : virtual public VulkanPackedContext, const bool quantized, const IntArrayRef output_padding_arg, const int64_t groups, - const c10::optional& output_min = c10::nullopt, - const c10::optional& output_max = c10::nullopt); + const std::optional& output_min = c10::nullopt, + const std::optional& output_max = c10::nullopt); /* * Assigns a name to each index in the unpacked list. 
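
Note on the Convolution hunks above: parameters retyped as std::optional keep c10::nullopt as their default argument (for example output_min/output_max on Conv2dPackedContext and the create_*conv2d_context helpers), which builds only because the two optional spellings interoperate in this codebase. The snippet below is a minimal sketch of that optional clamp-bound convention, using plain float in place of at::Scalar; it is an illustration, not the Vulkan implementation.

#include <algorithm>
#include <iostream>
#include <optional>

// Minimal sketch, not PyTorch code: an absent bound means "do not clamp on
// that side", mirroring the fused conv+clamp contexts in the diff above.
float clamp_output(float v,
                   const std::optional<float>& output_min = std::nullopt,
                   const std::optional<float>& output_max = std::nullopt) {
  if (output_min.has_value()) v = std::max(v, *output_min);
  if (output_max.has_value()) v = std::min(v, *output_max);
  return v;
}

int main() {
  std::cout << clamp_output(7.5f) << '\n';              // 7.5 (no bounds set)
  std::cout << clamp_output(7.5f, 0.0f, 6.0f) << '\n';  // 6   (ReLU6-style bounds)
}
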
@@ -127,13 +127,13 @@ class Conv2dPackedContext final : virtual public VulkanPackedContext, c10::intrusive_ptr create_conv2d_context( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& stride, std::vector&& padding, std::vector&& dilation, const int64_t groups, - const c10::optional& output_min = c10::nullopt, - const c10::optional& output_max = c10::nullopt); + const std::optional& output_min = c10::nullopt, + const std::optional& output_max = c10::nullopt); Tensor run_conv2d_context( const Tensor& input, @@ -141,14 +141,14 @@ Tensor run_conv2d_context( c10::intrusive_ptr create_tconv2d_context( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& stride, std::vector&& padding, std::vector&& output_padding, std::vector&& dilation, const int64_t groups, - const c10::optional& output_min = c10::nullopt, - const c10::optional& output_max = c10::nullopt); + const std::optional& output_min = c10::nullopt, + const std::optional& output_max = c10::nullopt); Tensor run_tconv2d_context( const Tensor& input, @@ -156,13 +156,13 @@ Tensor run_tconv2d_context( c10::intrusive_ptr create_qconv2d_context( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& stride, std::vector&& padding, std::vector&& dilation, const int64_t groups, - const c10::optional& output_min = c10::nullopt, - const c10::optional& output_max = c10::nullopt); + const std::optional& output_min = c10::nullopt, + const std::optional& output_max = c10::nullopt); Tensor run_qconv2d_context( const Tensor& input_arg, @@ -172,39 +172,39 @@ Tensor run_qconv2d_context( c10::intrusive_ptr create_qtconv2d_context( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& stride, std::vector&& padding, std::vector&& output_padding, std::vector&& dilation, const int64_t groups, - const c10::optional& output_min = c10::nullopt, - const c10::optional& output_max = c10::nullopt); + const std::optional& output_min = c10::nullopt, + const std::optional& output_max = c10::nullopt); // Backwards compatibility class Conv2dOpContext final : public torch::jit::CustomClassHolder { public: static Conv2dOpContext create( const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool transposed, IntArrayRef output_padding, int64_t groups, - const c10::optional& output_min = c10::nullopt, - const c10::optional& output_max = c10::nullopt); + const std::optional& output_min = c10::nullopt, + const std::optional& output_max = c10::nullopt); using State = std::tuple< Tensor, - c10::optional, + std::optional, std::vector, std::vector, std::vector, int64_t, - c10::optional, - c10::optional>; + std::optional, + std::optional>; Tensor run(const Tensor& input) const; State unpack() const; @@ -220,13 +220,13 @@ Tensor conv2d_clamp_run( c10::intrusive_ptr conv2d_clamp_prepack( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& stride, std::vector&& padding, std::vector&& dilation, const int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max); + const std::optional& output_min, + const std::optional& output_max); class Conv1dPackedContext final : virtual public VulkanPackedContext, public torch::jit::CustomClassHolder { @@ -237,7 +237,7 @@ class Conv1dPackedContext final : virtual public VulkanPackedContext, public: Conv1dPackedContext( const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const IntArrayRef 
stride_arg, const IntArrayRef padding_arg, const IntArrayRef dilation_arg, @@ -287,7 +287,7 @@ class Conv1dPackedContext final : virtual public VulkanPackedContext, c10::intrusive_ptr create_conv1d_context( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& stride, std::vector&& padding, std::vector&& dilation, diff --git a/aten/src/ATen/native/vulkan/ops/Factory.cpp b/aten/src/ATen/native/vulkan/ops/Factory.cpp index b746868c238fd..afe82caed8f19 100644 --- a/aten/src/ATen/native/vulkan/ops/Factory.cpp +++ b/aten/src/ATen/native/vulkan/ops/Factory.cpp @@ -8,10 +8,10 @@ namespace ops { Tensor _empty_affine_quantized( const IntArrayRef sizes, - const c10::optional dtype, - const c10::optional layout, - const c10::optional device, - const c10::optional pin_memory, + const std::optional dtype, + const std::optional layout, + const std::optional device, + const std::optional pin_memory, const double scale, const int64_t zero_point, const optional memory_format) { @@ -30,10 +30,10 @@ Tensor _empty_affine_quantized( Tensor empty_memory_format( const IntArrayRef sizes, - const c10::optional dtype, - const c10::optional layout, - const c10::optional device, - const c10::optional pin_memory, + const std::optional dtype, + const std::optional layout, + const std::optional device, + const std::optional pin_memory, const optional memory_format) { api::StorageType storage_type = api::StorageType::TEXTURE_3D; return convert(vTensor{ diff --git a/aten/src/ATen/native/vulkan/ops/Factory.h b/aten/src/ATen/native/vulkan/ops/Factory.h index 9dee6307bb85c..9839ba2d84319 100644 --- a/aten/src/ATen/native/vulkan/ops/Factory.h +++ b/aten/src/ATen/native/vulkan/ops/Factory.h @@ -7,10 +7,10 @@ namespace ops { Tensor _empty_affine_quantized( const IntArrayRef sizes, - const c10::optional dtype, - const c10::optional layout, - const c10::optional device, - const c10::optional pin_memory, + const std::optional dtype, + const std::optional layout, + const std::optional device, + const std::optional pin_memory, const double scale, const int64_t zero_point, const optional memory_format); diff --git a/aten/src/ATen/native/vulkan/ops/Layernorm.cpp b/aten/src/ATen/native/vulkan/ops/Layernorm.cpp index cdca77f95fcaf..6b6a4b866c700 100644 --- a/aten/src/ATen/native/vulkan/ops/Layernorm.cpp +++ b/aten/src/ATen/native/vulkan/ops/Layernorm.cpp @@ -19,8 +19,8 @@ namespace vulkan { namespace ops { LayernormPackedContext::LayernormPackedContext( - const c10::optional& weight, - const c10::optional& bias, + const std::optional& weight, + const std::optional& bias, double eps) : unpacked_{c10::AnyType::get()} { packed_.reserve(ListArgs::kNumArgs); @@ -48,8 +48,8 @@ LayernormPackedContext LayernormPackedContext::pack( } c10::intrusive_ptr create_layernorm_context( - c10::optional&& weight, - c10::optional&& bias, + std::optional&& weight, + std::optional&& bias, double eps) { return c10::make_intrusive( LayernormPackedContext(weight, bias, eps)); @@ -61,10 +61,10 @@ Tensor run_layernorm_context( const c10::intrusive_ptr& layernorm_context) { const Tensor input = input_arg.is_vulkan() ? 
input_arg : input_arg.vulkan(); - const c10::optional& weight_opt = + const std::optional& weight_opt = layernorm_context->get_val(LayernormPackedContext::ListArgs::kWeight) .toTensor(); - const c10::optional& bias_opt = + const std::optional& bias_opt = layernorm_context->get_val(LayernormPackedContext::ListArgs::kBias) .toTensor(); const float eps = api::utils::safe_downcast( @@ -81,8 +81,8 @@ Tensor run_layernorm_context( Tensor layer_norm( const at::Tensor& input_arg, IntArrayRef normalized_shape, - const c10::optional& weight_opt /* optional */, - const c10::optional& bias_opt /* optional */, + const std::optional& weight_opt /* optional */, + const std::optional& bias_opt /* optional */, double eps, bool /* cudnn_enable, deprecated */) { return run_layernorm_context( diff --git a/aten/src/ATen/native/vulkan/ops/Layernorm.h b/aten/src/ATen/native/vulkan/ops/Layernorm.h index 39518bf63bc9f..881fd6ba9b36c 100644 --- a/aten/src/ATen/native/vulkan/ops/Layernorm.h +++ b/aten/src/ATen/native/vulkan/ops/Layernorm.h @@ -18,8 +18,8 @@ class LayernormPackedContext final : virtual public VulkanPackedContext, public: LayernormPackedContext( - const c10::optional& weight, - const c10::optional& bias, + const std::optional& weight, + const std::optional& bias, double eps); /* @@ -43,8 +43,8 @@ class LayernormPackedContext final : virtual public VulkanPackedContext, }; c10::intrusive_ptr create_layernorm_context( - c10::optional&& weight, - c10::optional&& bias, + std::optional&& weight, + std::optional&& bias, double eps); Tensor run_layernorm_context( diff --git a/aten/src/ATen/native/vulkan/ops/Mm.cpp b/aten/src/ATen/native/vulkan/ops/Mm.cpp index e5893e8172875..c4f4d6d0a6342 100644 --- a/aten/src/ATen/native/vulkan/ops/Mm.cpp +++ b/aten/src/ATen/native/vulkan/ops/Mm.cpp @@ -149,7 +149,7 @@ vTensor pack_weights(const Tensor& weight_arg, const bool use_batch = false) { vTensor pack_biases( const Tensor& weight_arg, - const c10::optional& bias_arg, + const std::optional& bias_arg, const bool use_batch = false) { if (bias_arg) { Tensor bias = *bias_arg; @@ -166,7 +166,7 @@ vTensor pack_biases( // removed in the future. 
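
The Vulkan pack_biases / rearrange_bias hunks above take the bias as an optional tensor and synthesize a value when it is absent (the earlier comment reads "If optional is empty, just return zeros"). Below is a small sketch of that convention with std::vector<float> standing in for the packed bias; the real packing logic is considerably more involved.

#include <cstddef>
#include <iostream>
#include <optional>
#include <vector>

// Sketch only (stand-in types, not the Vulkan implementation): use the bias
// when the caller provides one, otherwise fall back to zeros of the right size.
std::vector<float> pack_biases_sketch(
    const std::optional<std::vector<float>>& bias_arg,
    std::size_t out_channels) {
  if (bias_arg.has_value()) {
    return *bias_arg;                              // caller-provided bias
  }
  return std::vector<float>(out_channels, 0.0f);   // "just return zeros"
}

int main() {
  std::cout << pack_biases_sketch(std::nullopt, 4).size() << '\n';                 // 4
  std::cout << pack_biases_sketch(std::vector<float>{1.f, 2.f}, 4).at(1) << '\n';  // 2
}
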
vTensor pack_biases_quantized_weights( const Tensor& weight_arg, - const c10::optional& bias_arg, + const std::optional& bias_arg, const bool use_batch = false) { TORCH_CHECK( weight_arg.is_quantized(), @@ -291,7 +291,7 @@ vTensor pack_biases_quantized_weights( bool available_check_with_batch( const Tensor& weight, - const c10::optional& bias) { + const std::optional& bias) { const bool weight_available = (3 == weight.ndimension()) && (weight.size(Layout::BatchMatrices::batch) > 0) && (weight.size(Layout::BatchMatrices::height) > 0) && @@ -345,7 +345,7 @@ bool available_check_with_batch( bool available( const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const bool use_batch = false) { if (!api::available()) { return false; @@ -897,7 +897,7 @@ Tensor mm(const Tensor& mat1_arg, const Tensor& mat2_arg) { 1.0f, 1.0f, c10::make_intrusive( - LinearPackedContext(mat2_arg, c10::optional())), + LinearPackedContext(mat2_arg, std::optional())), false, 0, 0); @@ -909,7 +909,7 @@ Tensor bmm(const Tensor& mat1_arg, const Tensor& mat2_arg) { 1.0f, 1.0f, c10::make_intrusive(LinearPackedContext( - mat2_arg, c10::optional(), true /*use batch*/))); + mat2_arg, std::optional(), true /*use batch*/))); } Tensor baddbmm( @@ -941,7 +941,7 @@ TORCH_LIBRARY_IMPL(aten, Vulkan, m) { LinearPackedContext::LinearPackedContext( const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const bool use_batch) : unpacked_{c10::AnyType::get()} { TORCH_CHECK( @@ -974,7 +974,7 @@ LinearPackedContext LinearPackedContext::pack(c10::impl::GenericList unpacked) { c10::intrusive_ptr create_linear_context( Tensor&& weight, - c10::optional&& bias) { + std::optional&& bias) { return c10::make_intrusive( LinearPackedContext(weight, bias)); } diff --git a/aten/src/ATen/native/vulkan/ops/Mm.h b/aten/src/ATen/native/vulkan/ops/Mm.h index b4fcb31bc315c..99862913a65a0 100644 --- a/aten/src/ATen/native/vulkan/ops/Mm.h +++ b/aten/src/ATen/native/vulkan/ops/Mm.h @@ -61,7 +61,7 @@ class LinearPackedContext final : virtual public VulkanPackedContext, public: LinearPackedContext( const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const bool use_batch = false); /* @@ -97,7 +97,7 @@ class LinearPackedContext final : virtual public VulkanPackedContext, c10::intrusive_ptr create_linear_context( Tensor&& weight, - c10::optional&& bias); + std::optional&& bias); Tensor run_linear_context( const Tensor& input, diff --git a/aten/src/ATen/native/vulkan/ops/NativeLayerNorm.cpp b/aten/src/ATen/native/vulkan/ops/NativeLayerNorm.cpp index ffeb8c27c52b5..94d155cc2f647 100644 --- a/aten/src/ATen/native/vulkan/ops/NativeLayerNorm.cpp +++ b/aten/src/ATen/native/vulkan/ops/NativeLayerNorm.cpp @@ -12,8 +12,8 @@ using namespace api::utils; void _check_layer_norm_inputs( const at::Tensor& input, IntArrayRef normalized_shape, - const c10::optional& weight /* optional */, - const c10::optional& bias /* optional */) { + const std::optional& weight /* optional */, + const std::optional& bias /* optional */) { const auto normalized_ndim = normalized_shape.size(); TORCH_CHECK( normalized_ndim >= 1, @@ -55,8 +55,8 @@ void _check_layer_norm_inputs( std::tuple native_layer_norm( const at::Tensor& input_arg, IntArrayRef normalized_shape, - const c10::optional& weight_opt /* optional */, - const c10::optional& bias_opt /* optional */, + const std::optional& weight_opt /* optional */, + const std::optional& bias_opt /* optional */, double eps) { _check_layer_norm_inputs(input_arg, normalized_shape, 
weight_opt, bias_opt); diff --git a/aten/src/ATen/native/vulkan/ops/Pool.cpp b/aten/src/ATen/native/vulkan/ops/Pool.cpp index fab4f05b4a98b..8730cf660a43b 100644 --- a/aten/src/ATen/native/vulkan/ops/Pool.cpp +++ b/aten/src/ATen/native/vulkan/ops/Pool.cpp @@ -232,7 +232,7 @@ Tensor avg_pool2d( const IntArrayRef padding_arg, const bool ceil_mode, const bool /* count_include_pad */, - const c10::optional /* divisor_override */) { + const std::optional /* divisor_override */) { return pool2d( self_arg, kernel_arg, diff --git a/aten/src/ATen/native/vulkan/ops/QuantizedFunctions.h b/aten/src/ATen/native/vulkan/ops/QuantizedFunctions.h index b22a3aa05b819..d72ad00321043 100644 --- a/aten/src/ATen/native/vulkan/ops/QuantizedFunctions.h +++ b/aten/src/ATen/native/vulkan/ops/QuantizedFunctions.h @@ -52,7 +52,7 @@ Tensor quantized_div( Tensor quantized_conv2d( const Tensor& input_, const Tensor& weight, - const c10::optional& bias_opt, + const std::optional& bias_opt, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, @@ -63,8 +63,8 @@ Tensor quantized_conv2d( Tensor quantized_upsample_nearest2d( const Tensor& input_arg, const IntArrayRef output_sizes, - const c10::optional scales_h, - const c10::optional scales_w); + const std::optional scales_h, + const std::optional scales_w); } // namespace ops } // namespace vulkan diff --git a/aten/src/ATen/native/vulkan/ops/Random.cpp b/aten/src/ATen/native/vulkan/ops/Random.cpp index c266b10417039..3103f7fe6f58d 100644 --- a/aten/src/ATen/native/vulkan/ops/Random.cpp +++ b/aten/src/ATen/native/vulkan/ops/Random.cpp @@ -16,7 +16,7 @@ Tensor& uniform_( Tensor& self, const double from, const double to, - const c10::optional /* not implemented */) { + const std::optional /* not implemented */) { TORCH_CHECK( self.is_vulkan(), "Vulkan: In-place operator is only supported on Vulkan tensors."); @@ -59,11 +59,11 @@ Tensor& uniform_( Tensor rand_like( const at::Tensor& input_arg, - const c10::optional /* not implemented */, - const c10::optional /* not implemented */, - const c10::optional /* not implemented */, - const c10::optional /* not implemented */, - const c10::optional /* not implemented */) { + const std::optional /* not implemented */, + const std::optional /* not implemented */, + const std::optional /* not implemented */, + const std::optional /* not implemented */, + const std::optional /* not implemented */) { // Returns a tensor with the same size as input that is filled with random // numbers from a uniform distribution on the interval [0,1). 
To match the CPU // implementation, we simplify the range to [0,1] and tolerate the small @@ -75,7 +75,7 @@ Tensor& normal_( Tensor& self, const double mean, const double std, - const c10::optional /* not implemented */) { + const std::optional /* not implemented */) { TORCH_CHECK( self.is_vulkan(), "Vulkan: In-place operator is only supported on Vulkan tensors."); @@ -120,11 +120,11 @@ Tensor& normal_( Tensor randn_like( const at::Tensor& input_arg, - const c10::optional /* not implemented */, - const c10::optional /* not implemented */, - const c10::optional /* not implemented */, - const c10::optional /* not implemented */, - const c10::optional /* not implemented */) { + const std::optional /* not implemented */, + const std::optional /* not implemented */, + const std::optional /* not implemented */, + const std::optional /* not implemented */, + const std::optional /* not implemented */) { // Returns a tensor with the same size as input that is filled with random // numbers from a normal distribution with mean 0 and standard deviation 1. return input_arg.clone().detach().normal_(0.0, 1.0); diff --git a/aten/src/ATen/native/vulkan/ops/Slice.cpp b/aten/src/ATen/native/vulkan/ops/Slice.cpp index 7d7721bcb7b15..dad391e9a5ddd 100644 --- a/aten/src/ATen/native/vulkan/ops/Slice.cpp +++ b/aten/src/ATen/native/vulkan/ops/Slice.cpp @@ -232,8 +232,8 @@ Tensor slice_height( Tensor slice( const Tensor& self, int64_t dim, - c10::optional start, - c10::optional end, + std::optional start, + std::optional end, const int64_t step) { TORCH_CHECK(step > 0, "slice step must be positive"); auto nDims = safe_downcast(self.dim()); diff --git a/aten/src/ATen/native/vulkan/ops/Sum.cpp b/aten/src/ATen/native/vulkan/ops/Sum.cpp index 56eed26448dd5..6d8331caff215 100644 --- a/aten/src/ATen/native/vulkan/ops/Sum.cpp +++ b/aten/src/ATen/native/vulkan/ops/Sum.cpp @@ -132,7 +132,7 @@ Tensor sum_dim_IntList( return self; } -Tensor sum(const Tensor& self, const c10::optional dtype) { +Tensor sum(const Tensor& self, const std::optional dtype) { std::vector dims; for (int64_t d = 0; d < self.dim(); d++) { // If any dimension has zero elements, we will shortcut to a zero-dim. 
diff --git a/aten/src/ATen/native/vulkan/ops/Upsample.cpp b/aten/src/ATen/native/vulkan/ops/Upsample.cpp index 776d1e79ce705..7e3a2ead2d632 100644 --- a/aten/src/ATen/native/vulkan/ops/Upsample.cpp +++ b/aten/src/ATen/native/vulkan/ops/Upsample.cpp @@ -12,8 +12,8 @@ using namespace api::utils; Tensor upsample_nearest2d( const Tensor& input_arg, const IntArrayRef output_sizes, - const c10::optional scales_h, - const c10::optional scales_w) { + const std::optional scales_h, + const std::optional scales_w) { api::Context* const context = api::context(); TORCH_CHECK( @@ -98,8 +98,8 @@ Tensor upsample_bilinear2d( const Tensor& input_arg, const IntArrayRef output_sizes, bool align_corners, - const c10::optional scales_h, - const c10::optional scales_w) { + const std::optional scales_h, + const std::optional scales_w) { api::Context* const context = api::context(); TORCH_CHECK( diff --git a/aten/src/ATen/native/vulkan/ops/Zero.cpp b/aten/src/ATen/native/vulkan/ops/Zero.cpp index 5ceaae07cdc3e..fc903ad3f1e19 100644 --- a/aten/src/ATen/native/vulkan/ops/Zero.cpp +++ b/aten/src/ATen/native/vulkan/ops/Zero.cpp @@ -43,10 +43,10 @@ Tensor& zero_(at::Tensor& self) { Tensor zeros( const IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { TORCH_CHECK(size.size() <= 4, "Vulkan zeros supports up to 4d tensors"); // Get the global Vulkan context diff --git a/aten/src/ATen/native/vulkan/ops/cumsum.cpp b/aten/src/ATen/native/vulkan/ops/cumsum.cpp index c0e8a0c09362d..e6537fcc5acd5 100644 --- a/aten/src/ATen/native/vulkan/ops/cumsum.cpp +++ b/aten/src/ATen/native/vulkan/ops/cumsum.cpp @@ -87,7 +87,7 @@ void set_cumsum_kernel_params( Tensor cumsum( const at::Tensor& input_arg, const int64_t dim_arg, - const c10::optional dtype) { + const std::optional dtype) { TORCH_CHECK( input_arg.dim() >= 1 && input_arg.dim() <= 4, "Vulkan cumsum expects 1 <= input dimension <= 4, Tensor input dimensions ", diff --git a/aten/src/ATen/native/xnnpack/Convolution.cpp b/aten/src/ATen/native/xnnpack/Convolution.cpp index aaf42ea3ed3d3..504c6a363816c 100644 --- a/aten/src/ATen/native/xnnpack/Convolution.cpp +++ b/aten/src/ATen/native/xnnpack/Convolution.cpp @@ -170,7 +170,7 @@ const Tensor reorder_weights_for_transpose_conv(const Tensor& weight_nhwc, ContextConv2D create( const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const IntArrayRef padding, const IntArrayRef output_padding, const IntArrayRef stride, @@ -396,13 +396,13 @@ Tensor run( c10::intrusive_ptr createConv2dClampPrePackOpContext( Tensor weight, - c10::optional bias, + std::optional bias, std::vector stride, std::vector padding, std::vector dilation, int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max) { + const std::optional& output_min, + const std::optional& output_max) { return xnnpack::XNNPackConv2dOpContext::create_context( std::move(weight), std::move(bias), @@ -417,14 +417,14 @@ c10::intrusive_ptr c10::intrusive_ptr createConv2dTransposeClampPrePackOpContext( Tensor weight, - c10::optional bias, + std::optional bias, std::vector stride, std::vector padding, std::vector output_padding, std::vector dilation, int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max) { + const std::optional& output_min, + const std::optional& output_max) { return xnnpack::XNNPackTransposeConv2dOpContext::create_context( 
std::move(weight), std::move(bias), diff --git a/aten/src/ATen/native/xnnpack/Convolution.h b/aten/src/ATen/native/xnnpack/Convolution.h index 0df4a6bcd483d..0ec3f01f36bb6 100644 --- a/aten/src/ATen/native/xnnpack/Convolution.h +++ b/aten/src/ATen/native/xnnpack/Convolution.h @@ -12,25 +12,25 @@ namespace internal::convolution2d { c10::intrusive_ptr createConv2dClampPrePackOpContext( Tensor weight, - c10::optional bias, + std::optional bias, std::vector stride, std::vector padding, std::vector dilation, int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max); + const std::optional& output_min, + const std::optional& output_max); c10::intrusive_ptr createConv2dTransposeClampPrePackOpContext( Tensor weight, - c10::optional bias, + std::optional bias, std::vector stride, std::vector padding, std::vector output_padding, std::vector dilation, int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max); + const std::optional& output_min, + const std::optional& output_max); Tensor conv2d_clamp_run( const Tensor& input, @@ -45,7 +45,7 @@ Tensor conv2d_transpose_clamp_run( ContextConv2D create( const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const IntArrayRef padding, const IntArrayRef output_padding, const IntArrayRef stride, diff --git a/aten/src/ATen/native/xnnpack/Linear.cpp b/aten/src/ATen/native/xnnpack/Linear.cpp index dcab40ec17cfd..b1f4936625828 100644 --- a/aten/src/ATen/native/xnnpack/Linear.cpp +++ b/aten/src/ATen/native/xnnpack/Linear.cpp @@ -14,7 +14,7 @@ namespace { // TODO: Decouple and improve error handling and messages. bool available( const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const float output_min, const float output_max) { // XNNPACK @@ -65,7 +65,7 @@ Tensor create_and_run( ContextLinear create( const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const float output_min, const float output_max) { const Tensor weight_contig = weight.contiguous(); @@ -173,9 +173,9 @@ Tensor run( c10::intrusive_ptr createLinearClampPrePackOpContext( Tensor weight, - c10::optional bias, - const c10::optional& output_min, - const c10::optional& output_max) { + std::optional bias, + const std::optional& output_min, + const std::optional& output_max) { return xnnpack::XNNPackLinearOpContext::create_context( std::move(weight), std::move(bias), output_min, output_max); } diff --git a/aten/src/ATen/native/xnnpack/Linear.h b/aten/src/ATen/native/xnnpack/Linear.h index 32c9d93bf4533..9a16918ca0a99 100644 --- a/aten/src/ATen/native/xnnpack/Linear.h +++ b/aten/src/ATen/native/xnnpack/Linear.h @@ -11,9 +11,9 @@ namespace internal::linear { c10::intrusive_ptr createLinearClampPrePackOpContext( Tensor weight, - c10::optional bias, - const c10::optional& output_min, - const c10::optional& output_max); + std::optional bias, + const std::optional& output_min, + const std::optional& output_max); Tensor linear_clamp_run(const Tensor& input, const c10::intrusive_ptr& op_context); @@ -22,7 +22,7 @@ unpack_prepacked_sizes_linear(const IValue& ivalue); ContextLinear create( const Tensor& weight, - const c10::optional& bias, + const std::optional& bias, const float output_min, const float output_max); diff --git a/aten/src/ATen/native/xnnpack/OpContext.cpp b/aten/src/ATen/native/xnnpack/OpContext.cpp index 07f926cd8add5..71c40d1dccd7b 100644 --- a/aten/src/ATen/native/xnnpack/OpContext.cpp +++ b/aten/src/ATen/native/xnnpack/OpContext.cpp @@ -10,9 +10,9 @@ namespace 
at::native::xnnpack { c10::intrusive_ptr XNNPackLinearOpContext::create_context( at::Tensor&& weight, - c10::optional&& bias, - const c10::optional& output_min, - const c10::optional& output_max) { + std::optional&& bias, + const std::optional& output_min, + const std::optional& output_max) { auto linear_op_context = c10::make_intrusive( std::move(weight), @@ -46,13 +46,13 @@ Tensor XNNPackLinearOpContext::run(const Tensor& input) { c10::intrusive_ptr XNNPackConv2dOpContext::create_context(at::Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& padding, std::vector&& stride, std::vector&& dilation, int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max) { + const std::optional& output_min, + const std::optional& output_max) { auto op_context = xnnpack::internal::convolution2d::create( weight, @@ -89,14 +89,14 @@ XNNPackConv2dOpContext::create_context(at::Tensor&& weight, c10::intrusive_ptr XNNPackTransposeConv2dOpContext::create_context(at::Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& padding, std::vector&& output_padding, std::vector&& stride, std::vector&& dilation, int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max) { + const std::optional& output_min, + const std::optional& output_max) { auto op_context = xnnpack::internal::convolution2d::create( weight, diff --git a/aten/src/ATen/native/xnnpack/OpContext.h b/aten/src/ATen/native/xnnpack/OpContext.h index eecc8b11fad13..0aec38b102ff5 100644 --- a/aten/src/ATen/native/xnnpack/OpContext.h +++ b/aten/src/ATen/native/xnnpack/OpContext.h @@ -10,37 +10,37 @@ namespace at::native::xnnpack { using SerializationTypeLinearPrePack = std::tuple< Tensor, - c10::optional, - c10::optional, - c10::optional>; + std::optional, + std::optional, + std::optional>; using SerializationTypeConv2dPrePack = std::tuple< Tensor, - c10::optional, + std::optional, std::vector, std::vector, std::vector, int64_t, - c10::optional, - c10::optional>; + std::optional, + std::optional>; using SerializationTypeTransposeConv2dPrePack = std::tuple< Tensor, - c10::optional, + std::optional, std::vector, std::vector, std::vector, std::vector, int64_t, - c10::optional, - c10::optional>; + std::optional, + std::optional>; class LinearOpContext : public torch::jit::CustomClassHolder { protected: Tensor orig_weight_; - c10::optional orig_bias_; - c10::optional output_min_; - c10::optional output_max_; + std::optional orig_bias_; + std::optional output_min_; + std::optional output_max_; bool orig_weight_and_bias_freed_; public: @@ -60,9 +60,9 @@ class XNNPackLinearOpContext final : public LinearOpContext { public: XNNPackLinearOpContext( Tensor&& weight, - c10::optional&& bias, - const c10::optional& min, - const c10::optional& max, + std::optional&& bias, + const std::optional& min, + const std::optional& max, ContextLinear&& op_context) : op_context_(std::move(op_context)) { orig_weight_ = std::move(weight); @@ -77,21 +77,21 @@ class XNNPackLinearOpContext final : public LinearOpContext { static c10::intrusive_ptr create_context( Tensor&& weight, - c10::optional&& bias, - const c10::optional& output_min, - const c10::optional& output_max); + std::optional&& bias, + const std::optional& output_min, + const std::optional& output_max); }; class Conv2dOpContext : public torch::jit::CustomClassHolder { protected: Tensor orig_weight_; - c10::optional orig_bias_; + std::optional orig_bias_; std::vector stride_; std::vector padding_; std::vector dilation_; int64_t 
groups_; - c10::optional output_min_; - c10::optional output_max_; + std::optional output_min_; + std::optional output_max_; bool orig_weight_and_bias_freed_; public: @@ -115,14 +115,14 @@ class Conv2dOpContext : public torch::jit::CustomClassHolder { class TransposeConv2dOpContext : public torch::jit::CustomClassHolder { protected: Tensor orig_weight_; - c10::optional orig_bias_; + std::optional orig_bias_; std::vector stride_; std::vector padding_; std::vector output_padding_; std::vector dilation_; int64_t groups_; - c10::optional output_min_; - c10::optional output_max_; + std::optional output_min_; + std::optional output_max_; bool orig_weight_and_bias_freed_; public: @@ -158,13 +158,13 @@ class XNNPackConv2dOpContext final : public Conv2dOpContext { public: XNNPackConv2dOpContext( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& padding, std::vector&& stride, std::vector&& dilation, uint64_t groups, - const c10::optional& min, - const c10::optional& max, + const std::optional& min, + const std::optional& max, ContextConv2D&& op_context) : op_context_(std::move(op_context)) { orig_weight_ = std::move(weight); @@ -183,13 +183,13 @@ class XNNPackConv2dOpContext final : public Conv2dOpContext { static c10::intrusive_ptr create_context( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& padding, std::vector&& stride, std::vector&& dilation, int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max); + const std::optional& output_min, + const std::optional& output_max); }; class XNNPackTransposeConv2dOpContext final : public TransposeConv2dOpContext { @@ -206,14 +206,14 @@ class XNNPackTransposeConv2dOpContext final : public TransposeConv2dOpContext { public: XNNPackTransposeConv2dOpContext( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& padding, std::vector&& output_padding, std::vector&& stride, std::vector&& dilation, uint64_t groups, - const c10::optional& min, - const c10::optional& max, + const std::optional& min, + const std::optional& max, ContextConv2D&& op_context) : op_context_(std::move(op_context)) { orig_weight_ = std::move(weight); @@ -233,14 +233,14 @@ class XNNPackTransposeConv2dOpContext final : public TransposeConv2dOpContext { static c10::intrusive_ptr create_context( Tensor&& weight, - c10::optional&& bias, + std::optional&& bias, std::vector&& padding, std::vector&& output_padding, std::vector&& stride, std::vector&& dilation, int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max); + const std::optional& output_min, + const std::optional& output_max); }; } // namespace at::native::xnnpack diff --git a/aten/src/ATen/ops/from_blob.h b/aten/src/ATen/ops/from_blob.h index 8ebc01a922029..88089092c1fd7 100644 --- a/aten/src/ATen/ops/from_blob.h +++ b/aten/src/ATen/ops/from_blob.h @@ -31,7 +31,7 @@ class TORCH_API TensorMaker { return *this; } - TensorMaker& storage_offset(c10::optional value) noexcept { + TensorMaker& storage_offset(std::optional value) noexcept { storage_offset_ = value; return *this; @@ -50,7 +50,7 @@ class TORCH_API TensorMaker { return *this; } - TensorMaker& target_device(c10::optional value) noexcept { + TensorMaker& target_device(std::optional value) noexcept { device_ = value; return *this; @@ -91,10 +91,10 @@ class TORCH_API TensorMaker { void* data_; IntArrayRef sizes_; OptionalIntArrayRef strides_{}; - c10::optional storage_offset_{}; + std::optional storage_offset_{}; std::function deleter_{}; 
std::unique_ptr ctx_{nullptr, detail::noopDelete}; - c10::optional device_{}; + std::optional device_{}; TensorOptions opts_{}; bool resizeable_{}; c10::Allocator* allocator_{}; @@ -110,7 +110,7 @@ inline Tensor from_blob( IntArrayRef strides, const std::function& deleter, const TensorOptions& options = {}, - const c10::optional target_device = c10::nullopt) { + const std::optional target_device = c10::nullopt) { return for_blob(data, sizes) .strides(strides) .deleter(deleter) @@ -126,7 +126,7 @@ inline Tensor from_blob( int64_t storage_offset, const std::function& deleter, const TensorOptions& options = {}, - const c10::optional target_device = c10::nullopt) { + const std::optional target_device = c10::nullopt) { return for_blob(data, sizes) .strides(strides) .storage_offset(storage_offset) @@ -141,7 +141,7 @@ inline Tensor from_blob( IntArrayRef sizes, std::function deleter, const TensorOptions& options = {}, - const c10::optional target_device = c10::nullopt) { + const std::optional target_device = c10::nullopt) { return for_blob(data, sizes) .deleter(std::move(deleter)) .options(options) diff --git a/aten/src/ATen/record_function.cpp b/aten/src/ATen/record_function.cpp index bc3a0ba517483..04743ff256ece 100644 --- a/aten/src/ATen/record_function.cpp +++ b/aten/src/ATen/record_function.cpp @@ -43,7 +43,7 @@ RecordFunctionCallbacks::iterator findCallback( return std::find_if(entries.begin(), entries.end(), match_handle); } -c10::optional extractCallback( +std::optional extractCallback( RecordFunctionCallbacks& entries, CallbackHandle handle) { auto it = findCallback(entries, handle); @@ -132,7 +132,7 @@ class CacheEntry { // The caller is expected to check `GlobalCallbackManager::get().version()' // and call CacheEntry::update() if necessary. StepCallbacks getActiveCallbacks(); - c10::optional getActiveCallbacksUnlessEmpty(); + std::optional getActiveCallbacksUnlessEmpty(); // Full rebuild. (E.g. 
during registration) void update(const std::vector& callbacks); @@ -174,7 +174,7 @@ class LocalCallbackManager { public: const RecordFunctionTLS& getTLS() const; StepCallbacks getActiveCallbacks(const RecordScope scope); - c10::optional getActiveCallbacksUnlessEmpty(const RecordScope scope); + std::optional getActiveCallbacksUnlessEmpty(const RecordScope scope); void setTLS(const RecordFunctionTLS& tls); void seed(uint32_t seed); @@ -310,7 +310,7 @@ StepCallbacks CacheEntry::getActiveCallbacks() { return active_callbacks_; } -c10::optional CacheEntry::getActiveCallbacksUnlessEmpty() { +std::optional CacheEntry::getActiveCallbacksUnlessEmpty() { getActiveCallbacksImpl(); if (C10_LIKELY(active_callbacks_.empty())) { return c10::nullopt; @@ -397,7 +397,7 @@ StepCallbacks LocalCallbackManager::getActiveCallbacks( return active_callbacks_[static_cast(scope)].getActiveCallbacks(); } -c10::optional LocalCallbackManager::getActiveCallbacksUnlessEmpty( +std::optional LocalCallbackManager::getActiveCallbacksUnlessEmpty( const RecordScope scope) { rebuildActiveCallbacksIfNeeded(); return active_callbacks_[static_cast(scope)].getActiveCallbacksUnlessEmpty(); @@ -585,25 +585,25 @@ size_t RecordFunction::num_outputs() const { fn_); } -c10::optional RecordFunction::operator_name() const { +std::optional RecordFunction::operator_name() const { return std::visit( c10::overloaded( - [&](const std::string&) -> c10::optional { + [&](const std::string&) -> std::optional { return c10::nullopt; }, - [](const schema_ref_t schema) -> c10::optional { + [](const schema_ref_t schema) -> std::optional { return schema.get().operator_name(); }), fn_); } -c10::optional RecordFunction::operator_schema() const { +std::optional RecordFunction::operator_schema() const { return std::visit( c10::overloaded( - [&](const std::string&) -> c10::optional { + [&](const std::string&) -> std::optional { return c10::nullopt; }, - [](const schema_ref_t schema) -> c10::optional { + [](const schema_ref_t schema) -> std::optional { return schema.get(); }), fn_); @@ -613,7 +613,7 @@ StepCallbacks getStepCallbacks(RecordScope scope) { return LocalCallbackManager::get().getActiveCallbacks(scope); } -c10::optional getStepCallbacksUnlessEmpty(RecordScope scope) { +std::optional getStepCallbacksUnlessEmpty(RecordScope scope) { return LocalCallbackManager::get().getActiveCallbacksUnlessEmpty(scope); } diff --git a/aten/src/ATen/record_function.h b/aten/src/ATen/record_function.h index c6f79289e6c21..014260fb220f8 100644 --- a/aten/src/ATen/record_function.h +++ b/aten/src/ATen/record_function.h @@ -433,10 +433,10 @@ struct TORCH_API RecordFunction { return handle_; } - c10::optional operator_name() const; + std::optional operator_name() const; // This method returns a copy of the FunctionSchema and can be expensive. 
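The operator_name()/operator_schema() accessors above return std::nullopt for the plain-string alternative and a value for the schema alternative, via std::visit over the stored variant. A small standard-library-only sketch of that shape, using stand-in types and a single generic lambda instead of c10::overloaded:

#include <optional>
#include <string>
#include <type_traits>
#include <variant>

// Stand-ins: the real code dispatches over an operator-name string vs. a
// FunctionSchema reference; here an int plays the "schema" role.
using NameOrSchema = std::variant<std::string, int>;

std::optional<int> maybe_schema_id(const NameOrSchema& fn) {
  return std::visit(
      [](const auto& alt) -> std::optional<int> {
        if constexpr (std::is_same_v<std::decay_t<decltype(alt)>, int>) {
          return alt;           // schema-like alternative: a value is available
        } else {
          return std::nullopt;  // plain-string alternative: nothing to return
        }
      },
      fn);
}
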
- c10::optional operator_schema() const; + std::optional operator_schema() const; void setHandle(RecordFunctionHandle handle) { handle_ = handle; @@ -521,7 +521,7 @@ struct TORCH_API RecordFunction { TORCH_API StepCallbacks getStepCallbacks(RecordScope scope); -TORCH_API c10::optional getStepCallbacksUnlessEmpty( +TORCH_API std::optional getStepCallbacksUnlessEmpty( RecordScope scope); namespace detail { diff --git a/aten/src/ATen/templates/RegisterBackendSelect.cpp b/aten/src/ATen/templates/RegisterBackendSelect.cpp index dcb5986ab69ed..3586e44da999b 100644 --- a/aten/src/ATen/templates/RegisterBackendSelect.cpp +++ b/aten/src/ATen/templates/RegisterBackendSelect.cpp @@ -23,7 +23,7 @@ namespace { ${backend_select_method_definitions} -bool is_pinned(const Tensor& self, c10::optional device) { +bool is_pinned(const Tensor& self, std::optional device) { // Only CPU tensors can be pinned if (!self.is_cpu()) { return false; @@ -33,7 +33,7 @@ bool is_pinned(const Tensor& self, c10::optional device) { return at::_ops::is_pinned::redispatch(_dk, self, device); } -at::Tensor _pin_memory(const Tensor& self, c10::optional device) { +at::Tensor _pin_memory(const Tensor& self, std::optional device) { TORCH_CHECK(self.device().is_cpu(), "cannot pin '", self.toString(), "' only dense CPU tensors can be pinned"); DispatchKeySet _dk = c10::DispatchKeySet(c10::computeDispatchKey(c10::nullopt, self.layout(), device.value_or(at::kCUDA))); if (self.is_nested()) { diff --git a/aten/src/ATen/templates/RegisterFunctionalization.cpp b/aten/src/ATen/templates/RegisterFunctionalization.cpp index fabc12a03fa9f..74d02be9f93d3 100644 --- a/aten/src/ATen/templates/RegisterFunctionalization.cpp +++ b/aten/src/ATen/templates/RegisterFunctionalization.cpp @@ -60,7 +60,7 @@ inline Tensor to_meta(const Tensor& t) { /*device=*/c10::make_optional(c10::Device(kMeta)), /*pin_memory=*/c10::nullopt); } -inline c10::optional to_meta(const c10::optional& t) { +inline std::optional to_meta(const c10::optional& t) { if (t.has_value()) { return c10::make_optional(to_meta(*t)); } diff --git a/aten/src/ATen/templates/TensorBody.h b/aten/src/ATen/templates/TensorBody.h index 010f12d4cfbce..1515442dd1f94 100644 --- a/aten/src/ATen/templates/TensorBody.h +++ b/aten/src/ATen/templates/TensorBody.h @@ -398,7 +398,7 @@ class TORCH_API Tensor: public TensorBase { /// // f requires grad, has no operation creating it /// @endcode - /// \fn void backward(const Tensor & gradient={}, c10::optional retain_graph=c10::nullopt, bool create_graph=false, c10::optional inputs=c10::nullopt) const; + /// \fn void backward(const Tensor & gradient={}, std::optional retain_graph=c10::nullopt, bool create_graph=false, c10::optional inputs=c10::nullopt) const; /// /// Computes the gradient of current tensor with respect to graph leaves. /// @@ -433,7 +433,7 @@ class TORCH_API Tensor: public TensorBase { /// the current implementation will call its grad_fn (even though it is not strictly needed to get this gradients). /// It is an implementation detail on which the user should not rely. /// See https://github.com/pytorch/pytorch/pull/60521#issuecomment-867061780 for more details. 
- void backward(const Tensor & gradient={}, c10::optional retain_graph=c10::nullopt, bool create_graph=false, c10::optional inputs=c10::nullopt) const { + void backward(const Tensor & gradient={}, std::optional retain_graph=c10::nullopt, bool create_graph=false, c10::optional inputs=c10::nullopt) const { // NB: Adding this wrapper to _backward here because we'd like our // 'backwards' api to accept the 'inputs' argument optionally. Since code gen // currently does not support optional of TensorList our approach is to replace @@ -626,7 +626,7 @@ class TORCH_API Tensor: public TensorBase { return TensorBase::data(); } - void _backward(TensorList inputs, const c10::optional& gradient, c10::optional keep_graph, bool create_graph) const; + void _backward(TensorList inputs, const std::optional& gradient, c10::optional keep_graph, bool create_graph) const; const Tensor& requires_grad_(bool _requires_grad=true) const { TensorBase::requires_grad_(_requires_grad); @@ -737,7 +737,7 @@ struct ExclusivelyOwnedTraits { namespace at { inline c10::MaybeOwned borrow_from_optional_tensor( - const c10::optional& opt) { + const std::optional& opt) { return opt.has_value() ? c10::MaybeOwned::borrowed(*opt) : c10::MaybeOwned::owned(std::in_place); diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt index a1a6249414dea..09579b2367206 100644 --- a/aten/src/ATen/test/CMakeLists.txt +++ b/aten/src/ATen/test/CMakeLists.txt @@ -123,16 +123,6 @@ list(APPEND ATen_XPU_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/xpu_generator_test.cpp ) -# Caffe2 specific tests -if(BUILD_CAFFE2) - list(APPEND ATen_CPU_TEST_SRCS - ${CMAKE_CURRENT_SOURCE_DIR}/ExclusivelyOwned_test.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/tensor_interop_test.cpp) - list(APPEND ATen_CUDA_TEST_SRCS - ${CMAKE_CURRENT_SOURCE_DIR}/cuda_tensor_interop_test.cpp) -endif() - - # ---[ Send the lists to the parent scope. 
set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE) set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE) diff --git a/aten/src/ATen/test/cpu_rng_test.cpp b/aten/src/ATen/test/cpu_rng_test.cpp index ebc3eee12f3f6..593d78d47887f 100644 --- a/aten/src/ATen/test/cpu_rng_test.cpp +++ b/aten/src/ATen/test/cpu_rng_test.cpp @@ -22,10 +22,10 @@ struct TestCPUGenerator : public c10::GeneratorImpl { ~TestCPUGenerator() override = default; uint32_t random() { return value_; } uint64_t random64() { return value_; } - c10::optional next_float_normal_sample() { return next_float_normal_sample_; } - c10::optional next_double_normal_sample() { return next_double_normal_sample_; } - void set_next_float_normal_sample(c10::optional randn) { next_float_normal_sample_ = randn; } - void set_next_double_normal_sample(c10::optional randn) { next_double_normal_sample_ = randn; } + std::optional next_float_normal_sample() { return next_float_normal_sample_; } + std::optional next_double_normal_sample() { return next_double_normal_sample_; } + void set_next_float_normal_sample(std::optional randn) { next_float_normal_sample_ = randn; } + void set_next_double_normal_sample(std::optional randn) { next_double_normal_sample_ = randn; } void set_current_seed(uint64_t seed) override { throw std::runtime_error("not implemented"); } void set_offset(uint64_t offset) override { throw std::runtime_error("not implemented"); } uint64_t get_offset() const override { throw std::runtime_error("not implemented"); } @@ -38,95 +38,95 @@ struct TestCPUGenerator : public c10::GeneratorImpl { static DeviceType device_type() { return DeviceType::CPU; } uint64_t value_; - c10::optional next_float_normal_sample_; - c10::optional next_double_normal_sample_; + std::optional next_float_normal_sample_; + std::optional next_double_normal_sample_; }; // ==================================================== Random ======================================================== -Tensor& random_(Tensor& self, c10::optional generator) { +Tensor& random_(Tensor& self, std::optional generator) { return at::native::templates::random_impl(self, generator); } -Tensor& random_from_to(Tensor& self, int64_t from, optional to, c10::optional generator) { +Tensor& random_from_to(Tensor& self, int64_t from, optional to, std::optional generator) { return at::native::templates::random_from_to_impl(self, from, to, generator); } -Tensor& random_to(Tensor& self, int64_t to, c10::optional generator) { +Tensor& random_to(Tensor& self, int64_t to, std::optional generator) { return random_from_to(self, 0, to, generator); } // ==================================================== Normal ======================================================== -Tensor& normal_(Tensor& self, double mean, double std, c10::optional gen) { +Tensor& normal_(Tensor& self, double mean, double std, std::optional gen) { return at::native::templates::normal_impl_(self, mean, std, gen); } -Tensor& normal_Tensor_float_out(const Tensor& mean, double std, c10::optional gen, Tensor& output) { +Tensor& normal_Tensor_float_out(const Tensor& mean, double std, std::optional gen, Tensor& output) { return at::native::templates::normal_out_impl(output, mean, std, gen); } -Tensor& normal_float_Tensor_out(double mean, const Tensor& std, c10::optional gen, Tensor& output) { +Tensor& normal_float_Tensor_out(double mean, const Tensor& std, std::optional gen, Tensor& output) { return at::native::templates::normal_out_impl(output, mean, std, gen); } -Tensor& normal_Tensor_Tensor_out(const Tensor& mean, const 
Tensor& std, c10::optional gen, Tensor& output) { +Tensor& normal_Tensor_Tensor_out(const Tensor& mean, const Tensor& std, std::optional gen, Tensor& output) { return at::native::templates::normal_out_impl(output, mean, std, gen); } -Tensor normal_Tensor_float(const Tensor& mean, double std, c10::optional gen) { +Tensor normal_Tensor_float(const Tensor& mean, double std, std::optional gen) { return at::native::templates::normal_impl(mean, std, gen); } -Tensor normal_float_Tensor(double mean, const Tensor& std, c10::optional gen) { +Tensor normal_float_Tensor(double mean, const Tensor& std, std::optional gen) { return at::native::templates::normal_impl(mean, std, gen); } -Tensor normal_Tensor_Tensor(const Tensor& mean, const Tensor& std, c10::optional gen) { +Tensor normal_Tensor_Tensor(const Tensor& mean, const Tensor& std, std::optional gen) { return at::native::templates::normal_impl(mean, std, gen); } // ==================================================== Uniform ======================================================= -Tensor& uniform_(Tensor& self, double from, double to, c10::optional generator) { +Tensor& uniform_(Tensor& self, double from, double to, std::optional generator) { return at::native::templates::uniform_impl_(self, from, to, generator); } // ==================================================== Cauchy ======================================================== -Tensor& cauchy_(Tensor& self, double median, double sigma, c10::optional generator) { +Tensor& cauchy_(Tensor& self, double median, double sigma, std::optional generator) { return at::native::templates::cauchy_impl_(self, median, sigma, generator); } // ================================================== LogNormal ======================================================= -Tensor& log_normal_(Tensor& self, double mean, double std, c10::optional gen) { +Tensor& log_normal_(Tensor& self, double mean, double std, std::optional gen) { return at::native::templates::log_normal_impl_(self, mean, std, gen); } // ================================================== Geometric ======================================================= -Tensor& geometric_(Tensor& self, double p, c10::optional gen) { +Tensor& geometric_(Tensor& self, double p, std::optional gen) { return at::native::templates::geometric_impl_(self, p, gen); } // ================================================== Exponential ===================================================== -Tensor& exponential_(Tensor& self, double lambda, c10::optional gen) { +Tensor& exponential_(Tensor& self, double lambda, std::optional gen) { return at::native::templates::exponential_impl_(self, lambda, gen); } // ================================================== Bernoulli ======================================================= -Tensor& bernoulli_Tensor(Tensor& self, const Tensor& p_, c10::optional gen) { +Tensor& bernoulli_Tensor(Tensor& self, const Tensor& p_, std::optional gen) { return at::native::templates::bernoulli_impl_(self, p_, gen); } -Tensor& bernoulli_float(Tensor& self, double p, c10::optional gen) { +Tensor& bernoulli_float(Tensor& self, double p, std::optional gen) { return at::native::templates::bernoulli_impl_(self, p, gen); } -Tensor& bernoulli_out(const Tensor& self, c10::optional gen, Tensor& result) { +Tensor& bernoulli_out(const Tensor& self, std::optional gen, Tensor& result) { return at::native::templates::bernoulli_out_impl(result, self, gen); } diff --git a/aten/src/ATen/test/cuda_distributions_test.cu b/aten/src/ATen/test/cuda_distributions_test.cu index 
82d3d7777bc23..dcb5c9cc19cf0 100644 --- a/aten/src/ATen/test/cuda_distributions_test.cu +++ b/aten/src/ATen/test/cuda_distributions_test.cu @@ -173,7 +173,7 @@ TEST(RandomPermutationTest, TestIslandShuffle) { bool shuffled2 = false; for (int i = 0; i < 100; i++) { cudaDeviceSynchronize(); - c10::optional gen = c10::nullopt; + std::optional gen = c10::nullopt; randperm_handle_duplicate_keys(keys, values, 8, 5, gen); cudaDeviceSynchronize(); std::vector slice1 = {values[0], values[1], values[2]}; diff --git a/aten/src/ATen/test/cuda_optional_test.cu b/aten/src/ATen/test/cuda_optional_test.cu index b35180d921e9f..be51a4cbe8c97 100644 --- a/aten/src/ATen/test/cuda_optional_test.cu +++ b/aten/src/ATen/test/cuda_optional_test.cu @@ -11,8 +11,8 @@ using namespace at; // optional in cuda files TEST(OptionalTest, OptionalTestCUDA) { if (!at::cuda::is_available()) return; - c10::optional trivially_destructible; - c10::optional> non_trivially_destructible; + std::optional trivially_destructible; + std::optional> non_trivially_destructible; ASSERT_FALSE(trivially_destructible.has_value()); ASSERT_FALSE(non_trivially_destructible.has_value()); diff --git a/aten/src/ATen/test/cuda_stream_test.cpp b/aten/src/ATen/test/cuda_stream_test.cpp index 77100482b5955..b6b3bf7f9e7de 100644 --- a/aten/src/ATen/test/cuda_stream_test.cpp +++ b/aten/src/ATen/test/cuda_stream_test.cpp @@ -408,7 +408,7 @@ TEST(TestStream, ExternalMultiThreadTest) { std::promise aToBProm; std::promise bToAProm; - c10::optional foundStream; + std::optional foundStream; std::thread threadA([&]() { at::cuda::CUDAGuard device_guard(0); diff --git a/aten/src/ATen/test/extension_backend_test.cpp b/aten/src/ATen/test/extension_backend_test.cpp index 4be68b1d0a710..3b2345f347d63 100644 --- a/aten/src/ATen/test/extension_backend_test.cpp +++ b/aten/src/ATen/test/extension_backend_test.cpp @@ -15,8 +15,8 @@ using namespace at; static int test_int; -Tensor empty_override(SymIntArrayRef size, c10::optional dtype, c10::optional layout, - c10::optional device, c10::optional pin_memory, c10::optional optional_memory_format) { +Tensor empty_override(SymIntArrayRef size, std::optional dtype, c10::optional layout, + std::optional device, c10::optional pin_memory, c10::optional optional_memory_format) { test_int = 1; auto tensor_impl = c10::make_intrusive( Storage( @@ -39,10 +39,10 @@ Tensor add_override(const Tensor & a, const Tensor & b , const Scalar& c) { Tensor empty_strided_override( IntArrayRef size, IntArrayRef stride, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return empty_override(fromIntArrayRefSlow(size), dtype, layout, device, pin_memory, c10::nullopt); } diff --git a/aten/src/ATen/test/operator_name_test.cpp b/aten/src/ATen/test/operator_name_test.cpp index 6d074572dd748..f670a434cb638 100644 --- a/aten/src/ATen/test/operator_name_test.cpp +++ b/aten/src/ATen/test/operator_name_test.cpp @@ -9,7 +9,7 @@ TEST(OperatorNameTest, SetNamespaceIfNotSetWithoutExistingNamespace) { EXPECT_TRUE(result); EXPECT_EQ(testName.name, "ns::operator"); EXPECT_EQ(testName.overload_name, "operator.overload"); - EXPECT_EQ(testName.getNamespace(), c10::optional("ns")); + EXPECT_EQ(testName.getNamespace(), std::optional("ns")); } TEST(OperatorNameTest, SetNamespaceIfNotSetWithExistingNamespace) { @@ -18,5 +18,5 @@ TEST(OperatorNameTest, SetNamespaceIfNotSetWithExistingNamespace) { EXPECT_FALSE(result); 
EXPECT_EQ(namespacedName.name, "already_namespaced::operator"); EXPECT_EQ(namespacedName.overload_name, "operator.overload"); - EXPECT_EQ(namespacedName.getNamespace(), c10::optional("already_namespaced")); + EXPECT_EQ(namespacedName.getNamespace(), std::optional("already_namespaced")); } diff --git a/aten/src/ATen/test/rng_test.h b/aten/src/ATen/test/rng_test.h index df04d340893fb..82b9c6d5a836e 100644 --- a/aten/src/ATen/test/rng_test.h +++ b/aten/src/ATen/test/rng_test.h @@ -68,14 +68,14 @@ void test_random_from_to(const at::Device& device) { constexpr auto uint64_max_val = std::numeric_limits::max(); std::vector froms; - std::vector> tos; + std::vector<::std::optional> tos; if constexpr (::std::is_same_v) { froms = { 0L }; tos = { 1L, - static_cast>(c10::nullopt) + static_cast<::std::optional>(c10::nullopt) }; } else if constexpr (::std::is_signed_v) { constexpr int64_t min_from = _min_from(); @@ -86,11 +86,11 @@ void test_random_from_to(const at::Device& device) { 42L }; tos = { - c10::optional(-42L), - c10::optional(0L), - c10::optional(42L), - c10::optional(max_to), - static_cast>(c10::nullopt) + ::std::optional(-42L), + ::std::optional(0L), + ::std::optional(42L), + ::std::optional(max_to), + static_cast<::std::optional>(c10::nullopt) }; } else { froms = { @@ -98,9 +98,9 @@ void test_random_from_to(const at::Device& device) { 42L }; tos = { - c10::optional(42L), - c10::optional(max_to), - static_cast>(c10::nullopt) + ::std::optional(42L), + ::std::optional(max_to), + static_cast<::std::optional>(c10::nullopt) }; } @@ -116,7 +116,7 @@ void test_random_from_to(const at::Device& device) { bool from_to_case_covered = false; bool from_case_covered = false; for (const int64_t from : froms) { - for (const c10::optional to : tos) { + for (const ::std::optional to : tos) { if (!to.has_value() || from < *to) { for (const uint64_t val : vals) { auto gen = at::make_generator(val); diff --git a/aten/src/ATen/test/type_test.cpp b/aten/src/ATen/test/type_test.cpp index 3ea64a4da2124..955d60c586c0f 100644 --- a/aten/src/ATen/test/type_test.cpp +++ b/aten/src/ATen/test/type_test.cpp @@ -9,7 +9,7 @@ namespace c10 { TEST(TypeCustomPrinter, Basic) { TypePrinter printer = - [](const Type& t) -> c10::optional { + [](const Type& t) -> std::optional { if (auto tensorType = t.cast()) { return "CustomTensor"; } @@ -29,7 +29,7 @@ TEST(TypeCustomPrinter, Basic) { TEST(TypeCustomPrinter, ContainedTypes) { TypePrinter printer = - [](const Type& t) -> c10::optional { + [](const Type& t) -> std::optional { if (auto tensorType = t.cast()) { return "CustomTensor"; } @@ -53,7 +53,7 @@ TEST(TypeCustomPrinter, ContainedTypes) { TEST(TypeCustomPrinter, NamedTuples) { TypePrinter printer = - [](const Type& t) -> c10::optional { + [](const Type& t) -> std::optional { if (auto tupleType = t.cast()) { // Rewrite only NamedTuples if (tupleType->name()) { diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp index 5b6a31e0b5147..687691a370bf4 100644 --- a/aten/src/ATen/test/vulkan_api_test.cpp +++ b/aten/src/ATen/test/vulkan_api_test.cpp @@ -177,8 +177,8 @@ static void gen_all_subsets( static void slice_test( const std::vector& size, int64_t dim, - c10::optional start, - c10::optional end, + std::optional start, + std::optional end, int64_t step) { // Arrange const auto in_cpu = at::rand(size, at::device(at::kCPU).dtype(at::kFloat)); @@ -212,7 +212,7 @@ static void slice_tests(const std::unordered_map>& } } -static void clone_test(const std::vector& size, c10::optional 
optional_memory_format) { +static void clone_test(const std::vector& size, std::optional optional_memory_format) { // Arrange const auto in_cpu = at::rand(size, at::device(at::kCPU).dtype(at::kFloat)); const auto in_vulkan = in_cpu.vulkan(); @@ -249,7 +249,7 @@ inline std::vector callOpByName( const char* func_name, const char* overload_name, Args... args) { - const c10::optional op_handle = + const std::optional op_handle = c10::Dispatcher::singleton().findSchema({func_name, overload_name}); assert(op_handle.has_value()); return callOpByHandle(op_handle.value(), std::forward(args)...); @@ -7120,7 +7120,7 @@ TEST_F(VulkanAPITest, zeros) { TEST_F(VulkanAPITest, clone_success) { // Arrange - std::multimap, std::vector> mem2sizes { + std::multimap, std::vector> mem2sizes { {c10::MemoryFormat::Preserve, {2, 3, 5, 161}}, // 4D tensors with MemoryFormat::Preserve {c10::MemoryFormat::Contiguous, {2, 3, 5, 161}}, // 4D tensors with MemoryFormat::Contiguous {{}, {2, 3, 5, 161}}, // 4D tensors with null diff --git a/aten/src/ATen/test/vulkan_quantized_api_test.cpp b/aten/src/ATen/test/vulkan_quantized_api_test.cpp index 031154de17f85..cf243d5ce50c9 100644 --- a/aten/src/ATen/test/vulkan_quantized_api_test.cpp +++ b/aten/src/ATen/test/vulkan_quantized_api_test.cpp @@ -136,7 +136,7 @@ inline std::vector callOpByName( const char* func_name, const char* overload_name, Args... args) { - const c10::optional op_handle = + const std::optional op_handle = c10::Dispatcher::singleton().findSchema({func_name, overload_name}); assert(op_handle.has_value()); return callOpByHandle(op_handle.value(), std::forward(args)...); diff --git a/aten/src/ATen/xpu/CachingHostAllocator.cpp b/aten/src/ATen/xpu/CachingHostAllocator.cpp index 13cd1b6124a9b..332114a8715b7 100644 --- a/aten/src/ATen/xpu/CachingHostAllocator.cpp +++ b/aten/src/ATen/xpu/CachingHostAllocator.cpp @@ -20,7 +20,7 @@ struct XPUCachingHostAllocatorImpl } void record_stream( - c10::optional>& events, + std::optional>& events, XPUStream stream) override { XPUEvent event; event.record(stream); diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py index 1987a60f64fbb..096dbc48ec7da 100644 --- a/benchmarks/dynamo/common.py +++ b/benchmarks/dynamo/common.py @@ -108,6 +108,7 @@ current_onnx_compiler = "" current_batch_size = None output_filename = None +disable_output = False MAX_DOWNLOAD_ATTEMPTS = 5 @@ -306,6 +307,9 @@ def load_model_from_path(path_and_class_str): def output_csv(filename, headers, row): + global disable_output + if disable_output: + return if os.path.exists(filename): with open(filename) as fd: lines = list(csv.reader(fd)) or [[]] @@ -3212,6 +3216,11 @@ def get_example_inputs(self): "--output-directory", help="Overrides the directory to place output files.", ) + parser.add_argument( + "--disable-output", + action="store_true", + help="Disable writing of output files, e.g., for warm-up runs", + ) parser.add_argument( "--baseline", help="Compare with a prior --output", @@ -3391,6 +3400,7 @@ def get_example_inputs(self): ) group_latency.add_argument( "--warm-start-latency", + "--warm_start_latency", action="store_true", help="Run model(s) twice and preseve caches in between to enable a 'warm start' on the 2nd run", ) @@ -3610,10 +3620,11 @@ def main(runner, original_dir=None, args=None): cmd = [sys.executable] + sys.argv cmd.remove("--warm-start-latency") - print(f"Executing cold-start run for {args.only}") - subprocess.check_call(cmd, timeout=args.timeout, env=env) + print(f"Performing cold-start run for {args.only}") + 
warmup_cmd = cmd + ["--repeat=1", "--disable-output"] + subprocess.check_call(warmup_cmd, timeout=args.timeout, env=env) - print(f"Executing warm-start run for {args.only}") + print(f"Performing warm-start run for {args.only}") subprocess.check_call(cmd, timeout=args.timeout, env=env) else: # single process path just uses the main process @@ -3666,7 +3677,7 @@ def run(runner, args, original_dir=None): if args.ci: if args.accuracy: # Run fewer iterations when checking accuracy - args.repeat = 2 + args.repeat = min(args.repeat, 2) # Set translation validation on by default on CI accuracy runs. torch.fx.experimental._config.translation_validation = True @@ -3820,9 +3831,12 @@ def run(runner, args, original_dir=None): runner.skip_models.clear() experiment = null_experiment - global current_name, current_device, current_batch_size, output_filename, optimize_ctx, current_onnx_compiler + global current_name, current_device, current_batch_size, output_filename, disable_output, optimize_ctx, current_onnx_compiler optimize_ctx = contextlib.nullcontext() + if args.disable_output: + disable_output = True + if args.overhead: optimize_ctx = torch._dynamo.optimize(dummy_fx_compile, nopython=args.nopython) experiment = speedup_experiment diff --git a/benchmarks/transformer/score_mod.py b/benchmarks/transformer/score_mod.py index 0e9e8d11a35b9..2c5f41502f7ea 100644 --- a/benchmarks/transformer/score_mod.py +++ b/benchmarks/transformer/score_mod.py @@ -1,3 +1,4 @@ +import argparse import itertools from collections import defaultdict from dataclasses import asdict, dataclass @@ -98,7 +99,7 @@ def generate_inputs( return query, key, value -def run_single_experiment(config: ExperimentConfig) -> ExperimentResults: +def run_single_experiment(config: ExperimentConfig, dynamic=False) -> ExperimentResults: device = torch.device("cuda") query, key, value = generate_inputs( config.batch_size, @@ -113,7 +114,7 @@ def run_single_experiment(config: ExperimentConfig) -> ExperimentResults: def eager_sdpa(query, key, value, _): return F.scaled_dot_product_attention(query, key, value) - compiled_sdpa = torch.compile(_flex_attention) + compiled_sdpa = torch.compile(_flex_attention, dynamic=dynamic) score_mod = config.score_mod @@ -242,16 +243,26 @@ def generate_experiment_configs() -> List[ExperimentConfig]: return all_configs -def main(): +def main(dynamic=False): seed = 123 np.random.seed(seed) torch.manual_seed(seed) results = [] for config in tqdm(generate_experiment_configs()): - results.append(Experiment(config, run_single_experiment(config))) + results.append( + Experiment(config, run_single_experiment(config, dynamic=dynamic)) + ) print_results(results) if __name__ == "__main__": - main() + parser = argparse.ArgumentParser() + parser.add_argument( + "--dynamic", + action="store_true", + help="Runs a dynamic shapes version of compiled flex attention.", + ) + + args = parser.parse_args() + main(args.dynamic) diff --git a/binaries/CMakeLists.txt b/binaries/CMakeLists.txt index 70b235e43e7d7..273353128baaf 100644 --- a/binaries/CMakeLists.txt +++ b/binaries/CMakeLists.txt @@ -7,16 +7,6 @@ if(INTERN_BUILD_MOBILE) return() endif() -if(BUILD_CAFFE2) - caffe2_binary_target("at_launch_benchmark.cc") - target_include_directories(at_launch_benchmark PUBLIC - ${CMAKE_BINARY_DIR}/aten/src) - - caffe2_binary_target("intra_inter_benchmark.cc") - target_include_directories(intra_inter_benchmark PUBLIC - ${CMAKE_BINARY_DIR}/aten/src) -endif() - caffe2_binary_target("parallel_info.cc") target_include_directories(parallel_info PUBLIC 
${CMAKE_BINARY_DIR}/aten/src) # provides "ATen/TypeExtendedInterface.h" to ATen.h diff --git a/binaries/compare_models_torch.cc b/binaries/compare_models_torch.cc index 5e90445560bc7..c8338fe546a59 100644 --- a/binaries/compare_models_torch.cc +++ b/binaries/compare_models_torch.cc @@ -305,7 +305,7 @@ int main(int argc, char** argv) { torch::jit::GraphOptimizerEnabledGuard no_optimizer_guard(false); c10::CPUCachingAllocator caching_allocator; - c10::optional caching_allocator_guard; + std::optional caching_allocator_guard; if (FLAGS_use_caching_allocator) { caching_allocator_guard.emplace(&caching_allocator); } diff --git a/binaries/speed_benchmark_torch.cc b/binaries/speed_benchmark_torch.cc index b2c521e569b16..00b17ddd47488 100644 --- a/binaries/speed_benchmark_torch.cc +++ b/binaries/speed_benchmark_torch.cc @@ -294,7 +294,7 @@ int main(int argc, char** argv) { } c10::CPUCachingAllocator caching_allocator; - c10::optional caching_allocator_guard; + std::optional caching_allocator_guard; if (FLAGS_use_caching_allocator) { caching_allocator_guard.emplace(&caching_allocator); } diff --git a/buckbuild.bzl b/buckbuild.bzl index 89707dd9bc3f0..4c4fc9a89a280 100644 --- a/buckbuild.bzl +++ b/buckbuild.bzl @@ -279,7 +279,6 @@ def get_pt_preprocessor_flags(): "-D_THP_CORE", "-DUSE_SCALARS", "-DNO_CUDNN_DESTROY_HANDLE", - "-DBUILD_CAFFE2", ] if _is_build_mode_dev(): diff --git a/build_variables.bzl b/build_variables.bzl index d0d5857c2b3c9..6fd04b7701157 100644 --- a/build_variables.bzl +++ b/build_variables.bzl @@ -106,6 +106,7 @@ libtorch_profiler_sources = [ "torch/csrc/profiler/standalone/execution_trace_observer.cpp", "torch/csrc/profiler/standalone/itt_observer.cpp", "torch/csrc/profiler/standalone/nvtx_observer.cpp", + "torch/csrc/profiler/standalone/privateuse1_observer.cpp", "torch/csrc/profiler/stubs/base.cpp", "torch/csrc/profiler/orchestration/vulkan.cpp", "torch/csrc/profiler/perf.cpp", @@ -825,6 +826,7 @@ libtorch_python_core_sources = [ "torch/csrc/mtia/Module.cpp", "torch/csrc/inductor/aoti_runner/pybind.cpp", "torch/csrc/inductor/aoti_eager/kernel_holder.cpp", + "torch/csrc/inductor/aoti_eager/kernel_meta_info.cpp", "torch/csrc/jit/backends/backend_init.cpp", "torch/csrc/jit/python/init.cpp", "torch/csrc/jit/passes/onnx.cpp", @@ -1171,7 +1173,6 @@ aten_native_source_codegen_list = [ "aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp", "aten/src/ATen/native/cpu/FusedAdamKernel.cpp", "aten/src/ATen/native/cpu/FusedSGDKernel.cpp", - "aten/src/ATen/native/cpu/FusedAdagradKernel.cpp", ] # This aten native source file list will not go through aten codegen process @@ -1408,7 +1409,6 @@ aten_native_source_non_codegen_list = [ "aten/src/ATen/native/xnnpack/Shim.cpp", "aten/src/ATen/native/FusedAdam.cpp", "aten/src/ATen/native/FusedSGD.cpp", - "aten/src/ATen/native/FusedAdagrad.cpp", # Files not in native, but depends on native symbols # "aten/src/ATen/TensorIndexing.cpp", "aten/src/ATen/TensorIterator.cpp", diff --git a/c10/core/ConstantSymNodeImpl.h b/c10/core/ConstantSymNodeImpl.h index 4df1d1010f807..3c0fb66f7469f 100644 --- a/c10/core/ConstantSymNodeImpl.h +++ b/c10/core/ConstantSymNodeImpl.h @@ -69,14 +69,14 @@ class C10_API ConstantSymNodeImpl : public SymNodeImpl { return ::std::get(value_) ? 
"true" : "false"; } } - c10::optional constant_int() override { + std::optional constant_int() override { if constexpr (is_int_()) { return ::std::get(value_); } else { return c10::nullopt; } } - c10::optional constant_bool() override { + std::optional constant_bool() override { if constexpr (is_bool_()) { return ::std::get(value_); } else { diff --git a/c10/core/StorageImpl.cpp b/c10/core/StorageImpl.cpp index dc36064ddca4e..9dd6f5f431316 100644 --- a/c10/core/StorageImpl.cpp +++ b/c10/core/StorageImpl.cpp @@ -68,7 +68,7 @@ c10::intrusive_ptr make_storage_impl( c10::DataPtr data_ptr, c10::Allocator* allocator, bool resizable, - c10::optional device_opt) { + std::optional device_opt) { // This will be non-nullptr only when there is a custom StorageImpl // constructor for the given device c10::StorageImplCreateHelper fptr = nullptr; diff --git a/c10/core/StorageImpl.h b/c10/core/StorageImpl.h index 4ee9f62e620f5..abe6218fbc941 100644 --- a/c10/core/StorageImpl.h +++ b/c10/core/StorageImpl.h @@ -325,6 +325,6 @@ C10_API c10::intrusive_ptr make_storage_impl( c10::DataPtr data_ptr, c10::Allocator* allocator, bool resizable, - c10::optional device_opt); + std::optional device_opt); } // namespace c10 diff --git a/c10/core/SymBool.h b/c10/core/SymBool.h index cf984611e2340..9f9f141293a37 100644 --- a/c10/core/SymBool.h +++ b/c10/core/SymBool.h @@ -34,7 +34,7 @@ class C10_API SymBool { SymNode wrap_node(const SymNode& base) const; bool expect_bool() const { - c10::optional c = maybe_as_bool(); + std::optional c = maybe_as_bool(); TORCH_CHECK(c.has_value()); return *c; } @@ -66,7 +66,7 @@ class C10_API SymBool { return data_; } - c10::optional maybe_as_bool() const { + std::optional maybe_as_bool() const { if (!is_heap_allocated()) { return c10::make_optional(data_); } diff --git a/c10/core/SymInt.h b/c10/core/SymInt.h index 79ce4054b8640..025c351334a01 100644 --- a/c10/core/SymInt.h +++ b/c10/core/SymInt.h @@ -229,7 +229,7 @@ class C10_API SymInt { return data_; } - c10::optional maybe_as_int() const { + std::optional maybe_as_int() const { if (!is_heap_allocated()) { return c10::make_optional(data_); } diff --git a/c10/core/SymIntArrayRef.h b/c10/core/SymIntArrayRef.h index 76137aa47bdbb..760f4ba4e79a2 100644 --- a/c10/core/SymIntArrayRef.h +++ b/c10/core/SymIntArrayRef.h @@ -19,7 +19,7 @@ inline at::IntArrayRef asIntArrayRefUnchecked(c10::SymIntArrayRef ar) { // allocate another buffer and write the integers into it. If you need it, // we can do it. But I don't think you need it. 
-inline c10::optional asIntArrayRefSlowOpt( +inline std::optional asIntArrayRefSlowOpt( c10::SymIntArrayRef ar) { for (const c10::SymInt& sci : ar) { if (sci.is_heap_allocated()) { diff --git a/c10/core/SymNodeImpl.h b/c10/core/SymNodeImpl.h index 0413b9ff28482..9ffab5065109e 100644 --- a/c10/core/SymNodeImpl.h +++ b/c10/core/SymNodeImpl.h @@ -30,61 +30,61 @@ class C10_API SymNodeImpl : public c10::intrusive_ptr_target { // these could be pure virtual when we implement LTC versions virtual bool is_int() { TORCH_CHECK(false, "NYI"); - }; + } virtual bool is_bool() { TORCH_CHECK(false, "NYI"); - }; + } virtual bool is_float() { TORCH_CHECK(false, "NYI"); - }; + } virtual bool is_nested_int() const { return false; - }; + } virtual SymNode add(const SymNode& other) { TORCH_CHECK(false, "NYI"); - }; + } virtual SymNode sub(const SymNode& other) { TORCH_CHECK(false, "NYI"); - }; + } virtual SymNode mul(const SymNode& other) { TORCH_CHECK(false, "NYI"); - }; + } virtual SymNode truediv(const SymNode& other) { TORCH_CHECK(false, "NYI"); - }; + } virtual SymNode pow(const SymNode& other) { TORCH_CHECK(false, "NYI"); - }; + } virtual SymNode floordiv(const SymNode& other) { TORCH_CHECK(false, "NYI"); - }; + } virtual SymNode mod(const SymNode& other) { TORCH_CHECK(false, "NYI"); - }; + } virtual SymNode eq(const SymNode& other) { TORCH_CHECK(false, "NYI"); - }; + } virtual SymNode ne(const SymNode& other) { TORCH_CHECK(false, "NYI"); - }; + } virtual SymNode gt(const SymNode& other) { TORCH_CHECK(false, "NYI"); - }; + } virtual SymNode lt(const SymNode& other) { TORCH_CHECK(false, "NYI"); - }; + } virtual SymNode le(const SymNode& other) { TORCH_CHECK(false, "NYI"); - }; + } virtual SymNode ge(const SymNode& other) { TORCH_CHECK(false, "NYI"); - }; + } virtual SymNode ceil() { TORCH_CHECK(false, "NYI"); - }; + } virtual SymNode floor() { TORCH_CHECK(false, "NYI"); - }; + } virtual SymNode neg() { TORCH_CHECK(false, "NYI"); }; @@ -188,19 +188,19 @@ class C10_API SymNodeImpl : public c10::intrusive_ptr_target { virtual std::string str() { TORCH_CHECK(false, "NYI"); }; - virtual c10::optional nested_int() { + virtual std::optional nested_int() { return c10::nullopt; } - virtual c10::optional nested_int_coeff() { + virtual std::optional nested_int_coeff() { return c10::nullopt; } - virtual c10::optional constant_int() { + virtual std::optional constant_int() { return c10::nullopt; } - virtual c10::optional constant_bool() { + virtual std::optional constant_bool() { return c10::nullopt; } - virtual c10::optional maybe_as_int() { + virtual std::optional maybe_as_int() { return c10::nullopt; } virtual bool is_constant() { diff --git a/c10/core/SymbolicShapeMeta.cpp b/c10/core/SymbolicShapeMeta.cpp index 04b2f8da832f4..62b03d36ec71c 100644 --- a/c10/core/SymbolicShapeMeta.cpp +++ b/c10/core/SymbolicShapeMeta.cpp @@ -28,7 +28,7 @@ SymbolicShapeMeta::SymbolicShapeMeta(const SymbolicShapeMeta& other) } // base, sizes, strides -static c10::optional< +static std::optional< std::tuple, std::vector>> normalize_sym_sizes_strides(SymIntArrayRef sizes, SymIntArrayRef strides) { // Look for a SymNode to dispatch on diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp index 320dc7796877e..47f83c78e5789 100644 --- a/c10/core/TensorImpl.cpp +++ b/c10/core/TensorImpl.cpp @@ -127,7 +127,7 @@ TensorImpl::TensorImpl( TensorImpl::TensorImpl( DispatchKeySet key_set, const caffe2::TypeMeta data_type, - c10::optional device_opt) + std::optional device_opt) : TensorImpl({}, key_set, data_type, device_opt) {} // 
NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) @@ -135,7 +135,7 @@ TensorImpl::TensorImpl( Storage&& storage, DispatchKeySet key_set, const caffe2::TypeMeta data_type, - c10::optional device_opt) + std::optional device_opt) : storage_(std::move(storage)), numel_(0), @@ -846,7 +846,7 @@ static void clone_symvec(SymIntArrayRef src, SymDimVector& dst) { void TensorImpl::set_sizes_and_strides( c10::SymIntArrayRef sizes, c10::SymIntArrayRef strides, - c10::optional storage_offset) { + std::optional storage_offset) { auto int_sizes = asIntArrayRefSlowOpt(sizes); auto int_strides = asIntArrayRefSlowOpt(strides); if (int_sizes && int_strides && diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index 3a74c8936297e..e49a66c916ffb 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -233,8 +233,8 @@ struct C10_API ExtraMeta { std::unique_ptr symbolic_shape_meta_ = nullptr; std::unique_ptr named_tensor_meta_ = nullptr; intrusive_ptr backend_meta_ = nullptr; - c10::optional custom_data_ptr_error_msg_ = c10::nullopt; - c10::optional custom_storage_error_msg_ = c10::nullopt; + std::optional custom_data_ptr_error_msg_ = c10::nullopt; + std::optional custom_storage_error_msg_ = c10::nullopt; ExtraMeta() = default; ExtraMeta(const ExtraMeta& other) { @@ -260,8 +260,8 @@ struct C10_API ExtraMeta { std::unique_ptr symbolic_shape_meta, std::unique_ptr named_tensor_meta, intrusive_ptr backend_meta, - c10::optional custom_data_ptr_error_msg = c10::nullopt, - c10::optional custom_storage_access_error_msg = c10::nullopt) + std::optional custom_data_ptr_error_msg = c10::nullopt, + std::optional custom_storage_access_error_msg = c10::nullopt) : symbolic_shape_meta_(std::move(symbolic_shape_meta)), named_tensor_meta_(std::move(named_tensor_meta)), backend_meta_(std::move(backend_meta)), @@ -528,7 +528,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { TensorImpl( DispatchKeySet, const caffe2::TypeMeta data_type, - c10::optional device_opt); + std::optional device_opt); // Legacy constructors so I don't have to go update call sites. 
// TODO: When Variable is added, delete these constructors @@ -543,7 +543,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { TensorImpl( DispatchKey dispatch_key, const caffe2::TypeMeta data_type, - c10::optional device_opt) + std::optional device_opt) : TensorImpl(DispatchKeySet(dispatch_key), data_type, device_opt) {} private: @@ -555,7 +555,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { Storage&& storage, DispatchKeySet, const caffe2::TypeMeta data_type, - c10::optional); + std::optional); public: TensorImpl(const TensorImpl&) = delete; @@ -1253,7 +1253,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { protected: c10::Device device_default() const { TORCH_CHECK(device_opt_.has_value(), "tensor does not have a device"); - // See NOTE [c10::optional operator usage in CUDA] + // See NOTE [std::optional operator usage in CUDA] return *device_opt_; } @@ -1687,7 +1687,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { } void release_storage_and_set_meta_custom_data_ptr_error_msg_( - c10::optional s) { + std::optional s) { storage_ = {}; set_storage_access_should_throw(); get_extra_meta().custom_data_ptr_error_msg_ = s; @@ -1737,7 +1737,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { void set_sizes_and_strides( c10::SymIntArrayRef sizes, c10::SymIntArrayRef strides, - c10::optional storage_offset = c10::nullopt); + std::optional storage_offset = c10::nullopt); // This is renamed to avoid breaking overload BC void generic_set_sizes_contiguous(c10::SymIntArrayRef sizes); void generic_set_sizes_contiguous(c10::IntArrayRef sizes) { @@ -1834,7 +1834,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { void set_sizes_and_strides( IntArrayRef new_size, IntArrayRef new_stride, - c10::optional storage_offset = c10::nullopt) { + std::optional storage_offset = c10::nullopt) { TORCH_CHECK( allow_tensor_metadata_change(), "set_sizes_and_strides ", @@ -2129,10 +2129,10 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { } private: - // See NOTE [c10::optional operator usage in CUDA] + // See NOTE [std::optional operator usage in CUDA] // We probably don't want to expose this publicly until // the note is addressed. - c10::optional device_opt() const { + std::optional device_opt() const { return device_opt_; } @@ -2146,7 +2146,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { TORCH_CHECK( device_opt_.has_value(), "device_type cannot be run on undefined Tensor"); - // See NOTE [c10::optional operator usage in CUDA] + // See NOTE [std::optional operator usage in CUDA] return (*device_opt_).type(); } @@ -2875,7 +2875,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // agree with the type meta in storage caffe2::TypeMeta data_type_; - // NOTE [c10::optional operator usage in CUDA] + // NOTE [std::optional operator usage in CUDA] // Our optional definition doesn't compile in .cu file if `value()` or // `operator->` are used. Instead, we always use `operator*`. // See https://github.com/pytorch/pytorch/issues/18496 for more info. @@ -2887,7 +2887,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // // INVARIANT: device_opt_ is only nullopt for undefined tensors // (which do not have a device.) 
- c10::optional device_opt_; + std::optional device_opt_; // default member initializers for bit-fields only available with -std=c++2a // or -std=gnu++2a diff --git a/c10/core/TensorOptions.h b/c10/core/TensorOptions.h index 765f474702ef7..d99005d3d28f8 100644 --- a/c10/core/TensorOptions.h +++ b/c10/core/TensorOptions.h @@ -24,28 +24,28 @@ namespace c10 { DispatchKey computeDispatchKey( - c10::optional dtype, - c10::optional layout, - c10::optional device); + std::optional dtype, + std::optional layout, + std::optional device); -inline ScalarType dtype_or_default(c10::optional dtype) { +inline ScalarType dtype_or_default(std::optional dtype) { return value_or_else(dtype, [] { return get_default_dtype_as_scalartype(); }); } inline caffe2::TypeMeta dtype_or_default( - c10::optional dtype) { + std::optional dtype) { return value_or_else(dtype, [] { return get_default_dtype(); }); } -inline Layout layout_or_default(c10::optional layout) { +inline Layout layout_or_default(std::optional layout) { return layout.value_or(kStrided); } -inline Device device_or_default(c10::optional device) { +inline Device device_or_default(std::optional device) { return value_or_else(device, [] { return Device(kCPU); }); } -inline bool pinned_memory_or_default(c10::optional pinned_memory) { +inline bool pinned_memory_or_default(std::optional pinned_memory) { return pinned_memory.value_or(false); } @@ -193,19 +193,19 @@ struct C10_API TensorOptions { /// Return a copy of `TensorOptions` with `device` set to the given one, or /// cleared if `device` is `nullopt`. C10_NODISCARD TensorOptions - device(c10::optional device) const noexcept { + device(std::optional device) const noexcept { TensorOptions r = *this; r.set_device(device); return r; } /// Return a copy of `TensorOptions` with `device` set to the given one. - /// (This overload ensures that variadic template c10::optional constructor + /// (This overload ensures that variadic template std::optional constructor /// for Device work correctly.) template C10_NODISCARD TensorOptions device(Args&&... args) const noexcept { return device( - c10::optional(std::in_place, std::forward(args)...)); + std::optional(std::in_place, std::forward(args)...)); } /// Return a copy of `TensorOptions`, but with device set to CUDA, and the @@ -220,7 +220,7 @@ struct C10_API TensorOptions { /// Return a copy of `TensorOptions` with `dtype` set to the given one. C10_NODISCARD TensorOptions - dtype(c10::optional dtype) const noexcept { + dtype(std::optional dtype) const noexcept { TensorOptions r = *this; r.set_dtype(dtype); return r; @@ -228,7 +228,7 @@ struct C10_API TensorOptions { // legacy function to support ScalarType C10_NODISCARD TensorOptions - dtype(c10::optional dtype) const noexcept { + dtype(std::optional dtype) const noexcept { TensorOptions r = *this; r.set_dtype(dtype); return r; @@ -244,7 +244,7 @@ struct C10_API TensorOptions { /// Sets the layout of the `TensorOptions`. C10_NODISCARD TensorOptions - layout(c10::optional layout) const noexcept { + layout(std::optional layout) const noexcept { TensorOptions r = *this; r.set_layout(layout); return r; @@ -252,7 +252,7 @@ struct C10_API TensorOptions { /// Sets the `requires_grad` property of the `TensorOptions`. 
C10_NODISCARD TensorOptions - requires_grad(c10::optional requires_grad) const noexcept { + requires_grad(std::optional requires_grad) const noexcept { TensorOptions r = *this; r.set_requires_grad(requires_grad); return r; @@ -260,7 +260,7 @@ struct C10_API TensorOptions { /// Sets the `pinned_memory` property on the `TensorOptions`. C10_NODISCARD TensorOptions - pinned_memory(c10::optional pinned_memory) const noexcept { + pinned_memory(std::optional pinned_memory) const noexcept { TensorOptions r = *this; r.set_pinned_memory(pinned_memory); return r; @@ -268,7 +268,7 @@ struct C10_API TensorOptions { /// Sets the `memory_format` property on `TensorOptions`. C10_NODISCARD TensorOptions - memory_format(c10::optional memory_format) const noexcept { + memory_format(std::optional memory_format) const noexcept { TensorOptions r = *this; r.set_memory_format(memory_format); return r; @@ -286,7 +286,7 @@ struct C10_API TensorOptions { /// Returns the device of the `TensorOptions`, or `c10::nullopt` if /// device is not specified. - c10::optional device_opt() const noexcept { + std::optional device_opt() const noexcept { return has_device_ ? c10::make_optional(device_) : c10::nullopt; } @@ -307,7 +307,7 @@ struct C10_API TensorOptions { /// Returns the dtype of the `TensorOptions`, or `c10::nullopt` if /// device is not specified. - c10::optional dtype_opt() const noexcept { + std::optional dtype_opt() const noexcept { return has_dtype_ ? c10::make_optional(dtype_) : c10::nullopt; } @@ -323,7 +323,7 @@ struct C10_API TensorOptions { /// Returns the layout of the `TensorOptions`, or `c10::nullopt` if /// layout is not specified. - c10::optional layout_opt() const noexcept { + std::optional layout_opt() const noexcept { return has_layout_ ? c10::make_optional(layout_) : c10::nullopt; } @@ -339,7 +339,7 @@ struct C10_API TensorOptions { /// Returns the `requires_grad` property of the `TensorOptions`, or /// `c10::nullopt` if `requires_grad` is not specified. - c10::optional requires_grad_opt() const noexcept { + std::optional requires_grad_opt() const noexcept { return has_requires_grad_ ? c10::make_optional(requires_grad_) : c10::nullopt; } @@ -379,7 +379,7 @@ struct C10_API TensorOptions { /// Returns the `pinned_memory` property of the `TensorOptions`, or /// `c10::nullopt` if `pinned_memory` is not specified. - c10::optional pinned_memory_opt() const noexcept { + std::optional pinned_memory_opt() const noexcept { return has_pinned_memory_ ? c10::make_optional(pinned_memory_) : c10::nullopt; } @@ -394,7 +394,7 @@ struct C10_API TensorOptions { /// Returns the `memory_layout` property of `TensorOptions, or /// `c10::nullopt` if `memory_format` is not specified. - c10::optional memory_format_opt() const noexcept { + std::optional memory_format_opt() const noexcept { return has_memory_format_ ? c10::make_optional(memory_format_) : c10::nullopt; } @@ -435,7 +435,7 @@ struct C10_API TensorOptions { // TODO remove after TensorOptions rationalization TensorOptions merge_memory_format( - c10::optional optional_memory_format) const noexcept { + std::optional optional_memory_format) const noexcept { TensorOptions merged = *this; if (optional_memory_format.has_value()) { merged.set_memory_format(*optional_memory_format); @@ -466,7 +466,7 @@ struct C10_API TensorOptions { // on temporaries.) /// Mutably set the device of `TensorOptions`. 
- void set_device(c10::optional device) & noexcept { + void set_device(std::optional device) & noexcept { if (device) { device_ = *device; has_device_ = true; @@ -476,7 +476,7 @@ struct C10_API TensorOptions { } /// Mutably set the dtype of `TensorOptions`. - void set_dtype(c10::optional dtype) & noexcept { + void set_dtype(std::optional dtype) & noexcept { if (dtype) { dtype_ = *dtype; has_dtype_ = true; @@ -486,7 +486,7 @@ struct C10_API TensorOptions { } // legacy function to support ScalarType - void set_dtype(c10::optional dtype) & noexcept { + void set_dtype(std::optional dtype) & noexcept { if (dtype) { dtype_ = scalarTypeToTypeMeta(*dtype); has_dtype_ = true; @@ -496,7 +496,7 @@ struct C10_API TensorOptions { } /// Mutably set the layout of `TensorOptions`. - void set_layout(c10::optional layout) & noexcept { + void set_layout(std::optional layout) & noexcept { if (layout) { layout_ = *layout; has_layout_ = true; @@ -506,7 +506,7 @@ struct C10_API TensorOptions { } /// Mutably set the `requires_grad` property of `TensorOptions`. - void set_requires_grad(c10::optional requires_grad) & noexcept { + void set_requires_grad(std::optional requires_grad) & noexcept { if (requires_grad) { requires_grad_ = *requires_grad; has_requires_grad_ = true; @@ -516,7 +516,7 @@ struct C10_API TensorOptions { } /// Mutably set the `pinned_memory` property of `TensorOptions`. - void set_pinned_memory(c10::optional pinned_memory) & noexcept { + void set_pinned_memory(std::optional pinned_memory) & noexcept { if (pinned_memory) { pinned_memory_ = *pinned_memory; has_pinned_memory_ = true; @@ -526,7 +526,7 @@ struct C10_API TensorOptions { } /// Mutably set the `memory_Format` property of `TensorOptions`. - void set_memory_format(c10::optional memory_format) & noexcept { + void set_memory_format(std::optional memory_format) & noexcept { if (memory_format) { memory_format_ = *memory_format; has_memory_format_ = true; @@ -544,7 +544,7 @@ struct C10_API TensorOptions { // // TODO: MemoryFormat is not implemented in this way - // NB: We didn't use c10::optional here, because then we can't pack + // NB: We didn't use std::optional here, because then we can't pack // the has_***_ boolean fields. Device device_ = at::kCPU; // 16-bit @@ -632,9 +632,9 @@ inline std::string toString(const TensorOptions& options) { // This is intended to be a centralized location by which we can determine // what an appropriate DispatchKey for a tensor is. inline DispatchKey computeDispatchKey( - c10::optional dtype, - c10::optional layout, - c10::optional device) { + std::optional dtype, + std::optional layout, + std::optional device) { const auto layout_ = layout_or_default(layout); const auto device_ = device_or_default(device); switch (layout_) { diff --git a/c10/core/impl/PyObjectSlot.h b/c10/core/impl/PyObjectSlot.h index b3a4b85f05e8e..518b0e63e4921 100644 --- a/c10/core/impl/PyObjectSlot.h +++ b/c10/core/impl/PyObjectSlot.h @@ -93,8 +93,8 @@ struct C10_API PyObjectSlot { // be properly treated as a nonhermetic PyObject. 
// // NB: this lives in header so that we can avoid actually creating the - // c10::optional - c10::optional check_pyobj( + // std::optional + std::optional check_pyobj( PyInterpreter* self_interpreter, bool ignore_hermetic_tls = false) const { // Note [Memory ordering on Python interpreter tag] diff --git a/c10/core/impl/TorchDispatchModeTLS.cpp b/c10/core/impl/TorchDispatchModeTLS.cpp index e558a70522aca..f1847cb005b4c 100644 --- a/c10/core/impl/TorchDispatchModeTLS.cpp +++ b/c10/core/impl/TorchDispatchModeTLS.cpp @@ -121,7 +121,7 @@ int64_t TorchDispatchModeTLS::stack_len() { return stack_len + infra_modes_len; } -const c10::optional> +const std::optional> TorchDispatchModeTLS::get_mode(TorchDispatchModeKey mode_key) { return torchDispatchModeState.infra_modes_[static_cast(mode_key)]; } @@ -145,7 +145,7 @@ void TorchDispatchModeTLS::set_mode( torchDispatchModeState.infra_modes_[static_cast(mode_key)] = mode; } -const c10::optional> +const std::optional> TorchDispatchModeTLS::unset_mode(TorchDispatchModeKey mode_key) { auto out = torchDispatchModeState.infra_modes_[static_cast(mode_key)]; torchDispatchModeState.infra_modes_[static_cast(mode_key)] = diff --git a/c10/core/impl/TorchDispatchModeTLS.h b/c10/core/impl/TorchDispatchModeTLS.h index d9ac8d8449b49..7179d52c35162 100644 --- a/c10/core/impl/TorchDispatchModeTLS.h +++ b/c10/core/impl/TorchDispatchModeTLS.h @@ -35,9 +35,9 @@ struct C10_API TorchDispatchModeTLS { int64_t idx); static int64_t stack_len(); - static const c10::optional> + static const std::optional> get_mode(TorchDispatchModeKey mode_key); - static const c10::optional> + static const std::optional> unset_mode(TorchDispatchModeKey mode_key); static void set_mode( const std::shared_ptr& mode, @@ -55,7 +55,7 @@ struct C10_API TorchDispatchModeTLS { // However, we only allow a single FakeTensorMode onto the stack at a time // (Pushing additional FakeTensorModes onto the stack is a no-op) std::array< - c10::optional>, + std::optional>, static_cast(TorchDispatchModeKey::NUM_MODE_KEYS)> infra_modes_; }; diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index 8af2c41dfab7e..2479f96ab30b5 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -550,7 +550,7 @@ struct ExpandableSegment { CUdeviceptr ptr_{}; size_t max_handles_{0}; size_t segment_size_; - std::vector> handles_; + std::vector> handles_; // devices on which this memory should be mapped in addition // to the device where the physical memory lives (device_). std::vector peers_; @@ -1107,6 +1107,26 @@ class DeviceCachingAllocator { .current; auto observers_local = oom_observers_; + size_t allocated_in_private_pools = 0; + auto get_size_block = [](const BlockPool& pool) { + size_t res = 0; + for (const auto& block : pool.blocks) { + res += block->size; + } + return res; + }; + for (const auto& p : graph_pools) { + allocated_in_private_pools += get_size_block(p.second->large_blocks); + allocated_in_private_pools += get_size_block(p.second->small_blocks); + } + + std::string private_pool_msg; + + if (allocated_in_private_pools > 0) { + private_pool_msg = "with " + format_size(allocated_in_private_pools) + + " allocated in private pools (e.g., CUDA Graphs), "; + } + // Make sure we do not have the device lock before calling our // observers which might need hold the GIL // It is safe to release at this point because will no longer @@ -1153,9 +1173,12 @@ class DeviceCachingAllocator { " is free. 
", proc_info, "Of the allocated memory ", - format_size(allocated_bytes), - " is allocated by PyTorch, and ", - format_size(reserved_bytes - allocated_bytes), + format_size(allocated_bytes + allocated_in_private_pools), + " is allocated by PyTorch, ", + private_pool_msg, + "and ", + format_size( + reserved_bytes - allocated_bytes - allocated_in_private_pools), " is reserved by PyTorch but unallocated.", " If reserved but unallocated memory is large try setting", " PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid" diff --git a/c10/cuda/CUDAFunctions.cpp b/c10/cuda/CUDAFunctions.cpp index 652f222385465..2b53eb4d7c7cb 100644 --- a/c10/cuda/CUDAFunctions.cpp +++ b/c10/cuda/CUDAFunctions.cpp @@ -151,7 +151,7 @@ void warn_or_error_on_sync() { } } -c10::optional getDeviceIndexWithPrimaryContext() { +std::optional getDeviceIndexWithPrimaryContext() { // check current device first auto current_device_index = current_device(); if (current_device_index >= 0) { diff --git a/c10/cuda/CUDAFunctions.h b/c10/cuda/CUDAFunctions.h index 72fdfc6fd692f..192fafbad10f4 100644 --- a/c10/cuda/CUDAFunctions.h +++ b/c10/cuda/CUDAFunctions.h @@ -111,6 +111,6 @@ C10_CUDA_API void __inline__ stream_synchronize(cudaStream_t stream) { } C10_CUDA_API bool hasPrimaryContext(DeviceIndex device_index); -C10_CUDA_API c10::optional getDeviceIndexWithPrimaryContext(); +C10_CUDA_API std::optional getDeviceIndexWithPrimaryContext(); } // namespace c10::cuda diff --git a/c10/cuda/impl/CUDAGuardImpl.h b/c10/cuda/impl/CUDAGuardImpl.h index 113f896c6fa29..ec50c8152b33e 100644 --- a/c10/cuda/impl/CUDAGuardImpl.h +++ b/c10/cuda/impl/CUDAGuardImpl.h @@ -40,7 +40,7 @@ struct CUDAGuardImpl final : public c10::impl::DeviceGuardImplInterface { C10_CUDA_CHECK(c10::cuda::GetDevice(&device)); return Device(DeviceType::CUDA, device); } - c10::optional uncheckedGetDevice() const noexcept { + std::optional uncheckedGetDevice() const noexcept { DeviceIndex device{-1}; const auto err = C10_CUDA_ERROR_HANDLED(c10::cuda::GetDevice(&device)); C10_CUDA_CHECK_WARN(err); diff --git a/c10/test/util/optional_test.cpp b/c10/test/util/optional_test.cpp index f17cc61c51b1c..f95fc864d812c 100644 --- a/c10/test/util/optional_test.cpp +++ b/c10/test/util/optional_test.cpp @@ -22,7 +22,7 @@ using testing::Not; template class OptionalTest : public ::testing::Test { public: - using optional = c10::optional; + using optional = std::optional; }; template @@ -96,10 +96,10 @@ TYPED_TEST(OptionalTest, Initialized) { } } -class SelfCompareTest : public testing::TestWithParam> {}; +class SelfCompareTest : public testing::TestWithParam> {}; TEST_P(SelfCompareTest, SelfCompare) { - c10::optional x = GetParam(); + std::optional x = GetParam(); EXPECT_THAT(x, Eq(x)); EXPECT_THAT(x, Le(x)); EXPECT_THAT(x, Ge(x)); @@ -118,7 +118,7 @@ INSTANTIATE_TEST_SUITE_P( testing::Values(c10::make_optional(2))); TEST(OptionalTest, Nullopt) { - c10::optional x = 2; + std::optional x = 2; EXPECT_THAT(c10::nullopt, Not(Eq(x))); EXPECT_THAT(x, Not(Eq(c10::nullopt))); @@ -142,17 +142,17 @@ TEST(OptionalTest, Nullopt) { // Ensure comparisons work... 
using CmpTestTypes = testing::Types< // between two optionals - std::pair, c10::optional>, + std::pair, c10::optional>, // between an optional and a value - std::pair, int>, + std::pair, int>, // between a value and an optional - std::pair>, + std::pair>, // between an optional and a differently typed value - std::pair, long>, + std::pair, long>, // between a differently typed value and an optional - std::pair>>; + std::pair>>; template class CmpTest : public testing::Test {}; TYPED_TEST_SUITE(CmpTest, CmpTestTypes); diff --git a/c10/util/ArrayRef.h b/c10/util/ArrayRef.h index 59ea43f8d959c..2a56e60832993 100644 --- a/c10/util/ArrayRef.h +++ b/c10/util/ArrayRef.h @@ -61,7 +61,7 @@ class ArrayRef final { void debugCheckNullptrInvariant() { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( Data != nullptr || Length == 0, - "created ArrayRef with nullptr and non-zero length! c10::optional relies on this being illegal"); + "created ArrayRef with nullptr and non-zero length! std::optional relies on this being illegal"); } public: diff --git a/c10/util/BFloat16.h b/c10/util/BFloat16.h index 95bc5f91838b6..badde3681f341 100644 --- a/c10/util/BFloat16.h +++ b/c10/util/BFloat16.h @@ -99,7 +99,7 @@ struct alignas(2) BFloat16 { } constexpr C10_HOST_DEVICE BFloat16(unsigned short bits, from_bits_t) - : x(bits){}; + : x(bits) {} inline C10_HOST_DEVICE BFloat16(float value); inline C10_HOST_DEVICE operator float() const; diff --git a/c10/util/Backtrace.cpp b/c10/util/Backtrace.cpp index bbad1c879b7a4..7d0fedbb335a2 100644 --- a/c10/util/Backtrace.cpp +++ b/c10/util/Backtrace.cpp @@ -31,7 +31,30 @@ namespace c10 { -#if SUPPORTS_BACKTRACE && defined(C10_ANDROID) +namespace { + +#ifdef FBCODE_CAFFE2 + +// For some reason, the stacktrace implementation in fbcode is better than ours, +// see https://github.com/pytorch/pytorch/issues/56399 When it's available, just +// use that. 
+class GetBacktraceImpl { + public: + C10_ALWAYS_INLINE GetBacktraceImpl( + size_t frames_to_skip, + size_t /* maximum_number_of_frames */, + bool /* skip_python_frames */) + : st_(/*skipFrames=*/frames_to_skip) {} + + std::string symbolize() const { + return st_.toString(); + } + + private: + facebook::process::StackTrace st_; +}; + +#elif SUPPORTS_BACKTRACE && defined(C10_ANDROID) struct AndroidBacktraceState { std::vector buffer; @@ -48,44 +71,49 @@ _Unwind_Reason_Code android_unwind_callback( return _URC_NO_REASON; } -void dump_stack( - std::ostream& os, - size_t frames_to_skip, - size_t maximum_number_of_frames) { - AndroidBacktraceState state; - - _Unwind_Backtrace(android_unwind_callback, &state); +class GetBacktraceImpl { + public: + C10_ALWAYS_INLINE GetBacktraceImpl( + size_t /* frames_to_skip */, + size_t /* maximum_number_of_frames */, + bool /* skip_python_frames */) { + _Unwind_Backtrace(android_unwind_callback, &state_); + } - int idx = 0; - char* demangled = nullptr; - size_t length = 0; + std::string symbolize() const { + std::ostringstream os; + int idx = 0; + char* demangled = nullptr; + size_t length = 0; - for (const void* addr : state.buffer) { - const char* symbol = ""; + for (const void* addr : state_.buffer) { + const char* symbol = ""; - Dl_info info; - if (dladdr(addr, &info) && info.dli_sname) { - symbol = info.dli_sname; - } + Dl_info info; + if (dladdr(addr, &info) && info.dli_sname) { + symbol = info.dli_sname; + } - int status = 0; - demangled = __cxxabiv1::__cxa_demangle( - /*mangled_name*/ symbol, - /*output_buffer*/ demangled, - /*length*/ &length, - /*status*/ &status); + int status = 0; + demangled = __cxxabiv1::__cxa_demangle( + /*mangled_name*/ symbol, + /*output_buffer*/ demangled, + /*length*/ &length, + /*status*/ &status); - os << " frame #" << idx++ << "\t" - << ((demangled != NULL && status == 0) ? demangled : symbol) << "[" - << addr << "]\t" << std::endl; + os << " frame #" << idx++ << "\t" + << ((demangled != NULL && status == 0) ? demangled : symbol) << "[" + << addr << "]\t" << std::endl; + } + free(demangled); + return os.str(); } - free(demangled); -} -#endif /* SUPPORTS_BACKTRACE && defined(C10_ANDROID) */ + private: + AndroidBacktraceState state_; +}; -#if SUPPORTS_BACKTRACE -namespace { +#elif SUPPORTS_BACKTRACE // !defined(C10_ANDROID) struct FrameInformation { /// If available, the demangled name of the function at this frame, else @@ -101,13 +129,12 @@ struct FrameInformation { std::string object_file; }; -#ifndef C10_ANDROID bool is_python_frame(const FrameInformation& frame) { return frame.object_file == "python" || frame.object_file == "python3" || (frame.object_file.find("libpython") != std::string::npos); } -c10::optional parse_frame_information( +std::optional parse_frame_information( const std::string& frame_string) { FrameInformation frame; @@ -173,10 +200,89 @@ c10::optional parse_frame_information( frame.function_name = demangle(mangled_function_name.c_str()); return frame; } -#endif /* !defined(C10_ANDROID) */ -} // anonymous namespace -#elif defined(_MSC_VER) -namespace { + +class GetBacktraceImpl { + public: + C10_ALWAYS_INLINE GetBacktraceImpl( + size_t frames_to_skip, + size_t maximum_number_of_frames, + bool skip_python_frames) + : skip_python_frames_(skip_python_frames), + callstack_(frames_to_skip + maximum_number_of_frames, nullptr) { + // We always skip this frame (backtrace). + frames_to_skip += 1; + + // backtrace() gives us a list of return addresses in the current call + // stack. 
NOTE: As per man (3) backtrace it can never fail + // (http://man7.org/linux/man-pages/man3/backtrace.3.html). + auto number_of_frames = static_cast( + ::backtrace(callstack_.data(), static_cast(callstack_.size()))); + + // Skip as many frames as requested. + frames_to_skip = std::min(frames_to_skip, number_of_frames); + number_of_frames -= frames_to_skip; + callstack_.erase( + callstack_.begin(), + callstack_.begin() + static_cast(frames_to_skip)); + callstack_.resize(number_of_frames); + } + + std::string symbolize() const { + // `backtrace_symbols` takes the return addresses obtained from + // `backtrace()` and fetches string representations of each stack. + // Unfortunately it doesn't return a struct of individual pieces of + // information but a concatenated string, so we'll have to parse the string + // after. NOTE: The array returned by `backtrace_symbols` is malloc'd and + // must be manually freed, but not the strings inside the array. + std::unique_ptr> raw_symbols( + ::backtrace_symbols( + callstack_.data(), static_cast(callstack_.size())), + /*deleter=*/free); + const std::vector symbols( + raw_symbols.get(), raw_symbols.get() + callstack_.size()); + + // The backtrace string goes into here. + std::ostringstream stream; + + // Toggles to true after the first skipped python frame. + bool has_skipped_python_frames = false; + + for (const auto frame_number : c10::irange(callstack_.size())) { + const auto frame = parse_frame_information(symbols[frame_number]); + + if (skip_python_frames_ && frame && is_python_frame(*frame)) { + if (!has_skipped_python_frames) { + stream << "\n"; + has_skipped_python_frames = true; + } + continue; + } + + // frame #: + stream << "frame #" << frame_number << ": "; + + if (frame) { + // + ( in ) + stream << frame->function_name << " + " << frame->offset_into_function + << " (" << callstack_[frame_number] << " in " + << frame->object_file << ")\n"; + } else { + // In the edge-case where we couldn't parse the frame string, we can + // just use it directly (it may have a different format). + stream << symbols[frame_number] << "\n"; + } + } + + return stream.str(); + } + + private: + const bool skip_python_frames_; + std::vector callstack_; +}; + +#elif defined(_MSC_VER) // !SUPPORTS_BACKTRACE + const int max_name_len = 256; std::string get_module_base_name(void* addr) { HMODULE h_module; @@ -225,180 +331,144 @@ class SymbolHelper { SymbolHelper(SymbolHelper const&) = delete; void operator=(SymbolHelper const&) = delete; }; -} // anonymous namespace -#endif // SUPPORTS_BACKTRACE -std::string get_backtrace( - size_t frames_to_skip, - size_t maximum_number_of_frames, - bool skip_python_frames) { -#ifdef FBCODE_CAFFE2 - // For some reason, the stacktrace implementation in fbcode is - // better than ours, see https://github.com/pytorch/pytorch/issues/56399 - // When it's available, just use that. - facebook::process::StackTrace st; - return st.toString(); - -#elif SUPPORTS_BACKTRACE && !defined(C10_ANDROID) - - // We always skip this frame (backtrace). - frames_to_skip += 1; - - std::vector callstack( - frames_to_skip + maximum_number_of_frames, nullptr); - // backtrace() gives us a list of return addresses in the current call stack. - // NOTE: As per man (3) backtrace it can never fail - // (http://man7.org/linux/man-pages/man3/backtrace.3.html). - auto number_of_frames = - ::backtrace(callstack.data(), static_cast(callstack.size())); - - // Skip as many frames as requested. 
This is not efficient, but the sizes here - // are small and it makes the code nicer and safer. - for (; frames_to_skip > 0 && number_of_frames > 0; - --frames_to_skip, --number_of_frames) { - callstack.erase(callstack.begin()); +// This backtrace retrieval is implemented on Windows via the Windows API using +// `CaptureStackBackTrace`, `SymFromAddr` and `SymGetLineFromAddr64`. +// https://stackoverflow.com/questions/5693192/win32-backtrace-from-c-code +// https://stackoverflow.com/questions/26398064/counterpart-to-glibcs-backtrace-and-backtrace-symbols-on-windows +// https://docs.microsoft.com/en-us/windows/win32/debug/capturestackbacktrace +// https://docs.microsoft.com/en-us/windows/win32/api/dbghelp/nf-dbghelp-symfromaddr +// https://docs.microsoft.com/en-us/windows/win32/api/dbghelp/nf-dbghelp-symgetlinefromaddr64 +// TODO: Support skipping python frames +class GetBacktraceImpl { + public: + C10_ALWAYS_INLINE GetBacktraceImpl( + size_t frames_to_skip, + size_t maximum_number_of_frames, + bool /* skip_python_frames */) + : back_trace_(new void*[maximum_number_of_frames]) { + // We always skip this frame (backtrace). + frames_to_skip += 1; + + // Get the frames + n_frame_ = CaptureStackBackTrace( + static_cast(frames_to_skip), + static_cast(maximum_number_of_frames), + back_trace_.get(), + NULL); } - // `number_of_frames` is strictly less than the current capacity of - // `callstack`, so this is just a pointer subtraction and makes the subsequent - // code safer. - callstack.resize(static_cast(number_of_frames)); - - // `backtrace_symbols` takes the return addresses obtained from `backtrace()` - // and fetches string representations of each stack. Unfortunately it doesn't - // return a struct of individual pieces of information but a concatenated - // string, so we'll have to parse the string after. NOTE: The array returned - // by `backtrace_symbols` is malloc'd and must be manually freed, but not the - // strings inside the array. - std::unique_ptr> raw_symbols( - ::backtrace_symbols(callstack.data(), static_cast(callstack.size())), - /*deleter=*/free); - const std::vector symbols( - raw_symbols.get(), raw_symbols.get() + callstack.size()); - - // The backtrace string goes into here. - std::ostringstream stream; - - // Toggles to true after the first skipped python frame. - bool has_skipped_python_frames = false; - - for (const auto frame_number : c10::irange(callstack.size())) { - const auto frame = parse_frame_information(symbols[frame_number]); - - if (skip_python_frames && frame && is_python_frame(*frame)) { - if (!has_skipped_python_frames) { - stream << "\n"; - has_skipped_python_frames = true; + std::string symbolize() const { + DWORD64 displacement; + DWORD disp; + std::unique_ptr line; + + char buffer[sizeof(SYMBOL_INFO) + MAX_SYM_NAME * sizeof(TCHAR)]; + PSYMBOL_INFO p_symbol = (PSYMBOL_INFO)buffer; + + bool with_symbol = false; + bool with_line = false; + + // The backtrace string goes into here. 
+ std::ostringstream stream; + + // Initialize symbols if necessary + SymbolHelper& sh = SymbolHelper::getInstance(); + + for (USHORT i_frame = 0; i_frame < n_frame_; ++i_frame) { + // Get the address and the name of the symbol + if (sh.inited) { + p_symbol->SizeOfStruct = sizeof(SYMBOL_INFO); + p_symbol->MaxNameLen = MAX_SYM_NAME; + with_symbol = SymFromAddr( + sh.process, (ULONG64)back_trace_[i_frame], &displacement, p_symbol); } - continue; - } - // frame #: - stream << "frame #" << frame_number << ": "; - - if (frame) { - // + ( in ) - stream << frame->function_name << " + " << frame->offset_into_function - << " (" << callstack[frame_number] << " in " << frame->object_file - << ")\n"; - } else { - // In the edge-case where we couldn't parse the frame string, we can - // just use it directly (it may have a different format). - stream << symbols[frame_number] << "\n"; + // Get the line number and the module + if (sh.inited) { + line.reset(new IMAGEHLP_LINE64()); + line->SizeOfStruct = sizeof(IMAGEHLP_LINE64); + with_line = SymGetLineFromAddr64( + sh.process, (ULONG64)back_trace_[i_frame], &disp, line.get()); + } + + // Get the module basename + std::string module = get_module_base_name(back_trace_[i_frame]); + + // The pattern on Windows is + // ` + // ! [ @ ] + stream << std::setfill('0') << std::setw(16) << std::uppercase << std::hex + << back_trace_[i_frame] << std::dec; + if (with_symbol) { + stream << std::setfill('0') << std::setw(16) << std::uppercase + << std::hex << p_symbol->Address << std::dec << " " << module + << "!" << p_symbol->Name; + } else { + stream << " " << module << "!"; + } + stream << " ["; + if (with_line) { + stream << line->FileName << " @ " << line->LineNumber; + } else { + stream << " @ "; + } + stream << "]" << std::endl; } + + return stream.str(); } - return stream.str(); + private: + std::unique_ptr back_trace_; + USHORT n_frame_; +}; -#elif SUPPORTS_BACKTRACE && defined(C10_ANDROID) +#else - std::ostringstream oss; - dump_stack(oss, frames_to_skip, maximum_number_of_frames); - return oss.str().c_str(); +class GetBacktraceImpl { + public: + C10_ALWAYS_INLINE GetBacktraceImpl( + size_t /* frames_to_skip */, + size_t /* maximum_number_of_frames */, + bool /* skip_python_frames */) {} -#elif defined(_MSC_VER) // !SUPPORTS_BACKTRACE - // This backtrace retrieval is implemented on Windows via the Windows - // API using `CaptureStackBackTrace`, `SymFromAddr` and - // `SymGetLineFromAddr64`. - // https://stackoverflow.com/questions/5693192/win32-backtrace-from-c-code - // https://stackoverflow.com/questions/26398064/counterpart-to-glibcs-backtrace-and-backtrace-symbols-on-windows - // https://docs.microsoft.com/en-us/windows/win32/debug/capturestackbacktrace - // https://docs.microsoft.com/en-us/windows/win32/api/dbghelp/nf-dbghelp-symfromaddr - // https://docs.microsoft.com/en-us/windows/win32/api/dbghelp/nf-dbghelp-symgetlinefromaddr64 - // TODO: Support skipping python frames - - // We always skip this frame (backtrace). - frames_to_skip += 1; - - DWORD64 displacement; - DWORD disp; - std::unique_ptr line; - - char buffer[sizeof(SYMBOL_INFO) + MAX_SYM_NAME * sizeof(TCHAR)]; - PSYMBOL_INFO p_symbol = (PSYMBOL_INFO)buffer; - - std::unique_ptr back_trace(new void*[maximum_number_of_frames]); - bool with_symbol = false; - bool with_line = false; - - // The backtrace string goes into here. 
- std::ostringstream stream; - - // Get the frames - const USHORT n_frame = CaptureStackBackTrace( - static_cast(frames_to_skip), - static_cast(maximum_number_of_frames), - back_trace.get(), - NULL); - - // Initialize symbols if necessary - SymbolHelper& sh = SymbolHelper::getInstance(); - - for (USHORT i_frame = 0; i_frame < n_frame; ++i_frame) { - // Get the address and the name of the symbol - if (sh.inited) { - p_symbol->SizeOfStruct = sizeof(SYMBOL_INFO); - p_symbol->MaxNameLen = MAX_SYM_NAME; - with_symbol = SymFromAddr( - sh.process, (ULONG64)back_trace[i_frame], &displacement, p_symbol); - } + std::string symbolize() const { + return "(no backtrace available)"; + } +}; - // Get the line number and the module - if (sh.inited) { - line.reset(new IMAGEHLP_LINE64()); - line->SizeOfStruct = sizeof(IMAGEHLP_LINE64); - with_line = SymGetLineFromAddr64( - sh.process, (ULONG64)back_trace[i_frame], &disp, line.get()); - } +#endif - // Get the module basename - std::string module = get_module_base_name(back_trace[i_frame]); +} // namespace - // The pattern on Windows is - // ` - // ! [ @ ] - stream << std::setfill('0') << std::setw(16) << std::uppercase << std::hex - << back_trace[i_frame] << std::dec; - if (with_symbol) { - stream << std::setfill('0') << std::setw(16) << std::uppercase << std::hex - << p_symbol->Address << std::dec << " " << module << "!" - << p_symbol->Name; - } else { - stream << " " << module << "!"; - } - stream << " ["; - if (with_line) { - stream << line->FileName << " @ " << line->LineNumber; - } else { - stream << " @ "; +std::string get_backtrace( + size_t frames_to_skip, + size_t maximum_number_of_frames, + bool skip_python_frames) { + return GetBacktraceImpl{ + frames_to_skip, maximum_number_of_frames, skip_python_frames} + .symbolize(); +} + +Backtrace get_lazy_backtrace( + size_t frames_to_skip, + size_t maximum_number_of_frames, + bool skip_python_frames) { + class LazyBacktrace : public OptimisticLazyValue { + public: + LazyBacktrace(GetBacktraceImpl&& impl) : impl_(std::move(impl)) {} + + private: + std::string compute() const override { + return impl_.symbolize(); } - stream << "]" << std::endl; - } - return stream.str(); -#else // !SUPPORTS_BACKTRACE && !_WIN32 - return "(no backtrace available)"; -#endif // SUPPORTS_BACKTRACE + GetBacktraceImpl impl_; + }; + + return std::make_shared(GetBacktraceImpl{ + frames_to_skip, maximum_number_of_frames, skip_python_frames}); } } // namespace c10 diff --git a/c10/util/Backtrace.h b/c10/util/Backtrace.h index 75691286d9019..500bf4cf407b2 100644 --- a/c10/util/Backtrace.h +++ b/c10/util/Backtrace.h @@ -2,16 +2,30 @@ #define C10_UTIL_BACKTRACE_H_ #include +#include #include #include #include +#include namespace c10 { + +// Symbolizing the backtrace can be expensive; pass it around as a lazy string +// so it is symbolized only if actually needed. +using Backtrace = std::shared_ptr>; + +// DEPRECATED: Prefer get_lazy_backtrace(). 
C10_API std::string get_backtrace( size_t frames_to_skip = 0, size_t maximum_number_of_frames = 64, bool skip_python_frames = true); + +C10_API Backtrace get_lazy_backtrace( + size_t frames_to_skip = 0, + size_t maximum_number_of_frames = 64, + bool skip_python_frames = true); + } // namespace c10 #endif // C10_UTIL_BACKTRACE_H_ diff --git a/c10/util/Exception.cpp b/c10/util/Exception.cpp index a0b9fa1e72ec8..76083cd14a838 100644 --- a/c10/util/Exception.cpp +++ b/c10/util/Exception.cpp @@ -58,7 +58,7 @@ std::string Error::compute_what(bool include_backtrace) const { return oss.str(); } -const Error::Backtrace& Error::backtrace() const { +const Backtrace& Error::backtrace() const { return backtrace_; } @@ -142,7 +142,7 @@ namespace { WarningHandler* getBaseHandler() { static WarningHandler base_warning_handler_ = WarningHandler(); return &base_warning_handler_; -}; +} class ThreadWarningHandler { public: diff --git a/c10/util/Exception.h b/c10/util/Exception.h index 750e978059ba9..d75c6a8cd30c3 100644 --- a/c10/util/Exception.h +++ b/c10/util/Exception.h @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -27,11 +28,6 @@ namespace c10 { /// NB: c10::Error is handled specially by the default torch to suppress the /// backtrace, see torch/csrc/Exceptions.h class C10_API Error : public std::exception { - public: - // Symbolizing the backtrace can be expensive; pass it around as a lazy string - // so it is symbolized only if actually needed. - using Backtrace = std::shared_ptr>; - private: // The actual error message. std::string msg_; diff --git a/c10/util/Float8_e4m3fn.h b/c10/util/Float8_e4m3fn.h index d51feabcc8c4d..8e05e2e43bb01 100644 --- a/c10/util/Float8_e4m3fn.h +++ b/c10/util/Float8_e4m3fn.h @@ -233,7 +233,7 @@ struct alignas(1) Float8_e4m3fn { Float8_e4m3fn() = default; constexpr C10_HOST_DEVICE Float8_e4m3fn(uint8_t bits, from_bits_t) - : x(bits){}; + : x(bits) {} inline C10_HOST_DEVICE Float8_e4m3fn(float value); inline C10_HOST_DEVICE operator float() const; inline C10_HOST_DEVICE bool isnan() const; diff --git a/c10/util/Float8_e4m3fnuz.h b/c10/util/Float8_e4m3fnuz.h index bed29891749a3..86ece9ebdadbb 100644 --- a/c10/util/Float8_e4m3fnuz.h +++ b/c10/util/Float8_e4m3fnuz.h @@ -121,7 +121,7 @@ struct alignas(1) Float8_e4m3fnuz { Float8_e4m3fnuz() = default; constexpr C10_HOST_DEVICE Float8_e4m3fnuz(uint8_t bits, from_bits_t) - : x(bits){}; + : x(bits) {} inline C10_HOST_DEVICE Float8_e4m3fnuz(float value); inline C10_HOST_DEVICE operator float() const; inline C10_HOST_DEVICE bool isnan() const; diff --git a/c10/util/Logging.cpp b/c10/util/Logging.cpp index 27feb9346f880..66a24980a44b4 100644 --- a/c10/util/Logging.cpp +++ b/c10/util/Logging.cpp @@ -3,7 +3,6 @@ #include #include #ifdef FBCODE_CAFFE2 -#include #include #endif @@ -26,30 +25,15 @@ C10_DEFINE_bool( namespace c10 { namespace { -std::function<::c10::Error::Backtrace()>& GetFetchStackTrace() { - static std::function<::c10::Error::Backtrace()> func = []() { -#ifdef FBCODE_CAFFE2 - // Same implementation as get_backtrace() in fbcode, but with lazy - // symbolization. 
- class LazyBacktrace : public OptimisticLazyValue { - facebook::process::StackTrace st_; - - std::string compute() const override { - return st_.toString(); - } - }; - - return std::make_shared(); -#else - return std::make_shared>( - get_backtrace(/*frames_to_skip=*/1)); -#endif +std::function<::c10::Backtrace()>& GetFetchStackTrace() { + static std::function<::c10::Backtrace()> func = []() { + return get_lazy_backtrace(/*frames_to_skip=*/1); }; return func; -}; +} } // namespace -void SetStackTraceFetcher(std::function<::c10::Error::Backtrace()> fetcher) { +void SetStackTraceFetcher(std::function<::c10::Backtrace()> fetcher) { GetFetchStackTrace() = std::move(fetcher); } @@ -116,7 +100,7 @@ class PyTorchStyleBacktrace : public OptimisticLazyValue { backtrace_->get()); } - ::c10::Error::Backtrace backtrace_; + ::c10::Backtrace backtrace_; SourceLocation source_location_; }; @@ -150,19 +134,19 @@ APIUsageLoggerType* GetAPIUsageLogger() { static APIUsageLoggerType func = IsAPIUsageDebugMode() ? &APIUsageDebug : [](const string&) {}; return &func; -}; +} APIUsageMetadataLoggerType* GetAPIUsageMetadataLogger() { static APIUsageMetadataLoggerType func = [](const std::string&, const std::map& metadata_map) {}; return &func; -}; +} DDPUsageLoggerType* GetDDPUsageLogger() { static DDPUsageLoggerType func = [](const DDPLoggingData&) {}; return &func; -}; +} } // namespace void SetAPIUsageLogger(std::function logger) { diff --git a/c10/util/Logging.h b/c10/util/Logging.h index caab50c8e0cda..a2349e423d013 100644 --- a/c10/util/Logging.h +++ b/c10/util/Logging.h @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -126,8 +127,7 @@ constexpr bool IsUsingGoogleLogging() { */ C10_API void ShowLogInfoToStderr(); -C10_API void SetStackTraceFetcher( - std::function<::c10::Error::Backtrace()> fetcher); +C10_API void SetStackTraceFetcher(std::function<::c10::Backtrace()> fetcher); /** * Convenience function for non-lazy stack trace fetchers. The Backtrace diff --git a/c10/util/OptionalArrayRef.h b/c10/util/OptionalArrayRef.h index 2c2b88722d4d7..98237bba92f56 100644 --- a/c10/util/OptionalArrayRef.h +++ b/c10/util/OptionalArrayRef.h @@ -1,11 +1,11 @@ // This file defines OptionalArrayRef, a class that has almost the same -// exact functionality as c10::optional>, except that its +// exact functionality as std::optional>, except that its // converting constructor fixes a dangling pointer issue. // -// The implicit converting constructor of both c10::optional> and +// The implicit converting constructor of both std::optional> and // std::optional> can cause the underlying ArrayRef to store // a dangling pointer. OptionalArrayRef prevents this by wrapping -// a c10::optional> and fixing the constructor implementation. +// a std::optional> and fixing the constructor implementation. // // See https://github.com/pytorch/pytorch/issues/63645 for more on this. 
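The OptionalArrayRef comment above points at the dangling-pointer hazard tracked in https://github.com/pytorch/pytorch/issues/63645. A minimal sketch of that hazard, assuming only that c10::ArrayRef is a non-owning view implicitly constructible from std::vector; the helper name below is illustrative and is not part of this diff:

#include <optional>
#include <vector>
#include <c10/util/ArrayRef.h>

// Illustrative sketch only (not part of this diff): returns a view whose
// backing storage has already been freed.
std::optional<c10::ArrayRef<int>> make_view() {
  std::vector<int> owner = {1, 2, 3};
  // The converting constructor of std::optional<c10::ArrayRef<int>> wraps a
  // pointer into owner's heap buffer; nothing extends owner's lifetime.
  std::optional<c10::ArrayRef<int>> view = owner;
  return view;  // owner is destroyed here, so the returned view dangles
}

c10::OptionalArrayRef exists to make this class of bug harder to write by constraining its converting constructors, as the comment above describes.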
diff --git a/c10/xpu/test/impl/XPUStreamTest.cpp b/c10/xpu/test/impl/XPUStreamTest.cpp index 16f6e20c2163e..01a1dbb62621b 100644 --- a/c10/xpu/test/impl/XPUStreamTest.cpp +++ b/c10/xpu/test/impl/XPUStreamTest.cpp @@ -82,7 +82,7 @@ TEST(XPUStreamTest, StreamBehavior) { EXPECT_NE(stream.device_index(), c10::xpu::current_device()); } -void thread_fun(c10::optional& cur_thread_stream) { +void thread_fun(std::optional& cur_thread_stream) { auto new_stream = c10::xpu::getStreamFromPool(); c10::xpu::setCurrentXPUStream(new_stream); cur_thread_stream = {c10::xpu::getCurrentXPUStream()}; @@ -94,7 +94,7 @@ TEST(XPUStreamTest, MultithreadStreamBehavior) { if (!has_xpu()) { return; } - c10::optional s0, s1; + std::optional s0, s1; std::thread t0{thread_fun, std::ref(s0)}; std::thread t1{thread_fun, std::ref(s1)}; diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 13282063d9078..bd2588b5aef35 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -110,21 +110,11 @@ endif() add_subdirectory(core) add_subdirectory(serialize) add_subdirectory(utils) -if(BUILD_CAFFE2 OR (NOT USE_FBGEMM)) +if(NOT USE_FBGEMM) add_subdirectory(perfkernels) endif() -# Skip modules that are not used by libtorch mobile yet. -if(BUILD_CAFFE2 AND NOT INTERN_BUILD_MOBILE) - add_subdirectory(core/nomnigraph) - if(USE_NVRTC) - add_subdirectory(cuda_rtc) - endif() - if(BUILD_CAFFE2_OPS) - endif() - add_subdirectory(proto) -endif() -if(NOT BUILD_CAFFE2 AND NOT INTERN_BUILD_MOBILE) +if(NOT INTERN_BUILD_MOBILE) add_subdirectory(proto) endif() @@ -585,17 +575,10 @@ if(NOT INTERN_BUILD_MOBILE AND NOT BUILD_LITE_INTERPRETER) ${TORCH_SRC_DIR}/csrc/utils/byte_order.cpp ) - # Disable legacy import of building without Caffe2 support - if(BUILD_CAFFE2) - list(APPEND TORCH_SRCS - ${TORCH_SRC_DIR}/csrc/jit/serialization/import_legacy.cpp - ) - else() - set_source_files_properties( - ${TORCH_SRC_DIR}/csrc/jit/serialization/import.cpp - PROPERTIES COMPILE_FLAGS "-DC10_DISABLE_LEGACY_IMPORT" - ) - endif() + set_source_files_properties( + ${TORCH_SRC_DIR}/csrc/jit/serialization/import.cpp + PROPERTIES COMPILE_FLAGS "-DC10_DISABLE_LEGACY_IMPORT" + ) if(USE_DISTRIBUTED) append_filelist("libtorch_distributed_base_sources" TORCH_SRCS) if(NOT WIN32) @@ -809,11 +792,6 @@ if(HAVE_SOVERSION) VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION}) endif() torch_compile_options(torch_cpu) # see cmake/public/utils.cmake -if(BUILD_CAFFE2 AND NOT MSVC) - # Caffe2 has too many signed-unsigned violation, but the framework is dead - # So no point in fixing those - target_compile_options(torch_cpu PRIVATE "-Wno-sign-compare") -endif() # Ignore Wdeprecated-XXX errors from third-party libraries if(NOT MSVC) @@ -1921,14 +1899,6 @@ if(BUILD_TEST) endif() endforeach() endif() - - # For special tests that explicitly uses dependencies, we add them here - if(BUILD_CAFFE2 AND USE_MPI) - target_link_libraries(mpi_test MPI::MPI_CXX) - if(USE_CUDA) - target_link_libraries(mpi_gpu_test MPI::MPI_CXX) - endif() - endif() endif() if(MSVC) @@ -1998,11 +1968,6 @@ if(BUILD_PYTHON) set_source_files_properties(${TORCH_SRC_DIR}/../caffe2/operators/box_with_nms_limit_op.cc PROPERTIES COMPILE_FLAGS -Wno-attributes) endif() - # ---[ Python. 
- if(BUILD_CAFFE2) - target_compile_definitions(torch PRIVATE BUILD_CAFFE2) - endif() - # generated pb files are copied from build/caffe2 to caffe2 # if we copied them back to build this would create a build cycle # consider removing the need for globs diff --git a/caffe2/__init__.py b/caffe2/__init__.py index 4096a98283857..f319e8e2dc15b 100644 --- a/caffe2/__init__.py +++ b/caffe2/__init__.py @@ -2,5 +2,4 @@ from torch.onnx import _CAFFE2_ATEN_FALLBACK if not _CAFFE2_ATEN_FALLBACK: - warnings.warn("Caffe2 support is not fully enabled in this PyTorch build. " - "Please enable Caffe2 by building PyTorch from source with `BUILD_CAFFE2=1` flag.") + warnings.warn("Caffe2 support is no longer present in PyTorch.") diff --git a/caffe2/core/CMakeLists.txt b/caffe2/core/CMakeLists.txt index f59c0e703edf7..371d2216b50ea 100644 --- a/caffe2/core/CMakeLists.txt +++ b/caffe2/core/CMakeLists.txt @@ -1,68 +1,4 @@ -if(NOT BUILD_CAFFE2 OR INTERN_BUILD_MOBILE) - list(APPEND Caffe2_CPU_SRCS - "${CMAKE_CURRENT_SOURCE_DIR}/common.cc" - ) - set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE) - return() -endif() - -# ---[ GPU files -# ------[ cuDNN -if(USE_CUDNN) - file(GLOB tmp *_cudnn.cc) - set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${tmp}) -endif() -# ------[ general GPU -file(GLOB tmp *_gpu.cc) -set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${tmp}) -# ------[ CUDA sources -file(GLOB tmp *.cu) -set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${tmp}) -# exclude test files -file(GLOB tmp *_test.cc) -exclude(Caffe2_GPU_SRCS "${Caffe2_GPU_SRCS}" ${tmp}) - -# ---[ general HIP files -file(GLOB tmp hip/*.cc) -set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} ${tmp}) -# ------[ HIP sources -file(GLOB tmp hip/*.hip) -set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} ${tmp}) -# exclude test files -file(GLOB tmp hip/*_test.cc) -exclude(Caffe2_HIP_SRCS "${Caffe2_HIP_SRCS}" ${tmp}) - -# ---[ CPU files. -file(GLOB tmp *.cc) -# Manually remove the cudnn files since we might be using USE_CUDNN=OFF -# TODO: when we move to explicit file list, this would not be needed. -file(GLOB tmp_cudnn *_cudnn.cc) -exclude(tmp "${tmp}" ${tmp_cudnn}) -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp}) -# exclude test files and gpu files -file(GLOB tmp *_test.cc) -exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${tmp}) -exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${Caffe2_GPU_SRCS}) -exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${Caffe2_HIP_SRCS}) - -# ---[ GPU test files -file(GLOB tmp *_gpu_test.cc) -set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} ${tmp}) - -# ---[ HIP test files -file(GLOB tmp hip/*_test.cc) -set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} ${tmp}) - -# ---[ CPU test files -file(GLOB tmp *_test.cc) -set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} ${tmp}) -exclude(Caffe2_CPU_TEST_SRCS "${Caffe2_CPU_TEST_SRCS}" ${Caffe2_GPU_TEST_SRCS}) -exclude(Caffe2_CPU_TEST_SRCS "${Caffe2_CPU_TEST_SRCS}" ${Caffe2_HIP_TEST_SRCS}) - -# ---[ Send the lists to the parent scope. 
+list(APPEND Caffe2_CPU_SRCS + "${CMAKE_CURRENT_SOURCE_DIR}/common.cc" +) set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE) -set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE) -set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} PARENT_SCOPE) -set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE) -set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE) -set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} PARENT_SCOPE) diff --git a/caffe2/core/blob_serialization.cc b/caffe2/core/blob_serialization.cc index 5cd01ba7cc59c..fca3e63f72182 100644 --- a/caffe2/core/blob_serialization.cc +++ b/caffe2/core/blob_serialization.cc @@ -1,5 +1,6 @@ #include "caffe2/core/blob_serialization.h" +#include #include #include #include @@ -83,8 +84,7 @@ Range GetMutableTensorDataRange( size_t start, size_t numElements) { CAFFE_ENFORCE( - // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - start + numElements <= tensor.numel(), + static_cast(start + numElements) <= tensor.numel(), "Requested invalid mutable tensor range [", start, ", ", @@ -100,8 +100,7 @@ c10::ArrayRef GetTensorDataRange( size_t start, size_t numElements) { CAFFE_ENFORCE( - // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - start + numElements <= tensor.numel(), + static_cast(start + numElements) <= tensor.numel(), "Requested invalid tensor range [", start, ", ", @@ -390,8 +389,7 @@ void TensorSerializer::SerializeWithOptions( // Poorman's IOBound ThreadPool SimpleQueue chunkQueue; auto task = [&]() { - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - size_t chunkStart; + size_t chunkStart = std::numeric_limits::max(); while (chunkQueue.Pop(&chunkStart)) { processChunk(chunkStart); } @@ -409,8 +407,7 @@ void TensorSerializer::SerializeWithOptions( VLOG(1) << "Serializing blob " << name; // Serialize whole vector. If vector is empty, it's shape still needs to be // serialized in empty proto - for (size_t chunkBegin = 0; - // NOLINTNEXTLINE(clang-diagnostic-sign-compare) + for (int64_t chunkBegin = 0; chunkBegin < std::max(tensor.numel(), static_cast(1)); chunkBegin += chunk_size) { VLOG(2) << "Starting a chunk at " << chunkBegin; @@ -582,8 +579,7 @@ void SerializeTensorData(const SerializeParams& params) { BlobSerializationOptions_FloatFormat_FLOAT_BFLOAT16) { // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays) std::unique_ptr tmp_buffer; - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - const float* src; + const float* src = nullptr; if (params.context.device() == CPU) { src = params.input.data(); } else { @@ -653,14 +649,12 @@ void TensorSerializer::Serialize( size_t chunkBegin, int32_t chunkSize) { CAFFE_ENFORCE( - // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - chunkBegin <= input.numel(), + static_cast(chunkBegin) <= input.numel(), "Chunk begin is out of tensor: ", chunkBegin, ' ', input.numel()); - // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - if (chunkBegin + chunkSize > input.numel()) { + if (static_cast(chunkBegin + chunkSize) > input.numel()) { chunkSize = input.numel() - chunkBegin; } @@ -1029,8 +1023,7 @@ DESERIALIZE_IMPL(float, FMT_BFLOAT16) { params.tensor_proto.raw_data().data()); // If we are on a big-endian machine, byte-swap the serialized data. 
- // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - const fbgemm::bfloat16* src; + const fbgemm::bfloat16* src = nullptr; // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays) std::unique_ptr bswap_buffer; if (kIsLittleEndian) { @@ -1045,8 +1038,7 @@ DESERIALIZE_IMPL(float, FMT_BFLOAT16) { // bfloat16 to float conversion. // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays) std::unique_ptr tmp_buffer; - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - float* dest; + float* dest = nullptr; if (params.context.device() == CPU) { dest = params.dest.data(); } else { diff --git a/caffe2/core/context.h b/caffe2/core/context.h index 36fd4e400fe8c..eb46f78f8b0d9 100644 --- a/caffe2/core/context.h +++ b/caffe2/core/context.h @@ -63,23 +63,23 @@ class TORCH_API CPUContext final : public BaseContext { return (static_cast(random1) << 32) | random2; } - c10::optional next_float_normal_sample() { + std::optional next_float_normal_sample() { return next_float_normal_sample_; } - c10::optional next_double_normal_sample() { + std::optional next_double_normal_sample() { return next_double_normal_sample_; } - void set_next_float_normal_sample(c10::optional randn) { + void set_next_float_normal_sample(std::optional randn) { next_float_normal_sample_ = randn; } - void set_next_double_normal_sample(c10::optional randn) { + void set_next_double_normal_sample(std::optional randn) { next_double_normal_sample_ = randn; } private: at::mt19937 engine_; - c10::optional next_float_normal_sample_; - c10::optional next_double_normal_sample_; + std::optional next_float_normal_sample_; + std::optional next_double_normal_sample_; }; #else typedef std::mt19937 rand_gen_type; diff --git a/caffe2/core/export_c10_op_to_caffe2.h b/caffe2/core/export_c10_op_to_caffe2.h index b8bbfda84a50e..f03da90c1b861 100644 --- a/caffe2/core/export_c10_op_to_caffe2.h +++ b/caffe2/core/export_c10_op_to_caffe2.h @@ -185,7 +185,7 @@ class C10OperatorWrapper final : public Operator { template IValue get_nontensor_argument_( const std::string& name, - const c10::optional& default_value) { + const std::optional& default_value) { if (default_value.has_value()) { return this->template GetSingleArgument(name, default_value->to()); } else { diff --git a/caffe2/core/export_caffe2_op_to_c10.h b/caffe2/core/export_caffe2_op_to_c10.h index 216d3833648bf..7e803e545e212 100644 --- a/caffe2/core/export_caffe2_op_to_c10.h +++ b/caffe2/core/export_caffe2_op_to_c10.h @@ -126,7 +126,7 @@ void call_caffe2_op_from_c10( inline FunctionSchema make_function_schema_for_c10( const char* schema_str, - c10::optional optional_alias_analysis_kind) { + std::optional optional_alias_analysis_kind) { #if !defined(EXPOSE_C2_OPS) && \ (defined(CAFFE2_IS_XPLAT_BUILD) || defined(C10_MOBILE)) throw std::logic_error( diff --git a/caffe2/core/operator.cc b/caffe2/core/operator.cc index a978cfd164ce8..7cf1ef909f18b 100644 --- a/caffe2/core/operator.cc +++ b/caffe2/core/operator.cc @@ -825,7 +825,7 @@ std::function GetOperatorLogger() { return OperatorLogger; } -c10::optional OperatorBase::argumentIndexWithName( +std::optional OperatorBase::argumentIndexWithName( c10::string_view name) const { #if defined(EXPOSE_C2_OPS) || \ !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h index 0dbf31e5932b0..3277357b4f34c 100644 --- a/caffe2/core/operator.h +++ b/caffe2/core/operator.h @@ -605,7 +605,7 @@ class TORCH_API OperatorBase : public Observable { std::string type_; vector 
inputs_; vector outputs_; - // Preferably use c10::optional, but nvcc doesn't work + // Preferably use std::optional, but nvcc doesn't work #if defined(EXPOSE_C2_OPS) || \ !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) std::unique_ptr fn_schema_; @@ -649,7 +649,7 @@ class TORCH_API OperatorBase : public Observable { } } - c10::optional argumentIndexWithName(c10::string_view name) const; + std::optional argumentIndexWithName(c10::string_view name) const; // An event used by asynchronous execution. std::unique_ptr event_; diff --git a/caffe2/proto/CMakeLists.txt b/caffe2/proto/CMakeLists.txt index ba6b696dde4ba..bdbc045afb3d7 100644 --- a/caffe2/proto/CMakeLists.txt +++ b/caffe2/proto/CMakeLists.txt @@ -1,8 +1,4 @@ -if(BUILD_CAFFE2) - file(GLOB Caffe2_PROTOBUF_FILES "${CMAKE_CURRENT_SOURCE_DIR}/*.proto") -else() - set(Caffe2_PROTOBUF_FILES "${CMAKE_CURRENT_SOURCE_DIR}/torch.proto;${CMAKE_CURRENT_SOURCE_DIR}/caffe2.proto") -endif() +set(Caffe2_PROTOBUF_FILES "${CMAKE_CURRENT_SOURCE_DIR}/torch.proto;${CMAKE_CURRENT_SOURCE_DIR}/caffe2.proto") caffe2_protobuf_generate_cpp_py(Caffe2_PROTO_SRCS Caffe2_PROTO_HEADERS Caffe2_PROTO_PY ${Caffe2_PROTOBUF_FILES}) diff --git a/caffe2/proto/__init__.py b/caffe2/proto/__init__.py index ce54a1aee5745..c40ca97189d1b 100644 --- a/caffe2/proto/__init__.py +++ b/caffe2/proto/__init__.py @@ -14,8 +14,7 @@ try: from caffe2.proto import caffe2_pb2, metanet_pb2, torch_pb2 except ImportError: - warnings.warn('Caffe2 support is not enabled in this PyTorch build. ' - 'Please enable Caffe2 by building PyTorch from source with `BUILD_CAFFE2=1` flag.') + warnings.warn('Caffe2 support is no longer present in PyTorch.') raise try: diff --git a/caffe2/python/__init__.py b/caffe2/python/__init__.py index 888d286458a3a..1e44baf28153f 100644 --- a/caffe2/python/__init__.py +++ b/caffe2/python/__init__.py @@ -6,8 +6,7 @@ try: from caffe2.proto import caffe2_pb2 except ImportError: - warnings.warn('Caffe2 support is not enabled in this PyTorch build. ' - 'Please enable Caffe2 by building PyTorch from source with `BUILD_CAFFE2=1` flag.') + warnings.warn('Caffe2 support is no longer present in PyTorch.') raise # TODO: refactor & remove the following alias diff --git a/caffe2/serialize/inline_container.cc b/caffe2/serialize/inline_container.cc index 00d922f356dfc..83415da0a4f77 100644 --- a/caffe2/serialize/inline_container.cc +++ b/caffe2/serialize/inline_container.cc @@ -610,7 +610,8 @@ size_t ostream_write_func( // Get the CRC32 of uncompressed data from the data descriptor, if the written // data is identified as the data descriptor block. - if (n >= 8 && MZ_READ_LE32(pBuf) == MZ_ZIP_DATA_DESCRIPTOR_ID) { + // See [Note: write_record_metadata] for why we check for non-null pBuf here + if (pBuf && n >= 8 && MZ_READ_LE32(pBuf) == MZ_ZIP_DATA_DESCRIPTOR_ID) { const int8_t* pInt8Buf = (const int8_t*)pBuf; const uint32_t uncomp_crc32 = MZ_READ_LE32(pInt8Buf + 4); self->combined_uncomp_crc32_ = @@ -654,7 +655,12 @@ void PyTorchStreamWriter::setup(const string& file_name) { } TORCH_CHECK(file_stream_, "File ", file_name, " cannot be opened."); writer_func_ = [this](const void* buf, size_t nbytes) -> size_t { - file_stream_.write(static_cast(buf), nbytes); + if (!buf) { + // See [Note: write_record_metadata] + file_stream_.seekp(nbytes, std::ios_base::cur); + } else { + file_stream_.write(static_cast(buf), nbytes); + } return !file_stream_ ? 
0 : nbytes; }; } @@ -690,20 +696,20 @@ void PyTorchStreamWriter::writeRecord( detail::getPadding(ar_->m_archive_size, full_name.size(), size, padding_); uint32_t flags = compress ? MZ_BEST_COMPRESSION : 0; mz_zip_writer_add_mem_ex_v2( - ar_.get(), - full_name.c_str(), - data, - size, - nullptr, - 0, - flags, - 0, - 0, - nullptr, - padding_.c_str(), - padding_size, - nullptr, - 0); + /*pZip=*/ar_.get(), + /*pArchive_name=*/full_name.c_str(), + /*pBuf=*/data, + /*buf_size=*/size, + /*pComment=*/nullptr, + /*comment_size=*/0, + /*level_and_flags=*/flags, + /*uncomp_size=*/0, + /*uncomp_crc32=*/0, + /*last_modified=*/nullptr, + /*user_extra_data=*/padding_.c_str(), + /*user_extra_data_len=*/padding_size, + /*user_extra_data_central=*/nullptr, + /*user_extra_data_central_len=*/0); valid("writing file ", name.c_str()); files_written_.insert(name); } diff --git a/caffe2/utils/CMakeLists.txt b/caffe2/utils/CMakeLists.txt index 181d164d81327..e168eb595feb2 100644 --- a/caffe2/utils/CMakeLists.txt +++ b/caffe2/utils/CMakeLists.txt @@ -1,100 +1,18 @@ -if(NOT BUILD_CAFFE2 OR INTERN_BUILD_MOBILE) - list(APPEND Caffe2_CPU_SRCS - utils/string_utils.cc - utils/threadpool/ThreadPool.cc - ) - - if(USE_PTHREADPOOL AND NOT USE_INTERNAL_PTHREADPOOL_IMPL) - list(APPEND Caffe2_CPU_SRCS - utils/threadpool/pthreadpool-cpp.cc - utils/threadpool/thread_pool_guard.cpp - ) - endif() - - if(NOT BUILD_CAFFE2 AND NOT INTERN_BUILD_MOBILE) - list(APPEND Caffe2_CPU_SRCS - utils/proto_wrap.cc - ) - endif() - set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE) - return() -endif() - list(APPEND Caffe2_CPU_SRCS - utils/bench_utils.cc - utils/cpuid.cc - utils/math/broadcast.cc - utils/math/elementwise.cc - utils/math/reduce.cc - utils/math/transpose.cc - utils/math/utils.cc - utils/math_cpu.cc - utils/murmur_hash3.cc - utils/proto_utils.cc - utils/proto_wrap.cc + utils/string_utils.cc utils/threadpool/ThreadPool.cc - utils/signal_handler.cc - utils/smart_tensor_printer.cc - utils/string_utils.cc) +) -if(USE_PTHREADPOOL) +if(USE_PTHREADPOOL AND NOT USE_INTERNAL_PTHREADPOOL_IMPL) list(APPEND Caffe2_CPU_SRCS utils/threadpool/pthreadpool-cpp.cc - utils/threadpool/thread_pool_guard.cpp) - if(USE_INTERNAL_PTHREADPOOL_IMPL) - list(APPEND Caffe2_CPU_SRCS - utils/threadpool/pthreadpool.cc - utils/threadpool/pthreadpool_impl.cc) - endif() + utils/threadpool/thread_pool_guard.cpp + ) endif() -set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} - utils/math/broadcast.cu - utils/math/elementwise.cu - utils/math/reduce.cu - utils/math/transpose.cu - utils/math_gpu.cu - ) - -set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} - utils/math/hip/broadcast.hip - utils/math/hip/elementwise.hip - utils/math/hip/reduce.hip - utils/math/hip/transpose.hip - utils/hip/math_gpu.hip - ) - -set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} - utils/fixed_divisor_test.cc - utils/math_test.cc - utils/fatal_signal_asan_no_sig_test.cc - utils/simple_queue_test.cc - utils/proto_utils_test.cc - utils/smart_tensor_printer_test.cc - utils/cast_test.cc - ) - -if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "s390x") - set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} - utils/cpuid_test.cc - ) +if(NOT INTERN_BUILD_MOBILE) + list(APPEND Caffe2_CPU_SRCS + utils/proto_wrap.cc + ) endif() - -set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} - utils/math_gpu_test.cc - ) - -set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} - utils/hip/math_gpu_test.cc - utils/hip/math_blas_gpu_test.cc - ) - -# TODO Once all source files are defined inside the local c10_utils_xxx targets, -# it should be the job of the parent 
CMakeLists.txt to decide what to do with the target (i.e. link it to caffe2) -# instead of us locally adding it to Caffe2_xxx variables. set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE) -set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE) -set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} PARENT_SCOPE) -set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE) -set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE) -set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} PARENT_SCOPE) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 73dba4061dced..a9a3aab8c5107 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1597,23 +1597,24 @@ if(NOT INTERN_BUILD_MOBILE) set(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF) - if(USE_MAGMA) - find_package(MAGMA) - endif() - if((USE_CUDA OR USE_ROCM) AND MAGMA_FOUND) - set(USE_MAGMA 1) - message(STATUS "Compiling with MAGMA support") - message(STATUS "MAGMA INCLUDE DIRECTORIES: ${MAGMA_INCLUDE_DIR}") - message(STATUS "MAGMA LIBRARIES: ${MAGMA_LIBRARIES}") - message(STATUS "MAGMA V2 check: ${MAGMA_V2}") + if(USE_CUDA OR USE_ROCM) + if(USE_MAGMA) + find_package(MAGMA) + if(MAGMA_FOUND) + message(STATUS "Compiling with MAGMA support") + message(STATUS "MAGMA INCLUDE DIRECTORIES: ${MAGMA_INCLUDE_DIR}") + message(STATUS "MAGMA LIBRARIES: ${MAGMA_LIBRARIES}") + message(STATUS "MAGMA V2 check: ${MAGMA_V2}") + else() + message(STATUS "MAGMA not found. Compiling without MAGMA support") + caffe2_update_option(USE_MAGMA OFF) + endif() + endif() elseif(USE_MAGMA) message(WARNING "Not compiling with MAGMA. Suppress this warning with " "-DUSE_MAGMA=OFF.") caffe2_update_option(USE_MAGMA OFF) - else() - message(STATUS "MAGMA not found. Compiling without MAGMA support") - caffe2_update_option(USE_MAGMA OFF) endif() # ARM specific flags @@ -1685,9 +1686,6 @@ if(NOT INTERN_BUILD_MOBILE) if(MKLDNN_FOUND) set(AT_MKLDNN_ENABLED 1) include_directories(AFTER SYSTEM ${MKLDNN_INCLUDE_DIR}) - if(BUILD_CAFFE2_OPS) - list(APPEND Caffe2_DEPENDENCY_LIBS caffe2::mkldnn) - endif(BUILD_CAFFE2_OPS) else() message(WARNING "MKLDNN could not be found.") caffe2_update_option(USE_MKLDNN OFF) diff --git a/cmake/Modules/FindLAPACK.cmake b/cmake/Modules/FindLAPACK.cmake index fc8bf50d7d5d6..dbe47d6cdcf19 100644 --- a/cmake/Modules/FindLAPACK.cmake +++ b/cmake/Modules/FindLAPACK.cmake @@ -26,6 +26,7 @@ ENDIF(LAPACK_FIND_QUIETLY OR NOT LAPACK_FIND_REQUIRED) # Old search lapack script include(CheckFortranFunctionExists) +include(CheckFunctionExists) macro(Check_Lapack_Libraries LIBRARIES _prefix _name _flags _list _blas) # This macro checks for the existence of the combination of fortran libraries diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 72c1243c24ea9..09af98d0bc066 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -23,8 +23,6 @@ function(caffe2_print_configuration_summary) message(STATUS "") message(STATUS " TORCH_VERSION : ${TORCH_VERSION}") - message(STATUS " BUILD_CAFFE2 : ${BUILD_CAFFE2}") - message(STATUS " BUILD_CAFFE2_OPS : ${BUILD_CAFFE2_OPS}") message(STATUS " BUILD_STATIC_RUNTIME_BENCHMARK: ${BUILD_STATIC_RUNTIME_BENCHMARK}") message(STATUS " BUILD_BINARY : ${BUILD_BINARY}") message(STATUS " BUILD_CUSTOM_PROTOBUF : ${BUILD_CUSTOM_PROTOBUF}") diff --git a/cmake/TorchConfig.cmake.in b/cmake/TorchConfig.cmake.in index 6d518a1489626..02e313285297b 100644 --- a/cmake/TorchConfig.cmake.in +++ b/cmake/TorchConfig.cmake.in @@ -80,9 +80,6 @@ else() # shared library. # TODO: this list might be incomplete. 
append_torchlib_if_found(c10) - if(@BUILD_CAFFE2@) - append_torchlib_if_found(Caffe2_perfkernels_avx512 Caffe2_perfkernels_avx2 Caffe2_perfkernels_avx) - endif() if(@USE_NNPACK@) append_torchlib_if_found(nnpack) diff --git a/docs/source/conf.py b/docs/source/conf.py index 0f89d2799fa52..fe548737b3137 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -2796,6 +2796,7 @@ "ConstraintViolationError", "DynamicDimConstraintPrinter", "GuardOnDataDependentSymNode", + "PendingUnbackedSymbolNotFound", "LoggingShapeGuardPrinter", "RelaxedUnspecConstraint", "RuntimeAssert", diff --git a/docs/source/torch.compiler_troubleshooting.rst b/docs/source/torch.compiler_troubleshooting.rst index f98a4dc779b63..7158149c09e19 100644 --- a/docs/source/torch.compiler_troubleshooting.rst +++ b/docs/source/torch.compiler_troubleshooting.rst @@ -727,3 +727,11 @@ and C++ backtrace whenever this symbol was created. ``TORCHDYNAMO_EXTENDED_DEBUG_CPP`` - provides extended debug information (C++ backtrace) for all extended debug settings as well as errors. For example, set this to "1". The C++ backtrace is slow and very spammy so it is not included by default with extended debugging. + +Cold Start Timing and Cache Corruption Debugging +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In order to measure the cold start compilation time or debug cache corruption, +it is possible to pass ``TORCHINDUCTOR_FORCE_DISABLE_CACHES=1`` or set +``torch._inductor.config.force_disable_caches = True``, which will override any +other caching config option and disable all compile time caching. diff --git a/functorch/csrc/dim/arena.h b/functorch/csrc/dim/arena.h index 3251321f998b2..fa68e67268d53 100644 --- a/functorch/csrc/dim/arena.h +++ b/functorch/csrc/dim/arena.h @@ -55,7 +55,7 @@ struct Slice { T& operator[](int i) const { return begin_[i]; } - c10::optional index(const T& value) { + std::optional index(const T& value) { for (int i : enumerate()) { if (begin_[i] == value) { return i; diff --git a/functorch/csrc/dim/dim.cpp b/functorch/csrc/dim/dim.cpp index e25b8d0e5731a..066f9517acefd 100644 --- a/functorch/csrc/dim/dim.cpp +++ b/functorch/csrc/dim/dim.cpp @@ -1123,7 +1123,7 @@ int64_t _Tensor_ndim(mpy::handle h) { mpy::handle handle_from_tensor(Arena& A, TensorRef t) { // fast case: tensor is live in python - c10::optional mb_obj = + std::optional mb_obj = t->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(getPyInterpreter(), /*ignore_hermetic_tls=*/false); if (mb_obj.has_value() && !t->unsafeGetTensorImpl()->pyobj_slot()->owns_pyobj()) { return *mb_obj; diff --git a/modules/CMakeLists.txt b/modules/CMakeLists.txt deleted file mode 100644 index 598cac60bdbad..0000000000000 --- a/modules/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -project(modules CXX C) -add_subdirectory(detectron) -add_subdirectory(module_test) -add_subdirectory(observers) - -# Finally, set Caffe2_MODULES to parent scope. -set(Caffe2_MODULES ${Caffe2_MODULES} PARENT_SCOPE) diff --git a/modules/detectron/CMakeLists.txt b/modules/detectron/CMakeLists.txt deleted file mode 100644 index 7c9a2d7ff4f4a..0000000000000 --- a/modules/detectron/CMakeLists.txt +++ /dev/null @@ -1,57 +0,0 @@ -file(GLOB Detectron_CPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cc) -file(GLOB Detectron_GPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cu) -file(GLOB_RECURSE Detectron_HIP_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.hip) - -if(BUILD_CAFFE2_OPS) - # Note(ilijar): Since Detectron ops currently have no - # CPU implementation, we only build GPU ops for now.
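A minimal sketch of how the cache-disabling switch documented in the torch.compiler_troubleshooting.rst hunk above might be exercised to time a cold start; the toy function and timing harness here are illustrative assumptions, not part of this patch, and only ``TORCHINDUCTOR_FORCE_DISABLE_CACHES`` and ``torch._inductor.config.force_disable_caches`` come from the change itself:

    import os
    import time

    # Disable all of Inductor's compile-time caches so the first call is a true cold start.
    # Set the environment variable before importing torch; the config flag below is the
    # in-process equivalent documented in the hunk above (either switch is sufficient).
    os.environ["TORCHINDUCTOR_FORCE_DISABLE_CACHES"] = "1"

    import torch
    import torch._inductor.config as inductor_config

    inductor_config.force_disable_caches = True  # equivalent to the environment variable

    @torch.compile
    def toy(x):
        # hypothetical workload, only here to trigger compilation
        return torch.sin(x) + torch.cos(x)

    x = torch.randn(1024)
    start = time.time()
    toy(x)  # first call compiles with no cache reuse
    print(f"cold-start compile + run: {time.time() - start:.2f}s")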
- if(USE_CUDA) - add_library( - caffe2_detectron_ops_gpu SHARED - ${Detectron_CPU_SRCS} - ${Detectron_GPU_SRCS}) - - target_link_libraries(caffe2_detectron_ops_gpu PRIVATE torch) - if(USE_OPENMP) - target_link_libraries(caffe2_detectron_ops_gpu PRIVATE caffe2::openmp) - endif() - - if(USE_MKLDNN) - target_link_libraries(caffe2_detectron_ops_gpu PRIVATE caffe2::mkldnn) - endif() - install(TARGETS caffe2_detectron_ops_gpu DESTINATION lib) - if(MSVC) - install(FILES $ DESTINATION lib OPTIONAL) - endif() - elseif(USE_ROCM) - hip_include_directories(${Caffe2_HIP_INCLUDES}) - set_source_files_properties(${Detectron_HIP_SRCS} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) - HIP_ADD_LIBRARY( - caffe2_detectron_ops_hip SHARED - ${Detectron_CPU_SRCS} - ${Detectron_HIP_SRCS}) - target_compile_options(caffe2_detectron_ops_hip PRIVATE ${HIP_CXX_FLAGS}) - if(USE_MKLDNN) - target_link_libraries(caffe2_detectron_ops_hip PRIVATE caffe2::mkldnn) - endif() - target_link_libraries(caffe2_detectron_ops_hip PRIVATE torch) - install(TARGETS caffe2_detectron_ops_hip DESTINATION lib) - elseif(NOT IOS_PLATFORM) - add_library(caffe2_detectron_ops SHARED ${Detectron_CPU_SRCS}) - if(HAVE_SOVERSION) - set_target_properties(caffe2_detectron_ops PROPERTIES - VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION}) - endif() - target_link_libraries(caffe2_detectron_ops PRIVATE torch) - if(USE_OPENMP) - target_link_libraries(caffe2_detectron_ops PRIVATE caffe2::openmp) - endif() - if(USE_MKLDNN) - target_link_libraries(caffe2_detectron_ops PRIVATE caffe2::mkldnn) - endif() - install(TARGETS caffe2_detectron_ops DESTINATION lib) - if(MSVC) - install(FILES $ DESTINATION lib OPTIONAL) - endif() - endif() -endif() diff --git a/modules/detectron/group_spatial_softmax_op.cc b/modules/detectron/group_spatial_softmax_op.cc deleted file mode 100644 index 8b1fc052ef39b..0000000000000 --- a/modules/detectron/group_spatial_softmax_op.cc +++ /dev/null @@ -1,83 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "modules/detectron/group_spatial_softmax_op.h" - -#include "caffe2/operators/softmax_utils.h" - -namespace caffe2 { - -REGISTER_CPU_OPERATOR( - GroupSpatialSoftmax, - GroupSpatialSoftmaxOp); -REGISTER_CPU_OPERATOR( - GroupSpatialSoftmaxGradient, - GroupSpatialSoftmaxGradientOp); - -OPERATOR_SCHEMA(GroupSpatialSoftmax) - .NumInputs(1) - .NumOutputs(1) - .SetDoc(R"DOC( -RetinaNet specific form of spatial softmax. - -The input is assumed to be unnormalized scores (sometimes called 'logits') -arranged in a 4D tensor with shape (N, C, H, W), where N is the number of -elements in the batch, H and W are the height and width, and C = num_anchors * -num_classes defines num_anchors 'groups' of softmax inputs, each of length -num_classes. The softmax is applied to each group independently. - -See: https://arxiv.org/abs/1708.02002 for details. 
-)DOC") - .Arg( - "num_classes", - "(int) default 81; number of classes in each softmax group.") - .Input( - 0, - "scores", - "4D tensor of softmax inputs (called 'scores' or 'logits') with shape " - "(N, C, H, W), where C = num_anchors * num_classes defines num_anchors " - "groups of contiguous num_classes softmax inputs.") - .Output( - 0, - "probabilities", - "4D tensor of softmax probabilities with shape (N, C, H, W), where " - "C = num_anchors * num_classes, and softmax was applied to each of the " - "num_anchors groups; within a group the num_classes values sum to 1."); - -OPERATOR_SCHEMA(GroupSpatialSoftmaxGradient) - .NumInputs(2) - .NumOutputs(1) - .Input(0, "scores", "See GroupSpatialSoftmax") - .Input( - 1, - "d_probabilities", - "Gradient of forward output 0 (probabilities).") - .Output(0, "d_scores", "Gradient of forward input 0 (scores)."); - -class GetGroupSpatialSoftmaxGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - return SingleGradientDef( - "GroupSpatialSoftmaxGradient", - "", - vector{O(0), GO(0)}, - vector{GI(0)}); - } -}; - -REGISTER_GRADIENT(GroupSpatialSoftmax, GetGroupSpatialSoftmaxGradient); - -} // namespace caffe2 diff --git a/modules/detectron/group_spatial_softmax_op.cu b/modules/detectron/group_spatial_softmax_op.cu deleted file mode 100644 index 741da27f59d2b..0000000000000 --- a/modules/detectron/group_spatial_softmax_op.cu +++ /dev/null @@ -1,181 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include "caffe2/core/context_gpu.h" -#include "modules/detectron/group_spatial_softmax_op.h" - -namespace caffe2 { - -namespace { - -__global__ void GroupSpatialSoftmaxKernel(const int num, const int A, const int W, - const int H, const float* Xdata, float* Pdata, const int num_classes) { - // Loop through labels (N x A x H x W) - CUDA_1D_KERNEL_LOOP(index, num * A * H * W) { - int D = num_classes * A; - int x = index % W; - int y = (index / W) % H; - int a = (index / (W * H)) % A; - int i = index / W / H / A; - - // Subtract max on each cell for numerical reasons - float max_val = -FLT_MAX; - for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) { - int idx = i * (H * W * D) + c * (H * W) + y * W + x; - max_val = max(max_val, Xdata[idx]); - } - // Exponentiate - float expsum = 0.0f; - for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) { - int idx = i * (H * W * D) + c * (H * W) + y * W + x; - float expx = exp(Xdata[idx] - max_val); - Pdata[idx] = expx; - expsum += expx; - } - - // Normalize - for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) { - int idx = i * (H * W * D) + c * (H * W) + y * W + x; - Pdata[idx] /= expsum; - } - - } -} - -__global__ void SumProbsKernel(const int N, const int A, const int W, - const int H, const float* Ydata, const float* dYdata, - float* sum_probs_data, const int num_classes) { - CUDA_1D_KERNEL_LOOP(i, N * A * W * H) { - int D = num_classes * A; - int x = i % W; - int y = (i / W) % H; - int a = (i / (W * H)) % A; - int n = i / (W * H * A); - - sum_probs_data[i] = 0.0; - for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) { - int idx = n * (H * W * D) + c * (H * W) + y * W + x; - sum_probs_data[i] += (Ydata[idx] * dYdata[idx]); - } - } -} - -__global__ void SubSumKernel( - const int N, const int A, const int W, const int H, - const float* sum_probs_data, float* dXdata, const int num_classes) { - CUDA_1D_KERNEL_LOOP(i, N * (A * num_classes) * W * H) { - int D = num_classes * A; - int x = i % W; - int y = (i / W) % H; - int a = ((i / (W * H)) % D) / num_classes; - int n = i / W / H / D; - int idx = n * (H * W * A) + a * (H * W) + y * W + x; - dXdata[i] = (dXdata[i] - sum_probs_data[idx]); - } -} - -} // namespace - - -template <> -bool GroupSpatialSoftmaxOp::RunOnDevice() { - auto& X = Input(0); // Logits - - int N = X.dim32(0); - int D = X.dim32(1); - int H = X.dim32(2); - int W = X.dim32(3); - int A = D / num_classes_; - - auto* P = Output(0, X.sizes(), at::dtype()); // Probabilities from softmax - TORCH_DCHECK_EQ(X.ndim(), 4); - - const float* Xdata = X.data(); - float* Pdata = P->mutable_data(); - - // Softmax for each x,y location - GroupSpatialSoftmaxKernel<<>>( - N, A, W, H, Xdata, Pdata, num_classes_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - return true; -} - - -template<> -bool GroupSpatialSoftmaxGradientOp::RunOnDevice() { - auto& Y = Input(0); // Probabilities from softmax - auto& dY = Input(1); - - - TORCH_DCHECK_EQ(Y.ndim(), 4); - - int N = Y.dim32(0); - int D = Y.dim32(1); - int H = Y.dim32(2); - int W = Y.dim32(3); - int A = D / num_classes_; - - auto* dX = Output(0, Y.sizes(), at::dtype()); - - if (sum_probs_.size() != N * A * H * W) { - ReinitializeTensor(&sum_probs_, {N * A * H * W}, at::dtype().device(CUDA)); - } - - const float* Ydata = Y.data(); - const float* dYdata = dY.data(); - float* dXdata = dX->mutable_data(); - - float* sum_probs_data = sum_probs_.mutable_data(); - math::Set( - sum_probs_.size(), 0.0f, sum_probs_data, &context_); - - // Complete math: - // J_ij = h_i (delta_ij - 
h_j) - // d x_i = sum_j d h_ij = sum_j J_ij * dy_j - // = sum_j h_i (delta_ij - h_j) * dy_j - // = h_i dy_i - (sum_j h_i h_j dy_j) - // = h_i dy_i - h_i sum_j h_j dy_j - - // Step 0: dx = dy - context_.Copy(Y.size(), dYdata, dXdata); - - // Step 1: s = Sum(dY[j] * Y[j]) - SumProbsKernel<<>>( - N, A, W, H, Ydata, dYdata, sum_probs_data, num_classes_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - // Step 2: dX[i] = dX[i] - s - SubSumKernel<<>>( - N, A, W, H, sum_probs_.data(), dXdata, num_classes_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - // Step 3: dX[i] = Y[i] * dX[i] - math::Mul(Y.size(), dXdata, Ydata, dXdata, &context_); - - return true; -} - - -REGISTER_CUDA_OPERATOR(GroupSpatialSoftmax, - GroupSpatialSoftmaxOp); -REGISTER_CUDA_OPERATOR(GroupSpatialSoftmaxGradient, - GroupSpatialSoftmaxGradientOp); -} // namespace caffe2 diff --git a/modules/detectron/group_spatial_softmax_op.h b/modules/detectron/group_spatial_softmax_op.h deleted file mode 100644 index b235a47146b58..0000000000000 --- a/modules/detectron/group_spatial_softmax_op.h +++ /dev/null @@ -1,76 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef GROUP_SPATIAL_SOFTMAX_OP_H_ -#define GROUP_SPATIAL_SOFTMAX_OP_H_ - -#include "caffe2/core/context.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/operator.h" -#include "caffe2/utils/math.h" - -namespace caffe2 { - -template -class GroupSpatialSoftmaxOp final : public Operator { - public: - GroupSpatialSoftmaxOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws), - num_classes_(this->template GetSingleArgument("num_classes", 81)), - order_(StringToStorageOrder( - this->template GetSingleArgument("order", "NCHW"))) { - CAFFE_ENFORCE_EQ( - order_, StorageOrder::NCHW, "Only NCHW order is supported right now."); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - int num_classes_; - StorageOrder order_; -}; - -template -class GroupSpatialSoftmaxGradientOp final : public Operator { - public: - GroupSpatialSoftmaxGradientOp(const OperatorDef& def, Workspace* ws) - : Operator(def, ws), - num_classes_(this->template GetSingleArgument("num_classes", 81)), - order_(StringToStorageOrder( - this->template GetSingleArgument("order", "NCHW"))) { - CAFFE_ENFORCE_EQ( - order_, StorageOrder::NCHW, "Only NCHW order is supported right now."); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - int num_classes_; - StorageOrder order_; - Tensor sum_probs_; -}; - -} // namespace caffe2 - -#endif // GROUP_SPATIAL_SOFTMAX_OP_H_ diff --git a/modules/detectron/ps_roi_pool_op.cc b/modules/detectron/ps_roi_pool_op.cc deleted file mode 100644 index c57b0fc23678b..0000000000000 --- a/modules/detectron/ps_roi_pool_op.cc +++ /dev/null @@ -1,106 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "ps_roi_pool_op.h" - -namespace caffe2 { - -REGISTER_CPU_OPERATOR(PSRoIPool, PSRoIPoolOp); -REGISTER_CPU_OPERATOR( - PSRoIPoolGradient, - PSRoIPoolGradientOp); - -OPERATOR_SCHEMA(PSRoIPool) - .NumInputs(2) - .NumOutputs(2) - .SetDoc(R"DOC( -Position Sensitive Region of Interest Pooling as used in R-FCN. -)DOC") - .Arg( - "spatial_scale", - "(float) default 1.0; Spatial scale of the input feature map X " - "relative to the input image. E.g., 0.0625 if X has a stride of 16 " - "w.r.t. the input image.") - .Arg( - "group_size", - "(int) default 1; pooled_h = pooled_w = group_size where pooled_{h,w} " - "is the pooled output Y's height and width, respectively.") - .Arg( - "output_dim", - "(int) default 1; number of channels in the pooled output, which might " - "be the number of classes is used for classification or 4 if used for " - "class agnostic bounding box regression.") - .Input( - 0, - "X", - "4D position sensitive feature map input of shape (N, C, H, W), where " - "C = group_size**2 * output_dim.") - .Input( - 1, - "RoIs", - "2D input of shape (R, 5) specifying R RoIs with five columns " - "representing: batch index in [0, N - 1], x1, y1, x2, y2. The RoI " - "coordinates are in the coordinate system of the input image.") - .Output( - 0, - "Y", - "4D output of shape (R, output_dim, pooled_h, pooled_w). The r-th " - "batch element is a pooled feature map cooresponding to the r-th RoI.") - .Output( - 1, - "argmaxes", - "4D output of shape (R, output_dim, pooled_h, pooled_w). Same as Y, " - "except it records the argmax indices rather than the max pooled " - "values."); - -OPERATOR_SCHEMA(PSRoIPoolGradient) - .NumInputs(4) - .NumOutputs(1) - .Input( - 0, - "X", - "See PSRoIPool.") - .Input( - 1, - "RoIs", - "See PSRoIPool.") - .Input( - 2, - "argmaxes", - "See PSRoIPool.") - .Input( - 3, - "dY", - "Gradient of forward output 0 (Y)") - .Output( - 0, - "dX", - "Gradient of forward input 0 (X)"); - -class GetPSRoIPoolGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - return SingleGradientDef( - "PSRoIPoolGradient", - "", - vector{I(0), I(1), O(1), GO(0)}, - vector{GI(0)}); - } -}; - -REGISTER_GRADIENT(PSRoIPool, GetPSRoIPoolGradient); - -} // namespace caffe2 diff --git a/modules/detectron/ps_roi_pool_op.cu b/modules/detectron/ps_roi_pool_op.cu deleted file mode 100644 index 68e4ec377d622..0000000000000 --- a/modules/detectron/ps_roi_pool_op.cu +++ /dev/null @@ -1,289 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Based on https://github.com/daijifeng001/caffe-rfcn/blob/r-fcn/src/caffe/layers/psroi_pooling_layer.cu -// -// ------------------------------------------------------------------ -// R-FCN -// Copyright (c) 2016 Microsoft -// Licensed under The MIT License [see r-fcn/LICENSE for details] -// Written by Yi Li -// ------------------------------------------------------------------ -// -// COPYRIGHT -// -// All contributions by the University of California: -// Copyright (c) 2014, 2015, The Regents of the University of California -// (Regents) -// All rights reserved. -// -// All other contributions: -// Copyright (c) 2014, 2015, the respective contributors -// All rights reserved. -// -// Caffe uses a shared copyright model: each contributor holds copyright over -// their contributions to Caffe. The project versioning records all such -// contribution and copyright details. If a contributor wants to further mark -// their specific copyright on a particular contribution, they should indicate -// their copyright solely in the commit message of the change when it is -// committed. -// -// LICENSE -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. -// -// CONTRIBUTION AGREEMENT -// -// By contributing to the BVLC/caffe repository through pull-request, comment, -// or otherwise, the contributor releases their content to the -// license and copyright terms herein. 
- -#include - -#include "caffe2/core/context_gpu.h" -#include "modules/detectron/ps_roi_pool_op.h" - -namespace caffe2 { - -namespace { - -template -inline __device__ T gpu_atomic_add(const T val, T* address); - -template <> -inline __device__ -float gpu_atomic_add(const float val, float* address) { - return atomicAdd(address, val); -} - -template -__global__ void PSRoIPoolForward( - const int nthreads, - const T* bottom_data, - const T spatial_scale, - const int channels, - const int height, - const int width, - const int pooled_height, - const int pooled_width, - const T* bottom_rois, - const int output_dim, - const int group_size, - T* top_data, - int* mapping_channel) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { - // The output is in order (n, ctop, ph, pw) - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int ctop = (index / pooled_width / pooled_height) % output_dim; - int n = index / pooled_width / pooled_height / output_dim; - - // [start, end) interval for spatial sampling - const T* offset_bottom_rois = bottom_rois + n * 5; - int roi_batch_ind = offset_bottom_rois[0]; - T roi_start_w = static_cast( - roundf(offset_bottom_rois[1])) * spatial_scale; - T roi_start_h = static_cast( - roundf(offset_bottom_rois[2])) * spatial_scale; - T roi_end_w = static_cast( - roundf(offset_bottom_rois[3]) + 1.) * spatial_scale; - T roi_end_h = static_cast( - roundf(offset_bottom_rois[4]) + 1.) * spatial_scale; - - // Force too small ROIs to be 1x1 - T roi_width = c10::cuda::compat::max(roi_end_w - roi_start_w, static_cast(0.1)); // avoid 0 - T roi_height = c10::cuda::compat::max(roi_end_h - roi_start_h, static_cast(0.1)); - - // Compute w and h at bottom - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - // Add roi offsets and clip to input boundaries - int hstart = floor( - static_cast(ph) * bin_size_h + roi_start_h); - int wstart = floor( - static_cast(pw)* bin_size_w + roi_start_w); - int hend = ceil( - static_cast(ph + 1) * bin_size_h + roi_start_h); - int wend = ceil( - static_cast(pw + 1) * bin_size_w + roi_start_w); - - hstart = min(max(hstart, 0), height); - hend = min(max(hend, 0), height); - wstart = min(max(wstart, 0),width); - wend = min(max(wend, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - int gw = pw; - int gh = ph; - int c = (ctop * group_size + gh) * group_size + gw; - - const T* offset_bottom_data = - bottom_data + (roi_batch_ind * channels + c) * height * width; - T out_sum = 0; - for (int h = hstart; h < hend; ++h){ - for (int w = wstart; w < wend; ++w){ - int bottom_index = h*width + w; - out_sum += offset_bottom_data[bottom_index]; - } - } - - T bin_area = (hend - hstart) * (wend - wstart); - top_data[index] = is_empty ? 0. 
: out_sum / bin_area; - mapping_channel[index] = c; - } -} - -template -__global__ void PSRoIPoolBackward( - const int nthreads, - const T* top_diff, - const int* mapping_channel, - const int num_rois, - const T spatial_scale, - const int channels, - const int height, - const int width, - const int pooled_height, - const int pooled_width, - const int output_dim, - T* bottom_diff, - const T* bottom_rois) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { - // The output is in order (n, ctop, ph, pw) - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int n = index / pooled_width / pooled_height / output_dim; - - // [start, end) interval for spatial sampling - const T* offset_bottom_rois = bottom_rois + n * 5; - int roi_batch_ind = offset_bottom_rois[0]; - T roi_start_w = static_cast( - roundf(offset_bottom_rois[1])) * spatial_scale; - T roi_start_h = static_cast( - roundf(offset_bottom_rois[2])) * spatial_scale; - T roi_end_w = static_cast( - roundf(offset_bottom_rois[3]) + 1.) * spatial_scale; - T roi_end_h = static_cast( - roundf(offset_bottom_rois[4]) + 1.) * spatial_scale; - - // Force too small ROIs to be 1x1 - T roi_width = c10::cuda::compat::max(roi_end_w - roi_start_w, static_cast(0.1)); //avoid 0 - T roi_height = c10::cuda::compat::max(roi_end_h - roi_start_h, static_cast(0.1)); - - // Compute w and h at bottom - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - int hstart = floor( - static_cast(ph)* bin_size_h + roi_start_h); - int wstart = floor( - static_cast(pw)* bin_size_w + roi_start_w); - int hend = ceil( - static_cast(ph + 1) * bin_size_h + roi_start_h); - int wend = ceil( - static_cast(pw + 1) * bin_size_w + roi_start_w); - // Add roi offsets and clip to input boundaries - hstart = min(max(hstart, 0), height); - hend = min(max(hend, 0), height); - wstart = min(max(wstart, 0), width); - wend = min(max(wend, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - // Compute c at bottom - int c = mapping_channel[index]; - T* offset_bottom_diff = - bottom_diff + (roi_batch_ind * channels + c) * height * width; - T bin_area = (hend - hstart) * (wend - wstart); - T diff_val = is_empty ? 0. : top_diff[index] / bin_area; - for (int h = hstart; h < hend; ++h){ - for (int w = wstart; w < wend; ++w){ - int bottom_index = h * width + w; - gpu_atomic_add(diff_val, offset_bottom_diff + bottom_index); - } - } - } -} - -} // namespace - -template<> -bool PSRoIPoolOp::RunOnDevice() { - auto& X = Input(0); // Input data to pool - auto& R = Input(1); // RoIs - - auto* Y = Output(0, {R.dim32(0), output_dim_, pooled_height_, pooled_width_}, at::dtype()); // PSRoI pooled data - auto* A = Output(1, Y->sizes(), at::dtype()); // mapping_channel - int output_size = Y->numel(); - PSRoIPoolForward<<>>( - output_size, X.data(), spatial_scale_, X.dim32(1), X.dim32(2), - X.dim32(3), pooled_height_, pooled_width_, R.data(), output_dim_, - group_size_, Y->mutable_data(), A->mutable_data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - return true; -} - - -template<> -bool PSRoIPoolGradientOp::RunOnDevice() { - auto& X = Input(0); // Input data to pool - auto& R = Input(1); // RoIs - auto& A = Input(2); // mapping channels - auto& dY = Input(3); // Gradient of net w.r.t. output of "forward" op - // (aka "gradOutput") - - auto* dX = Output(0, X.sizes(), at::dtype()); // Gradient of net w.r.t. 
input to "forward" op - // (aka "gradInput") - // Must zero-out dX before accumulating gradients - math::Set( - dX->size(), 0.f, dX->mutable_data(), &context_); - PSRoIPoolBackward<<>>( - dY.size(), dY.data(), A.data(), R.dim32(0), spatial_scale_, - X.dim32(1), X.dim32(2), X.dim32(3), pooled_height_, pooled_width_, - output_dim_, dX->mutable_data(), R.data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - return true; -} - - -REGISTER_CUDA_OPERATOR(PSRoIPool, - PSRoIPoolOp); -REGISTER_CUDA_OPERATOR(PSRoIPoolGradient, - PSRoIPoolGradientOp); -} // namespace caffe2 diff --git a/modules/detectron/ps_roi_pool_op.h b/modules/detectron/ps_roi_pool_op.h deleted file mode 100644 index ecee1dd7041c4..0000000000000 --- a/modules/detectron/ps_roi_pool_op.h +++ /dev/null @@ -1,93 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef PS_ROI_POOL_OP_H_ -#define PS_ROI_POOL_OP_H_ - -#include "caffe2/core/context.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/operator.h" -#include "caffe2/utils/math.h" - -namespace caffe2 { - -template -class PSRoIPoolOp final : public Operator { - public: - PSRoIPoolOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws), - spatial_scale_(this->template GetSingleArgument( - "spatial_scale", 1.)), - group_size_(this->template GetSingleArgument("group_size", 1)), - output_dim_(this->template GetSingleArgument("output_dim", 1)) { - TORCH_DCHECK_GT(spatial_scale_, 0); - TORCH_DCHECK_GT(group_size_, 0); - pooled_height_ = group_size_; - pooled_width_ = group_size_; - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float spatial_scale_; - int group_size_; - int output_dim_; - int pooled_height_; - int pooled_width_; - int channels_; - int height_; - int width_; - }; - -template -class PSRoIPoolGradientOp final : public Operator { - public: - PSRoIPoolGradientOp(const OperatorDef& def, Workspace* ws) - : Operator(def, ws), - spatial_scale_(this->template GetSingleArgument( - "spatial_scale", 1.)), - group_size_(this->template GetSingleArgument("group_size", 1)), - output_dim_(this->template GetSingleArgument("output_dim", 1)) { - TORCH_DCHECK_GT(spatial_scale_, 0); - TORCH_DCHECK_GT(group_size_, 0); - pooled_height_ = group_size_; - pooled_width_ = group_size_; - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float spatial_scale_; - int group_size_; - int output_dim_; - int pooled_height_; - int pooled_width_; - int channels_; - int height_; - int width_; -}; - -} // namespace caffe2 - -#endif // PS_ROI_POOL_OP_H_ diff --git a/modules/detectron/roi_pool_f_op.cc b/modules/detectron/roi_pool_f_op.cc deleted file mode 100644 index 81bf8bb62ed0a..0000000000000 --- a/modules/detectron/roi_pool_f_op.cc +++ /dev/null @@ -1,99 +0,0 @@ -/** - * Copyright (c) 2016-present, 
Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "roi_pool_f_op.h" - -namespace caffe2 { - -REGISTER_CPU_OPERATOR(RoIPoolF, RoIPoolFOp); -REGISTER_CPU_OPERATOR(RoIPoolFGradient, RoIPoolFGradientOp); - -OPERATOR_SCHEMA(RoIPoolF) - .NumInputs(2) - .NumOutputs(2) - .SetDoc(R"DOC( -Region of Interest (RoI) pooling operation as used in Fast R-CNN. -)DOC") - .Arg( - "spatial_scale", - "(float) default 1.0; Spatial scale of the input feature map X " - "relative to the input image. E.g., 0.0625 if X has a stride of 16 " - "w.r.t. the input image.") - .Arg( - "pooled_h", - "(int) default 1; Pooled output Y's height.") - .Arg( - "pooled_w", - "(int) default 1; Pooled output Y's width.") - .Input( - 0, - "X", - "4D feature map input of shape (N, C, H, W).") - .Input( - 1, - "RoIs", - "2D input of shape (R, 5) specifying R RoIs with five columns " - "representing: batch index in [0, N - 1], x1, y1, x2, y2. The RoI " - "coordinates are in the coordinate system of the input image.") - .Output( - 0, - "Y", - "4D output of shape (R, C, pooled_h, pooled_w). The r-th batch element " - "is a pooled feature map cooresponding to the r-th RoI.") - .Output( - 1, - "argmaxes", - "4D output of shape (R, C, pooled_h, pooled_w). Same as Y, except it " - "records the argmax indices rather than the max pooled values."); - -OPERATOR_SCHEMA(RoIPoolFGradient) - .NumInputs(4) - .NumOutputs(1) - .Input( - 0, - "X", - "See RoIPoolF.") - .Input( - 1, - "RoIs", - "See RoIPoolF.") - .Input( - 2, - "argmaxes", - "See RoIPoolF.") - .Input( - 3, - "dY", - "Gradient of forward output 0 (Y)") - .Output( - 0, - "dX", - "Gradient of forward input 0 (X)"); - -class GetRoIPoolFGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - return SingleGradientDef( - "RoIPoolFGradient", - "", - vector{I(0), I(1), O(1), GO(0)}, - vector{GI(0)}); - } -}; - -REGISTER_GRADIENT(RoIPoolF, GetRoIPoolFGradient); - -} // namespace caffe2 diff --git a/modules/detectron/roi_pool_f_op.cu b/modules/detectron/roi_pool_f_op.cu deleted file mode 100644 index b261911b95a16..0000000000000 --- a/modules/detectron/roi_pool_f_op.cu +++ /dev/null @@ -1,187 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include "caffe2/core/context_gpu.h" -#include "modules/detectron/roi_pool_f_op.h" - -namespace caffe2 { - -namespace { - -template -inline __device__ T gpu_atomic_add(const T val, T* address); - -template <> -inline __device__ -float gpu_atomic_add(const float val, float* address) { - return atomicAdd(address, val); -} - -template -__global__ void RoIPoolFForward(const int nthreads, const T* bottom_data, - const T spatial_scale, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const T* bottom_rois, T* top_data, int* argmax_data) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { - // (n, c, ph, pw) is an element in the pooled output - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int c = (index / pooled_width / pooled_height) % channels; - int n = index / pooled_width / pooled_height / channels; - - const T* offset_bottom_rois = bottom_rois + n * 5; - int roi_batch_ind = offset_bottom_rois[0]; - int roi_start_w = roundf(offset_bottom_rois[1] * spatial_scale); - int roi_start_h = roundf(offset_bottom_rois[2] * spatial_scale); - int roi_end_w = roundf(offset_bottom_rois[3] * spatial_scale); - int roi_end_h = roundf(offset_bottom_rois[4] * spatial_scale); - - // Force malformed ROIs to be 1x1 - int roi_width = max(roi_end_w - roi_start_w + 1, 1); - int roi_height = max(roi_end_h - roi_start_h + 1, 1); - T bin_size_h = static_cast(roi_height) - / static_cast(pooled_height); - T bin_size_w = static_cast(roi_width) - / static_cast(pooled_width); - - int hstart = static_cast(floor(static_cast(ph) - * bin_size_h)); - int wstart = static_cast(floor(static_cast(pw) - * bin_size_w)); - int hend = static_cast(ceil(static_cast(ph + 1) - * bin_size_h)); - int wend = static_cast(ceil(static_cast(pw + 1) - * bin_size_w)); - - // Add roi offsets and clip to input boundaries - hstart = min(max(hstart + roi_start_h, 0), height); - hend = min(max(hend + roi_start_h, 0), height); - wstart = min(max(wstart + roi_start_w, 0), width); - wend = min(max(wend + roi_start_w, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - // Define an empty pooling region to be zero - T maxval = is_empty ? 
0 : -FLT_MAX; - // If nothing is pooled, argmax = -1 causes nothing to be backprop'd - int maxidx = -1; - const T* offset_bottom_data = - bottom_data + (roi_batch_ind * channels + c) * height * width; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - int bottom_index = h * width + w; - if (offset_bottom_data[bottom_index] > maxval) { - maxval = offset_bottom_data[bottom_index]; - maxidx = bottom_index; - } - } - } - top_data[index] = maxval; - argmax_data[index] = maxidx; - } -} - -template -__global__ void RoIPoolFBackward(const int nthreads, const T* top_diff, - const int* argmax_data, const int num_rois, const T spatial_scale, - const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, T* bottom_diff, - const T* bottom_rois) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { - // (n, c, ph, pw) is an element in the pooled output - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int c = (index / pooled_width / pooled_height) % channels; - int n = index / pooled_width / pooled_height / channels; - - const T* offset_bottom_rois = bottom_rois + n * 5; - int roi_batch_ind = offset_bottom_rois[0]; - int bottom_offset = (roi_batch_ind * channels + c) * height * width; - int top_offset = (n * channels + c) * pooled_height * pooled_width; - const T* offset_top_diff = top_diff + top_offset; - T* offset_bottom_diff = bottom_diff + bottom_offset; - const int* offset_argmax_data = argmax_data + top_offset; - - int argmax = offset_argmax_data[ph * pooled_width + pw]; - if (argmax != -1) { - gpu_atomic_add( - static_cast(offset_top_diff[ph * pooled_width + pw]), - offset_bottom_diff + argmax); - } - } -} - -} // namespace - -template<> -bool RoIPoolFOp::RunOnDevice() { - auto& X = Input(0); // Input data to pool - auto& R = Input(1); // RoIs - - if (R.size() == 0) { - // Handle empty rois - std::vector sizes = {0, X.dim32(1), pooled_height_, pooled_width_}; - /* auto* Y = */ Output(0, sizes, at::dtype()); - /* auto* A = */ Output(1, sizes, at::dtype()); - return true; - } - - auto* Y = Output(0, {R.dim32(0), X.dim32(1), pooled_height_, pooled_width_}, at::dtype()); // RoI pooled data - auto* A = Output(1, Y->sizes(), at::dtype()); // argmaxes - int output_size = Y->size(); - RoIPoolFForward<<>>( - output_size, X.data(), spatial_scale_, X.dim32(1), X.dim32(2), - X.dim32(3), pooled_height_, pooled_width_, R.data(), - Y->mutable_data(), A->mutable_data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - return true; -} - - -template<> -bool RoIPoolFGradientOp::RunOnDevice() { - auto& X = Input(0); // Input data to pool - auto& R = Input(1); // RoIs - auto& A = Input(2); // argmaxes - auto& dY = Input(3); // Gradient of net w.r.t. output of "forward" op - // (aka "gradOutput") - - auto* dX = Output(0, X.sizes(), at::dtype()); // Gradient of net w.r.t. 
input to "forward" op - // (aka "gradInput") - // Must zero-out dX before accumulating gradients - math::Set( - dX->size(), 0.f, dX->mutable_data(), &context_); - if (dY.size() > 0) { // Handle possibly empty gradient if there were no rois - RoIPoolFBackward<<>>( - dY.size(), dY.data(), A.data(), R.dim32(0), spatial_scale_, - X.dim32(1), X.dim32(2), X.dim32(3), pooled_height_, pooled_width_, - dX->mutable_data(), R.data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - } - return true; -} - - -REGISTER_CUDA_OPERATOR(RoIPoolF, - RoIPoolFOp); -REGISTER_CUDA_OPERATOR(RoIPoolFGradient, - RoIPoolFGradientOp); -} // namespace caffe2 diff --git a/modules/detectron/roi_pool_f_op.h b/modules/detectron/roi_pool_f_op.h deleted file mode 100644 index 604c5606a203e..0000000000000 --- a/modules/detectron/roi_pool_f_op.h +++ /dev/null @@ -1,81 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ROI_POOL_F_OP_H_ -#define ROI_POOL_F_OP_H_ - -#include "caffe2/core/context.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/operator.h" -#include "caffe2/utils/math.h" - -namespace caffe2 { - -template -class RoIPoolFOp final : public Operator { - public: - RoIPoolFOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws), - spatial_scale_(this->template GetSingleArgument( - "spatial_scale", 1.)), - pooled_height_(this->template GetSingleArgument("pooled_h", 1)), - pooled_width_(this->template GetSingleArgument("pooled_w", 1)) { - TORCH_DCHECK_GT(spatial_scale_, 0); - TORCH_DCHECK_GT(pooled_height_, 0); - TORCH_DCHECK_GT(pooled_width_, 0); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float spatial_scale_; - int pooled_height_; - int pooled_width_; -}; - -template -class RoIPoolFGradientOp final : public Operator { - public: - RoIPoolFGradientOp(const OperatorDef& def, Workspace* ws) - : Operator(def, ws), - spatial_scale_(this->template GetSingleArgument( - "spatial_scale", 1.)), - pooled_height_(this->template GetSingleArgument("pooled_h", 1)), - pooled_width_(this->template GetSingleArgument("pooled_w", 1)) { - TORCH_DCHECK_GT(spatial_scale_, 0); - TORCH_DCHECK_GT(pooled_height_, 0); - TORCH_DCHECK_GT(pooled_width_, 0); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float spatial_scale_; - int pooled_height_; - int pooled_width_; -}; - -} // namespace caffe2 - -#endif // ROI_POOL_F_OP_H_ diff --git a/modules/detectron/sample_as_op.cc b/modules/detectron/sample_as_op.cc deleted file mode 100644 index d22cfb8194e60..0000000000000 --- a/modules/detectron/sample_as_op.cc +++ /dev/null @@ -1,81 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "sample_as_op.h" - -namespace caffe2 { - -REGISTER_CPU_OPERATOR(SampleAs, SampleAsOp); -REGISTER_CPU_OPERATOR(SampleAsGradient, SampleAsGradientOp); - -OPERATOR_SCHEMA(SampleAs) - .NumInputs(2) - .NumOutputs(1) - .SetDoc(R"DOC( -Select the batch elements from input tensor X where the corresponding input -label value is > 0. -)DOC") - .Input( - 0, - "X", - "Tensor of at least 1D shape (N, ...).") - .Input( - 1, - "labels", - "Tensor of type int with 1D shape (N, ).") - .Output( - 0, - "Y", - "Tensor with number of dims matching X, but with the length of dim 0 " - "equal to the number of non-zero elements in labels. The batch items " - "from X corresponding to the non-zero elements in labels are copied " - "into Y."); - -OPERATOR_SCHEMA(SampleAsGradient) - .NumInputs(3) - .NumOutputs(1) - .Input( - 0, - "X", - "See SampleAs.") - .Input( - 1, - "labels", - "See SampleAs." - ) - .Input( - 2, - "dY", - "Gradient of forward output 0 (Y).") - .Output( - 0, - "dX", - "Gradient of forward input 0 (X)."); - -class GetSampleAsGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - return SingleGradientDef( - "SampleAsGradient", - "", - vector{I(0), I(1), GO(0)}, - vector{GI(0)}); - } -}; - -REGISTER_GRADIENT(SampleAs, GetSampleAsGradient); - -} // namespace caffe2 diff --git a/modules/detectron/sample_as_op.cu b/modules/detectron/sample_as_op.cu deleted file mode 100644 index a58604de2b0d0..0000000000000 --- a/modules/detectron/sample_as_op.cu +++ /dev/null @@ -1,116 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* SampleAs by Kaiming He for Mask R-CNN -X.dim32(0) = L.dim32(0) -Y's output samples are the samples of X for which L > 0. -*/ -#include - -#include "caffe2/core/context_gpu.h" -#include "modules/detectron/sample_as_op.h" - -#include - -namespace caffe2 { - -template <> -bool SampleAsOp::RunOnDevice() { - auto& X = Input(0); // Input data to be sliced - auto& L = Input(1); // Target data that provide the identity - - CAFFE_ENFORCE( - X.dim32(0) == L.dim32(0), - "X.dim32(0) must be equal to L.dim32(0)", - "(", - X.dim32(0), - " vs. 
", - L.dim32(0), - ")"); - - // copy L to CPU: - std::vector labels(L.dim32(0)); - context_.CopyBytes( - L.dim32(0) * sizeof(int), L.data(), &labels[0]); - // Make sure that the copy is finished - context_.FinishDeviceComputation(); - - int count = 0; - for (int i = 0; i < L.dim32(0); i++) { - if (labels[i] > 0) { - count++; - } - } - assert(count > 0); - - // resize Y - vector out_shape(X.sizes().vec()); - out_shape[0] = count; - auto* Y = Output(0, out_shape, at::dtype()); // Sliced data (Y.dim32(0) = num of (L > 0)) - - const int len = X.size() / X.dim32(0); - - float* output = Y->mutable_data(); - for (int i = 0; i < L.dim32(0); i++) { - if (labels[i] > 0) { - context_.CopyBytes( - len * sizeof(float), X.data() + i * len, output); - output += len; - } // if - } // i - - return true; -} - -template <> -bool SampleAsGradientOp::RunOnDevice() { - auto& X = Input(0); - auto& L = Input(1); - auto& dY = Input(2); - - - auto* dX = Output(0, X.sizes(), at::dtype()); - - // copy L to CPU: - std::vector labels(L.dim32(0)); - context_.CopyBytes( - L.dim32(0) * sizeof(int), L.data(), &labels[0]); - // Make sure that the copy is finished - context_.FinishDeviceComputation(); - - // zero-out dX - math::Set( - dX->size(), 0.f, dX->mutable_data(), &context_); - - const int len = X.size() / X.dim32(0); - - const float* input = dY.data(); - for (int i = 0; i < L.dim32(0); i++) { - if (labels[i] > 0) { - context_.CopyBytes( - len * sizeof(float), input, dX->mutable_data() + i * len); - input += len; - } // if - } // i - - return true; -} - -REGISTER_CUDA_OPERATOR(SampleAs, SampleAsOp); -REGISTER_CUDA_OPERATOR( - SampleAsGradient, - SampleAsGradientOp); -} // namespace caffe2 diff --git a/modules/detectron/sample_as_op.h b/modules/detectron/sample_as_op.h deleted file mode 100644 index 70d2214e1c8cf..0000000000000 --- a/modules/detectron/sample_as_op.h +++ /dev/null @@ -1,55 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef SAMPLE_AS_OP_H_ -#define SAMPLE_AS_OP_H_ - -#include "caffe2/core/context.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/operator.h" -#include "caffe2/utils/math.h" - -namespace caffe2 { - -template -class SampleAsOp final : public Operator { - public: - SampleAsOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws) {} - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } -}; - -template -class SampleAsGradientOp final : public Operator { - public: - SampleAsGradientOp(const OperatorDef& def, Workspace* ws) - : Operator(def, ws) {} - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } -}; - -} // namespace caffe2 - -#endif // SAMPLE_AS_OP_H_ diff --git a/modules/detectron/select_smooth_l1_loss_op.cc b/modules/detectron/select_smooth_l1_loss_op.cc deleted file mode 100644 index 7f1441032acf6..0000000000000 --- a/modules/detectron/select_smooth_l1_loss_op.cc +++ /dev/null @@ -1,107 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "select_smooth_l1_loss_op.h" - -namespace caffe2 { - -REGISTER_CPU_OPERATOR( - SelectSmoothL1Loss, - SelectSmoothL1LossOp); -REGISTER_CPU_OPERATOR( - SelectSmoothL1LossGradient, - SelectSmoothL1LossGradientOp); - -OPERATOR_SCHEMA(SelectSmoothL1Loss) - .NumInputs(4) - .NumOutputs(1) - .SetDoc(R"DOC( -RetinaNet specific op for computing Smooth L1 Loss at select locations in a 4D -tensor that encodes bounding box regression predictions. -)DOC") - .Arg( - "beta", - "(float) default 1.0; L2 to L1 transition point.") - .Arg( - "scale", - "(float) default 1.0; multiply the loss by this scale factor.") - .Input( - 0, - "Y_hat", - "4D tensor of bounding box regression predictions with shape " - "(N, 4 * num_bbox_classes * num_anchors, H, W).") - .Input( - 1, - "Y", - "2D tensor of labels shape (M, 4) for 4 contiguous channels starting " - "at each of the M locations selected by the locations input.") - .Input( - 2, - "locations", - "2D tensor of shape (M, 4) that identifies M 'select' locations " - "encoded by the four columns: (n, c, y, x). 
The loss is computed on the " - "four contiguous channel locations [c, c + 3] (inclusive).") - .Input( - 3, - "normalizer", - "Scalar; the loss is divided by max(1, normalizer).") - .Output( - 0, - "loss", - "Scalar loss."); - -OPERATOR_SCHEMA(SelectSmoothL1LossGradient) - .NumInputs(5) - .NumOutputs(1) - .Input( - 0, - "Y_hat", - "See SelectSmoothL1Loss.") - .Input( - 1, - "Y", - "See SelectSmoothL1Loss.") - .Input( - 2, - "locations", - "See SelectSmoothL1Loss.") - .Input( - 3, - "normalizer", - "See SelectSmoothL1Loss.") - .Input( - 4, - "d_loss", - "Gradient of forward output 0 (loss).") - .Output( - 0, - "d_Y_hat", - "Gradient of forward input 0 (Y_hat)."); - -class GetSelectSmoothL1LossGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - return SingleGradientDef( - "SelectSmoothL1LossGradient", - "", - vector{I(0), I(1), I(2), I(3), GO(0)}, - vector{GI(0)}); - } -}; - -REGISTER_GRADIENT(SelectSmoothL1Loss, GetSelectSmoothL1LossGradient); - -} // namespace caffe2 diff --git a/modules/detectron/select_smooth_l1_loss_op.cu b/modules/detectron/select_smooth_l1_loss_op.cu deleted file mode 100644 index 72f1d563b4c92..0000000000000 --- a/modules/detectron/select_smooth_l1_loss_op.cu +++ /dev/null @@ -1,189 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
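The SelectSmoothL1Loss schema above evaluates Smooth L1 only at M selected (n, c, y, x) locations, over the four contiguous channels [c, c + 3], divides by max(1, normalizer), and multiplies by scale; the CUDA kernel further down implements exactly this element-wise rule. A host-side sketch of the forward loss (illustrative; the name and flattened-array interface are assumptions, not part of the deleted operator):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// loss = scale / max(1, normalizer) * sum over selected locations of
//        SmoothL1(Y_hat - Y), where SmoothL1(d) = 0.5*d^2/beta if |d| < beta,
//        and |d| - 0.5*beta otherwise.
float select_smooth_l1_loss_sketch(const std::vector<float>& Y_hat,   // N x D x H x W, flattened NCHW
                                   const std::vector<float>& Y,       // M x 4 targets
                                   const std::vector<int>& locations, // M rows of (n, c, y, x)
                                   int D, int H, int W,
                                   float normalizer, float beta = 1.f, float scale = 1.f) {
  const std::size_t M = Y.size() / 4;
  float loss = 0.f;
  for (std::size_t i = 0; i < M; ++i) {
    const int n = locations[i * 4 + 0], c = locations[i * 4 + 1];
    const int y = locations[i * 4 + 2], x = locations[i * 4 + 3];
    for (int j = 0; j < 4; ++j) {
      const std::size_t idx =
          (static_cast<std::size_t>(n) * D + (c + j)) * H * W +
          static_cast<std::size_t>(y) * W + x;
      const float d = Y_hat[idx] - Y[i * 4 + j];
      const float a = std::fabs(d);
      loss += (a < beta) ? 0.5f * d * d / beta : a - 0.5f * beta;
    }
  }
  return scale * loss / std::max(normalizer, 1.f);
}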
- */ - -#include "caffe2/core/context_gpu.h" -#include "modules/detectron/select_smooth_l1_loss_op.h" - -namespace caffe2 { - -namespace { -__global__ void SelectSmoothL1Kernel( - const int D, const int H, const int W, - const int M, const float* Y_hat, const float* Y, const float* L, float* out, - const float* S, const float beta) { - // f(x) = 0.5 * x^2 / beta if |x| < beta - // |x| - 0.5 * beta otherwise - CUDA_1D_KERNEL_LOOP(i, M) { - int n = L[i * 4]; - int c = L[i * 4 + 1]; - int y = L[i * 4 + 2]; - int x = L[i * 4 + 3]; - - for (int j = 0; j < 4; j++){ - // Y_hat: N x (A * CLS * 4) x H x W - int ind = n * (D * H * W) + (c + j) * (H * W) + y * W + x; - float y_hat = Y_hat[ind]; - float y = Y[i * 4 + j]; - float val = y_hat - y; - float abs_val = c10::cuda::compat::abs(val); - if (abs_val < beta) { - out[ind] = (0.5 * val * val / beta) / c10::cuda::compat::max(S[0], static_cast(1.0)); - } else { - out[ind] = (abs_val - 0.5 * beta) / c10::cuda::compat::max(S[0], static_cast(1.0)); - } - } - } -} - - -__global__ void SelectSmoothL1GradientKernel( - const int D, const int H, const int W, - const int M, - const float* Y_hat, - const float* Y, - const float* L, - float* out, - const float* d_loss_data, - float norm, - const float* S, - float beta) { - // f'(x) = x / beta if |x| < beta - // = sign(x) otherwise - // We also scale by norm * d_loss in this kernel for convenience - CUDA_1D_KERNEL_LOOP(i, M) { - int n = L[i * 4]; - int c = L[i * 4 + 1]; - int y = L[i * 4 + 2]; - int x = L[i * 4 + 3]; - float d_loss = *d_loss_data; - - for (int j = 0; j < 4; j++) { - int ind = n * (D * H * W) + (c + j) * (H * W) + y * W + x; - float y_hat = Y_hat[ind]; - float y = Y[i * 4 + j]; - float val = y_hat - y; - float abs_val = c10::cuda::compat::abs(val); - if (abs_val < beta) { - out[ind] = norm * d_loss * val / beta / c10::cuda::compat::max(S[0], static_cast(1.0)); - } else { - out[ind] = norm * d_loss * ((float(0) < val) - (val < float(0))) / c10::cuda::compat::max(S[0], static_cast(1.0)); - } - } - } -} -} // namespace - - -template<> -bool SelectSmoothL1LossOp::RunOnDevice() { - // bbox targets predictions, for example: N x (A * 4) H x W in cls-agnostic case - auto& Y_hat = Input(0); - // true targets: for example: M x 4 where M is the #fg boxes per fpn level - auto& Y = Input(1); - // locations of fg boxes: M x 4 - auto& L = Input(2); - // total number of fg boxes across all FPN levels: scalar - auto& S = Input(3); - - - auto* avg_loss = Output(0, vector(), at::dtype()); - if (Y.size() == 0){ - math::Set( - 1, static_cast(0), avg_loss->mutable_data(), &context_); - return true; - } - - int N = Y_hat.dim32(0); - int D = Y_hat.dim32(1); - int H = Y_hat.dim32(2); - int W = Y_hat.dim32(3); - - int M = Y.dim32(0); - - // initialization - buff_.ResizeLike(Y_hat); - math::Set( - 1, static_cast(0), avg_loss->mutable_data(), &context_); - math::Set( - buff_.size(), 0.0, buff_.mutable_data(), &context_); - - // Element-wise smooth l1 loss - // l := SelectSmoothL1((y_hat - y)) - SelectSmoothL1Kernel<<>>( - D, H, W, - M, Y_hat.data(), Y.data(), - L.data(), buff_.mutable_data(), - S.data(), beta_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - // Sum of all losses - // al := sum_i l_i - float* avg_loss_data = avg_loss->mutable_data(); - math::Sum( - buff_.size(), buff_.data(), avg_loss_data, &context_); - - // Average of input batch size - math::Scale( - 1, scale_, avg_loss_data, avg_loss_data, &context_); - return true; -} - -template<> -bool SelectSmoothL1LossGradientOp::RunOnDevice() { - auto& Y_hat = Input(0); - 
auto& Y = Input(1); - auto& L = Input(2); - auto& S = Input(3); - // Below is gradient of net w.r.t. avg_loss ("gradOutput"), should be all 1's - auto& d_avg_loss = Input(4); - - auto* d_Y_hat = Output(0, Y_hat.sizes(), at::dtype()); // gradient of net w.r.t. Y_hat ("gradInput") - math::Set( - d_Y_hat->size(), 0.0, d_Y_hat->mutable_data(), &context_); - if (Y.size() == 0){ - return true; - } - - int N = Y_hat.dim32(0); - int D = Y_hat.dim32(1); - int H = Y_hat.dim32(2); - int W = Y_hat.dim32(3); - - int M = Y.dim32(0); - // Element-wise weighted difference (can be used to ignore or reweight - // specific components) - // d := (y_hat - y) - // d_Y_hat := d_avg_loss * SelectSmoothL1'((y_hat - y)) - - SelectSmoothL1GradientKernel<<size()), - CAFFE_CUDA_NUM_THREADS, - 0, context_.cuda_stream()>>>( - D, H, W, M, Y_hat.data(), Y.data(), - L.data(), d_Y_hat->mutable_data(), - d_avg_loss.data(), scale_, S.data(), beta_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - return true; -} - - -REGISTER_CUDA_OPERATOR(SelectSmoothL1Loss, - SelectSmoothL1LossOp); -REGISTER_CUDA_OPERATOR(SelectSmoothL1LossGradient, - SelectSmoothL1LossGradientOp); -} // namespace caffe2 diff --git a/modules/detectron/select_smooth_l1_loss_op.h b/modules/detectron/select_smooth_l1_loss_op.h deleted file mode 100644 index b5a3badfde716..0000000000000 --- a/modules/detectron/select_smooth_l1_loss_op.h +++ /dev/null @@ -1,77 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef SELECT_SMOOTH_L1_LOSS_OP_H_ -#define SELECT_SMOOTH_L1_LOSS_OP_H_ - -#include "caffe2/core/context.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/operator.h" -#include "caffe2/utils/math.h" - -namespace caffe2 { - -template -class SelectSmoothL1LossOp final : public Operator { - public: - SelectSmoothL1LossOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws), - beta_(this->template GetSingleArgument("beta", 1.)), - scale_(this->template GetSingleArgument("scale", 1.)) { - CAFFE_ENFORCE(beta_ > 0); - CAFFE_ENFORCE(scale_ >= 0); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float beta_; // Transition point from L1 to L2 loss - float scale_; // Scale the loss by scale_ - int dim_; // dimension for 1 anchor prediction - Tensor buff_{Context::GetDeviceType()}; // Buffer for element-wise differences -}; - -template -class SelectSmoothL1LossGradientOp final : public Operator { - public: - SelectSmoothL1LossGradientOp(const OperatorDef& def, Workspace* ws) - : Operator(def, ws), - beta_(this->template GetSingleArgument("beta", 1.)), - scale_(this->template GetSingleArgument("scale", 1.)) { - CAFFE_ENFORCE(beta_ > 0); - CAFFE_ENFORCE(scale_ >= 0); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float beta_; // Transition point from L1 to L2 loss - float scale_; // Scale the loss by scale_ - int dim_; // dimension for 1 anchor prediction - Tensor buff_{Context::GetDeviceType()}; // Buffer for element-wise differences -}; - -} // namespace caffe2 - -#endif // SELECT_SMOOTH_L1_LOSS_OP_H_ diff --git a/modules/detectron/sigmoid_cross_entropy_loss_op.cc b/modules/detectron/sigmoid_cross_entropy_loss_op.cc deleted file mode 100644 index f45ff40174bbc..0000000000000 --- a/modules/detectron/sigmoid_cross_entropy_loss_op.cc +++ /dev/null @@ -1,96 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "sigmoid_cross_entropy_loss_op.h" - -namespace caffe2 { - -REGISTER_CPU_OPERATOR( - SigmoidCrossEntropyLoss, - SigmoidCrossEntropyLossOp); -REGISTER_CPU_OPERATOR( - SigmoidCrossEntropyLossGradient, - SigmoidCrossEntropyLossGradientOp); - -OPERATOR_SCHEMA(SigmoidCrossEntropyLoss) - .NumInputs(2) - .NumOutputs(1) - .SetDoc(R"DOC( -Compute sigmoid activations followed by averaged binary cross entropy loss. The -target values may be in {-1, 0, 1}, where -1 indicates that the corresponding -sample should be ignored and {0, 1} correspond to the binary classes 0 and 1. By -default the loss is divided by the number of targets > -1 and then multiplied by -the `scale` op argument. The divisive normalization may be disable by setting -the op argument `normalize` to 0 (the multiplication by `scale` still takes -effect). 
- -This op fuses sigmoid and cross entropy for numerical stability in both forward -and gradient computation. -)DOC") - .Arg( - "scale", - "(float) default 1.0; multiply the loss by this scale factor.") - .Arg( - "normalize", - "(int) default 1; if true, divide the loss by the number of targets > " - "-1.") - .Input( - 0, - "X", - "Tensor of predicted logits (shape must be at least 1D).") - .Input( - 1, - "targets", - "Tensor of targets of type int and same shape as logits X.") - .Output( - 0, - "loss", - "Scalar loss."); - -OPERATOR_SCHEMA(SigmoidCrossEntropyLossGradient) - .NumInputs(3) - .NumOutputs(1) - .Input( - 0, - "X", - "See SigmoidCrossEntropyLoss.") - .Input( - 1, - "targets", - "See SigmoidCrossEntropyLoss.") - .Input( - 2, - "d_loss", - "Gradient of forward output 0 (loss).") - .Output( - 0, - "dX", - "Gradient of forward input 0 (X)."); - -class GetSigmoidCrossEntropyLossGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - return SingleGradientDef( - "SigmoidCrossEntropyLossGradient", - "", - vector{I(0), I(1), GO(0)}, - vector{GI(0)}); - } -}; - -REGISTER_GRADIENT(SigmoidCrossEntropyLoss, GetSigmoidCrossEntropyLossGradient); - -} // namespace caffe2 diff --git a/modules/detectron/sigmoid_cross_entropy_loss_op.cu b/modules/detectron/sigmoid_cross_entropy_loss_op.cu deleted file mode 100644 index bb86560fcb01f..0000000000000 --- a/modules/detectron/sigmoid_cross_entropy_loss_op.cu +++ /dev/null @@ -1,190 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "caffe2/core/context_gpu.h" -#include "modules/detectron/sigmoid_cross_entropy_loss_op.h" - -namespace caffe2 { - -namespace { -__global__ void ElementwiseMaxKernel(const int n, float* data, const float a) { - CUDA_1D_KERNEL_LOOP(index, n) { - data[index] = (data[index] > a) ? data[index] : a; - } -} - -__global__ void SigmoidCrossEntropyLossKernel( - const int n, - const float* logits, - const int* targets, - float* losses, - float* counts) { - CUDA_1D_KERNEL_LOOP(index, n) { - if (targets[index] == -1) { - losses[index] = 0.; - counts[index] = 0.; - } else { - losses[index] = - -1. * logits[index] * (targets[index] - (logits[index] >= 0)) + - logf( - 1 + - expf(logits[index] - 2 * logits[index] * (logits[index] >= 0))); - counts[index] = 1.; - } - } -} - -__global__ void SigmoidCrossEntropyLossGradientKernel( - const int n, - const float* logits, - const int* targets, - float* d_logits, - float* counts) { - CUDA_1D_KERNEL_LOOP(index, n) { - if (targets[index] == -1) { - d_logits[index] = 0.; - counts[index] = 0.; - } else { - d_logits[index] = 1. / (1. + expf(-logits[index])) - targets[index]; - counts[index] = 1.; - } - } -} -} // namespace - -template <> -bool SigmoidCrossEntropyLossOp::RunOnDevice() { - auto& X = Input(0); - auto& T = Input(1); - - - CAFFE_ENFORCE( - X.size() == T.size(), - "Logit and target must have the same size", - "(", - X.size(), - " vs. 
", - T.size(), - ")"); - auto* avg_loss = Output(0, vector(), at::dtype()); - counts_.ResizeLike(X); - losses_.ResizeLike(X); - ReinitializeTensor(&normalizer_, vector(), at::dtype().device(CUDA)); - SigmoidCrossEntropyLossKernel<<< - CAFFE_GET_BLOCKS(X.size()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>( - X.size(), - X.data(), - T.data(), - losses_.mutable_data(), - counts_.mutable_data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - float* avg_loss_data = avg_loss->mutable_data(); - math::Sum( - losses_.size(), losses_.data(), avg_loss_data, &context_); - if (normalize_) { - float* normalizer_data = normalizer_.mutable_data(); - math::Sum( - counts_.size(), counts_.data(), normalizer_data, &context_); - // Prevent division by zero is all counts are zero - ElementwiseMaxKernel<<< - CAFFE_GET_BLOCKS(normalizer_.size()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>(normalizer_.size(), normalizer_data, 1e-5); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - math::Div( - 1, avg_loss_data, normalizer_data, avg_loss_data, &context_); - } - math::Scale( - 1, scale_, avg_loss_data, avg_loss_data, &context_); - - return true; -} - -template <> -bool SigmoidCrossEntropyLossGradientOp::RunOnDevice() { - auto& X = Input(0); - auto& T = Input(1); - auto& d_avg_loss = Input(2); - - - auto* dX = Output(0, X.sizes(), at::dtype()); - counts_.ResizeLike(X); - ReinitializeTensor(&normalizer_, vector(), at::dtype().device(CUDA)); - SigmoidCrossEntropyLossGradientKernel<<< - CAFFE_GET_BLOCKS(X.size()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>( - X.size(), - X.data(), - T.data(), - dX->mutable_data(), - counts_.mutable_data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - if (normalize_) { - float* normalizer_data = normalizer_.mutable_data(); - math::Sum( - counts_.size(), counts_.data(), normalizer_data, &context_); - // Prevent division by zero is all counts are zero - ElementwiseMaxKernel<<< - CAFFE_GET_BLOCKS(normalizer_.size()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>(normalizer_.size(), normalizer_data, 1e-5); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - math::Div( - 1, - d_avg_loss.data(), - normalizer_data, - normalizer_data, - &context_); - math::Scale( - 1, scale_, normalizer_data, normalizer_data, &context_); - math::Scale( - dX->size(), - normalizer_data, - dX->data(), - dX->mutable_data(), - &context_); - } else { - math::Scale( - dX->size(), - scale_, - dX->data(), - dX->mutable_data(), - &context_); - math::Scale( - dX->size(), - d_avg_loss.data(), - dX->data(), - dX->mutable_data(), - &context_); - } - return true; -} - -REGISTER_CUDA_OPERATOR( - SigmoidCrossEntropyLoss, - SigmoidCrossEntropyLossOp); -REGISTER_CUDA_OPERATOR( - SigmoidCrossEntropyLossGradient, - SigmoidCrossEntropyLossGradientOp); -} // namespace caffe2 diff --git a/modules/detectron/sigmoid_cross_entropy_loss_op.h b/modules/detectron/sigmoid_cross_entropy_loss_op.h deleted file mode 100644 index 680519e9bdea9..0000000000000 --- a/modules/detectron/sigmoid_cross_entropy_loss_op.h +++ /dev/null @@ -1,78 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef SIGMOID_CROSS_ENTROPY_LOSS_OP_H_ -#define SIGMOID_CROSS_ENTROPY_LOSS_OP_H_ - -#include "caffe2/core/context.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/operator.h" -#include "caffe2/utils/math.h" - -namespace caffe2 { - -template -class SigmoidCrossEntropyLossOp final : public Operator { - public: - SigmoidCrossEntropyLossOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws), - scale_(this->template GetSingleArgument("scale", 1.)), - normalize_(this->template GetSingleArgument("normalize", 1)) { - CAFFE_ENFORCE(scale_ >= 0); - CAFFE_ENFORCE(normalize_ == 0 || normalize_ == 1); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float scale_; - int normalize_; - Tensor losses_{Context::GetDeviceType()}; - Tensor counts_{Context::GetDeviceType()}; - Tensor normalizer_; -}; - -template -class SigmoidCrossEntropyLossGradientOp final : public Operator { - public: - SigmoidCrossEntropyLossGradientOp(const OperatorDef& def, Workspace* ws) - : Operator(def, ws), - scale_(this->template GetSingleArgument("scale", 1.)), - normalize_(this->template GetSingleArgument("normalize", 1)) { - CAFFE_ENFORCE(scale_ >= 0); - CAFFE_ENFORCE(normalize_ == 0 || normalize_ == 1); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float scale_; - int normalize_; - Tensor counts_{Context::GetDeviceType()}; - Tensor normalizer_; -}; - -} // namespace caffe2 - -#endif // SIGMOID_CROSS_ENTROPY_LOSS_OP_H_ diff --git a/modules/detectron/sigmoid_focal_loss_op.cc b/modules/detectron/sigmoid_focal_loss_op.cc deleted file mode 100644 index 583e9a0de3283..0000000000000 --- a/modules/detectron/sigmoid_focal_loss_op.cc +++ /dev/null @@ -1,119 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "sigmoid_focal_loss_op.h" - -namespace caffe2 { - -REGISTER_CPU_OPERATOR(SigmoidFocalLoss, SigmoidFocalLossOp); -REGISTER_CPU_OPERATOR( - SigmoidFocalLossGradient, - SigmoidFocalLossGradientOp); - -OPERATOR_SCHEMA(SigmoidFocalLoss) - .NumInputs(3) - .NumOutputs(1) - .SetDoc(R"DOC( -The binary form of Focal Loss designed for use in RetinaNet-like models. 
-The input is assumed to be unnormalized scores (sometimes called 'logits') -arranged in a 4D tensor with shape (N, C, H, W), where N is the number of -elements in the batch, H and W are the height and width, and C = num_anchors * -num_classes defines num_anchors 'groups' of logits, each of length -num_classes. For the binary form of Focal Loss, num_classes does not include -the background category. (So, for COCO, num_classes = 80, not 81.) - -The binary form of focal loss is: - - FL(p_t) = -alpha * (1 - p_t)**gamma * log(p_t), - -where p = sigmoid(x), p_t = p or 1 - p depending on if the label is 1 or 0, -respectively. - -See: https://arxiv.org/abs/1708.02002 for details. -)DOC") - .Arg( - "scale", - "(float) default 1.0; multiply the loss by this scale factor.") - .Arg( - "alpha", - "(float) default 0.25; Focal Loss's alpha hyper-parameter.") - .Arg( - "gamma", - "(float) default 1.0; Focal Loss's gamma hyper-parameter.") - .Arg( - "num_classes", - "(int) default 80; number of classes (excluding background).") - .Input( - 0, - "logits", - "4D tensor of sigmoid inputs (called 'scores' or 'logits') with shape " - "(N, C, H, W), where C = num_anchors * num_classes.") - .Input( - 1, - "labels", - "4D tensor of labels with shape (N, num_anchors, H, W). Each entry is " - "a class label in [0, num_classes - 1] (inclusive). The label " - "identifies the one class that should have a sigmoid target of 1.") - .Input( - 2, - "normalizer", - "Scalar; the loss is normalized by 1 / max(1, normalizer)." - ) - .Output( - 0, - "loss", - "Scalar loss."); - -OPERATOR_SCHEMA(SigmoidFocalLossGradient) - .NumInputs(4) - .NumOutputs(1) - .Input( - 0, - "logits", - "See SigmoidFocalLoss.") - .Input( - 1, - "labels", - "See SigmoidFocalLoss.") - .Input( - 2, - "normalizer", - "See SigmoidFocalLoss.") - .Input( - 3, - "d_loss", - "Gradient of forward output 0 (loss)") - .Output( - 0, - "d_logits", - "Gradient of forward input 0 (logits)"); - -class GetSigmoidFocalLossGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - - vector GetGradientDefs() override { - vector blob_names{ - {I(0), I(1), I(2), GO(0)}, - }; - - return SingleGradientDef( - "SigmoidFocalLossGradient", "", blob_names, vector{GI(0)}); - } -}; - -REGISTER_GRADIENT(SigmoidFocalLoss, GetSigmoidFocalLossGradient); - -} // namespace caffe2 diff --git a/modules/detectron/sigmoid_focal_loss_op.cu b/modules/detectron/sigmoid_focal_loss_op.cu deleted file mode 100644 index e6f2dea21b5df..0000000000000 --- a/modules/detectron/sigmoid_focal_loss_op.cu +++ /dev/null @@ -1,185 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
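The SigmoidFocalLoss doc above defines the binary focal loss FL(p_t) = -alpha * (1 - p_t)**gamma * log(p_t) with p = sigmoid(x). A per-logit sketch of that formula (illustrative; it keeps the alpha / (1 - alpha) weighting of positives versus negatives used by the deleted kernel but omits the anchor/class indexing and the division by the positive count):

#include <algorithm>
#include <cmath>

// Binary focal loss for one logit x and binary target t in {0, 1}:
//   p   = sigmoid(x)
//   p_t = p if t == 1, else 1 - p
//   FL  = -w * (1 - p_t)^gamma * log(p_t), with w = alpha for positives
//         and 1 - alpha for negatives.
float sigmoid_focal_loss_sketch(float x, int t, float gamma = 1.f, float alpha = 0.25f) {
  const float p = 1.f / (1.f + std::exp(-x));
  const float p_t = (t == 1) ? p : 1.f - p;
  const float w = (t == 1) ? alpha : 1.f - alpha;
  return -w * std::pow(1.f - p_t, gamma) * std::log(std::max(p_t, 1e-12f));
}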
- */ - -#include - -#include "caffe2/core/context_gpu.h" -#include "modules/detectron/sigmoid_focal_loss_op.h" - -namespace caffe2 { - -namespace { - -__global__ void SigmoidFocalLossKernel( - const int N, const int D, const int H, const int W, const float* logits, - const int* targets, const float* weight_pos, - const float gamma, const float alpha, - const int num_classes, float* losses) { - CUDA_1D_KERNEL_LOOP(i, N * D * H * W) { - int x = i % W; - int y = (i / W) % H; - int c = (i / (W * H)) % D; // channel, here D is channel dim in input NxDxHxW - int n = i / (W * H * D); // n in NxDxHxW - - int A = D / num_classes; // num_anchors = A - int a = c / num_classes; // current anchor out of A anchors in D = A * num_cls - int d = c % num_classes; // current class - int t = targets[n * (H * W * A) + a * (H * W) + y * W + x]; // target - - // check whether the class is true class or not. - // The target classes are in range 1 - 81 and the d is in range 0-80 - // because we predict A*80 dim, so for comparison purpose, compare t and (d+1) - float c1 = (t == (d + 1)); - float c2 = (t != -1 & t != (d + 1)); - - float Np = c10::cuda::compat::max(weight_pos[0], static_cast(1.0)); - float zn = (1.0 - alpha) / Np; - float zp = alpha / Np; - - // p = 1. / 1. + expf(-x) - float p = 1. / (1. + expf(-logits[i])); - - // (1 - p)**gamma * log(p) where - float term1 = powf((1. - p), gamma) * logf(c10::cuda::compat::max(p, FLT_MIN)); - // p**gamma * log(1 - p) - float term2 = - powf(p, gamma) * - (-1. * logits[i] * (logits[i] >= 0) - - logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))); - - losses[i] = 0.0; - losses[i] += -c1 * term1 * zp; - losses[i] += -c2 * term2 * zn; - } -} - -__global__ void SigmoidFocalLossGradientKernel( - const int N, const int D, const int H, const int W, const float* logits, - const int* targets, float* dX_data, const float* weight_pos, - const float gamma, const float alpha, const int num_classes, - const float* avg_loss) { - CUDA_1D_KERNEL_LOOP(i, N * D * H * W) { - float a_loss = avg_loss[0]; - int x = i % W; - int y = (i / W) % H; - int c = (i / (W * H)) % D; - int n = i / (W * H * D); - - int A = D / num_classes; // num_anchors - int a = c / num_classes; // current anchor - int d = c % num_classes; // current class - - float Np = c10::cuda::compat::max(weight_pos[0], static_cast(1.0)); - float zn = (1.0 - alpha) / Np; - float zp = alpha / Np; - int t = targets[n * (H * W * A) + a * (H * W) + y * W + x]; - - float c1 = (t == (d + 1)); - float c2 = (t != -1 & t != (d + 1)); - float p = 1. / (1. + expf(-logits[i])); - - // (1-p)**g * (1 - p - g*p*log(p)) - float term1 = - powf((1. - p), gamma) * - (1. - p - (p * gamma * logf(c10::cuda::compat::max(p, FLT_MIN)))); - // (p**g) * (g*(1-p)*log(1-p) - p) - float term2 = - powf(p, gamma) * - ((-1. * logits[i] * (logits[i] >= 0) - - logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))) * - (1. 
- p) * gamma - p); - dX_data[i] = 0.0; - dX_data[i] += -c1 * zp * term1; - dX_data[i] += -c2 * zn * term2; - dX_data[i] = dX_data[i] * a_loss; - } -} -} // namespace - -template<> -bool SigmoidFocalLossOp::RunOnDevice() { - // Input logits, for example: N x (A * 80) x H x W in cls-agnostic - auto& X = Input(0); - // Target, for example: N x A x H x W - auto& T = Input(1); - // Number of positive examples: scalar - auto& wp = Input(2); - // output avg Sigmoid focal loss as mentioned in RetinaNet paper - - - int N = X.dim32(0); - int D = X.dim32(1); - int H = X.dim32(2); - int W = X.dim32(3); - - auto* avg_loss = Output(0, vector(), at::dtype()); - losses_.ResizeLike(X); - float* avg_loss_data = avg_loss->mutable_data(); - - SigmoidFocalLossKernel<<>>( - N, D, H, W, X.data(), T.data(), - wp.data(), gamma_, alpha_, num_classes_, - losses_.mutable_data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - math::Sum( - losses_.size(), losses_.data(), avg_loss_data, &context_); - math::Scale( - 1, scale_, avg_loss_data, avg_loss_data, &context_); - - return true; -} - - -template<> -bool SigmoidFocalLossGradientOp::RunOnDevice() { - auto& X = Input(0); - auto& T = Input(1); - auto& wp = Input(2); - auto& d_avg_loss = Input(InputSize() - 1); - - - // get input shape - int N = X.dim32(0); - int D = X.dim32(1); - int H = X.dim32(2); - int W = X.dim32(3); - - auto* dX = Output(0, X.sizes(), at::dtype()); - - SigmoidFocalLossGradientKernel<<>>( - N, D, H, W, X.data(), T.data(), dX->mutable_data(), - wp.data(), gamma_, alpha_, num_classes_, - d_avg_loss.data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - math::Scale( - dX->size(), - scale_, - dX->data(), - dX->mutable_data(), - &context_); - - return true; -} - - -REGISTER_CUDA_OPERATOR(SigmoidFocalLoss, - SigmoidFocalLossOp); -REGISTER_CUDA_OPERATOR(SigmoidFocalLossGradient, - SigmoidFocalLossGradientOp); -} // namespace caffe2 diff --git a/modules/detectron/sigmoid_focal_loss_op.h b/modules/detectron/sigmoid_focal_loss_op.h deleted file mode 100644 index 7640e0bc8a430..0000000000000 --- a/modules/detectron/sigmoid_focal_loss_op.h +++ /dev/null @@ -1,83 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
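For reference, the two factors used by SigmoidFocalLossGradientKernel above follow directly from differentiating the focal loss with respect to the logit, with p = sigmoid(x) and dp/dx = p(1 - p) (a derivation sketch):

\[
\frac{\partial}{\partial x}\Bigl[-(1-p)^{\gamma}\log p\Bigr]
  = -(1-p)^{\gamma}\bigl(1 - p - \gamma\,p\log p\bigr),
\qquad
\frac{\partial}{\partial x}\Bigl[-p^{\gamma}\log(1-p)\Bigr]
  = -p^{\gamma}\bigl(\gamma\,(1-p)\log(1-p) - p\bigr).
\]

The kernel's term1 and term2 are these expressions with the leading minus sign factored out; the sign is restored, together with the zp / zn weights and the incoming d_loss, when the result is accumulated into dX.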
- */ - -#ifndef SIGMOID_FOCAL_LOSS_OP_H_ -#define SIGMOID_FOCAL_LOSS_OP_H_ - -#include "caffe2/core/context.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/operator.h" -#include "caffe2/utils/math.h" - -namespace caffe2 { - -template -class SigmoidFocalLossOp final : public Operator { - public: - SigmoidFocalLossOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws), - scale_(this->template GetSingleArgument("scale", 1.)), - num_classes_(this->template GetSingleArgument("num_classes", 80)), - gamma_(this->template GetSingleArgument("gamma", 1.)), - alpha_(this->template GetSingleArgument("alpha", 0.25)) { - CAFFE_ENFORCE(scale_ >= 0); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float scale_; - int num_classes_; - float gamma_; - float alpha_; - Tensor losses_{Context::GetDeviceType()}; - Tensor counts_{Context::GetDeviceType()}; -}; - -template -class SigmoidFocalLossGradientOp final : public Operator { - public: - SigmoidFocalLossGradientOp(const OperatorDef& def, Workspace* ws) - : Operator(def, ws), - scale_(this->template GetSingleArgument("scale", 1.)), - num_classes_(this->template GetSingleArgument("num_classes", 80)), - gamma_(this->template GetSingleArgument("gamma", 1.)), - alpha_(this->template GetSingleArgument("alpha", 0.25)) { - CAFFE_ENFORCE(scale_ >= 0); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float scale_; - int num_classes_; - float gamma_; - float alpha_; - Tensor counts_{Context::GetDeviceType()}; - Tensor weights_{Context::GetDeviceType()}; // unignored weights -}; - -} // namespace caffe2 - -#endif // SIGMOID_FOCAL_LOSS_OP_H_ diff --git a/modules/detectron/smooth_l1_loss_op.cc b/modules/detectron/smooth_l1_loss_op.cc deleted file mode 100644 index 9ea570ac9c1b0..0000000000000 --- a/modules/detectron/smooth_l1_loss_op.cc +++ /dev/null @@ -1,117 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "smooth_l1_loss_op.h" - -namespace caffe2 { - -REGISTER_CPU_OPERATOR(SmoothL1Loss, SmoothL1LossOp); -REGISTER_CPU_OPERATOR( - SmoothL1LossGradient, - SmoothL1LossGradientOp); - -OPERATOR_SCHEMA(SmoothL1Loss) - .NumInputs(4) - .NumOutputs(1) - .SetDoc(R"DOC( -Smooth L1 Loss is a minor variation of Huber loss in which the point of -transition between L2 loss and L1 loss is adjustable by a hyper-parameter beta: - - SmoothL1(x) = 0.5 * x^2 / beta if |x| < beta - |x| - 0.5 * beta otherwise. - -SmoothL1 is used in Fast R-CNN and descendants as the loss function for bounding -box regression. - -The loss computed by this op has a flexible form: - - scale / N * sum_i alpha_out[i] * SmoothL1(alpha_in[i] * (y_hat[i] - y[i])). - -The weights alpha_in and alpha_out are called the "inside" and "outside" -weights, respectively. 
The inside weights are typically set to either 0 or 1 to -implement ignoring (when 0) certain samples. The outside weights can be used -to implement a per-sample loss weight. The overall loss is scaled by scale / N, -where N is the number of batch elements in the input predictions. -)DOC") - .Arg( - "beta", - "(float) default 1.0; L2 to L1 transition point.") - .Arg( - "scale", - "(float) default 1.0; multiply the loss by this scale factor.") - .Input( - 0, - "Y_hat", - "Tensor of predictions (at least 1D).") - .Input( - 1, - "Y", - "Tensor of labels with the same shape as Y_hat.") - .Input( - 2, - "alpha_in", - "Tensor of inside weights with the same shape as Y.") - .Input( - 3, - "alpha_out", - "Tensor of outside weights with the same shape as Y.") - .Output( - 0, - "loss", - "Scalar loss."); - -OPERATOR_SCHEMA(SmoothL1LossGradient) - .NumInputs(5) - .NumOutputs(1) - .Input( - 0, - "Y_hat", - "See SmoothL1Loss.") - .Input( - 1, - "Y", - "See SmoothL1Loss.") - .Input( - 2, - "alpha_in", - "See SmoothL1Loss.") - .Input( - 3, - "alpha_out", - "See SmoothL1Loss.") - .Input( - 4, - "d_loss", - "Gradient of forward output 0 (loss).") - .Output( - 0, - "d_Y_hat", - "Gradient of forward input 0 (Y_hat)."); - -class GetSmoothL1LossGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - return SingleGradientDef( - "SmoothL1LossGradient", - "", - vector{I(0), I(1), I(2), I(3), GO(0)}, - vector{GI(0)}); - } -}; - -REGISTER_GRADIENT(SmoothL1Loss, GetSmoothL1LossGradient); - -} // namespace caffe2 diff --git a/modules/detectron/smooth_l1_loss_op.cu b/modules/detectron/smooth_l1_loss_op.cu deleted file mode 100644 index ad2d9148c72f0..0000000000000 --- a/modules/detectron/smooth_l1_loss_op.cu +++ /dev/null @@ -1,185 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
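The SmoothL1Loss doc above defines the loss as scale / N * sum_i alpha_out[i] * SmoothL1(alpha_in[i] * (y_hat[i] - y[i])), with the inside weights used to ignore samples and the outside weights acting as per-sample loss weights. A host-side sketch of the forward computation (illustrative; names and the flat-array interface are assumptions):

#include <cmath>
#include <cstddef>
#include <vector>

// Weighted Smooth L1 loss; N is the batch size along axis 0.
float smooth_l1_loss_sketch(const std::vector<float>& y_hat,
                            const std::vector<float>& y,
                            const std::vector<float>& alpha_in,
                            const std::vector<float>& alpha_out,
                            int N, float beta = 1.f, float scale = 1.f) {
  float loss = 0.f;
  for (std::size_t i = 0; i < y.size(); ++i) {
    const float d = alpha_in[i] * (y_hat[i] - y[i]);  // inside-weighted difference
    const float a = std::fabs(d);
    const float l = (a < beta) ? 0.5f * d * d / beta : a - 0.5f * beta;
    loss += alpha_out[i] * l;                         // outside-weighted per-element loss
  }
  return scale * loss / static_cast<float>(N);
}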
- */ - -#include "caffe2/core/context_gpu.h" -#include "modules/detectron/smooth_l1_loss_op.h" - -namespace caffe2 { - -namespace { -template -__global__ void SmoothL1Kernel( - const int n, const T* in, T* out, T beta) { - // f(x) = 0.5 * x^2 / beta if |x| < beta - // |x| - 0.5 * beta otherwise - CUDA_1D_KERNEL_LOOP(index, n) { - T val = in[index]; - T abs_val = c10::cuda::compat::abs(val); - if (abs_val < beta) { - out[index] = 0.5 * val * val / beta; - } else { - out[index] = abs_val - 0.5 * beta; - } - } -} - -template -__global__ void SmoothL1GradientKernel( - const int n, - const T* in, - T* out, - const T* d_loss_data, - T norm, - T beta) { - // f'(x) = x / beta if |x| < beta - // = sign(x) otherwise - // We also scale by norm * d_loss in this kernel for convenience - CUDA_1D_KERNEL_LOOP(index, n) { - T val = in[index]; - T abs_val = c10::cuda::compat::abs(val); - T d_loss = *d_loss_data; - if (abs_val < beta) { - out[index] = norm * d_loss * val / beta; - } else { - out[index] = norm * d_loss * ((T(0) < val) - (val < T(0))); - } - } -} -} // namespace - -template<> -bool SmoothL1LossOp::RunOnDevice() { - auto& Y_hat = Input(0); - auto& Y = Input(1); - auto& alpha_in = Input(2); - auto& alpha_out = Input(3); - - - int N = Y.dim32(0); - // Require the same number of elements along axis 0 (batch size), but - // otherwise don't care about the shape (just the number of elements) - CAFFE_ENFORCE_EQ(Y_hat.dim32(0), Y.dim32(0), - "Y_hat and Y must have the same number of elements along axis 0"); - CAFFE_ENFORCE_EQ(Y_hat.size(), Y.size(), - "Y_hat and Y must have the same number of elements"); - CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_in.size()); - CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_out.size()); - - auto* avg_loss = Output(0, vector(), at::dtype()); - buff_.ResizeLike(Y); - - // Difference - // d := y_hat - y - math::Sub( - Y.size(), Y_hat.data(), Y.data(), - buff_.mutable_data(), &context_); - // Element-wise weighted difference (can be used to ignore or reweight - // specific components) - // d := alpha_in * (y_hat - y) - math::Mul( - buff_.size(), buff_.data(), alpha_in.data(), - buff_.mutable_data(), &context_); - - // Element-wise smooth l1 loss - // l := SmoothL1(alpha_in * (y_hat - y)) - SmoothL1Kernel - <<>>( - buff_.size(), buff_.data(), buff_.mutable_data(), - beta_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - // Element-wise weighted smooth l1 loss (can be used to specify a per-element - // loss weight) - // l := alpha_out * SmoothL1(alpha_in * (y_hat - y)) - math::Mul( - buff_.size(), buff_.data(), alpha_out.data(), - buff_.mutable_data(), &context_); - // Sum of all losses - // al := sum_i l_i - float* avg_loss_data = avg_loss->mutable_data(); - math::Sum( - buff_.size(), buff_.data(), avg_loss_data, &context_); - // Average of input batch size - // al := 1/N * al - math::Scale( - 1, scale_ / N, avg_loss_data, avg_loss_data, &context_); - return true; -} - -template<> -bool SmoothL1LossGradientOp::RunOnDevice() { - auto& Y_hat = Input(0); - auto& Y = Input(1); - auto& alpha_in = Input(2); - auto& alpha_out = Input(3); - auto& d_avg_loss = Input(4); // gradient of net w.r.t. 
avg_loss ("gradOutput") - // We intentially don't compute gradients for Y, alpha_{in,out} since they - // are not needed (can change in the future if desired) - - int N = Y.dim32(0); - // Require the same number of elements along axis 0 (batch size), but - // otherwise don't care about the shape (just the number of elements) - CAFFE_ENFORCE_EQ(Y_hat.dim32(0), Y.dim32(0), - "Y_hat and Y must have the same number of elements along axis 0"); - CAFFE_ENFORCE_EQ(Y_hat.size(), Y.size(), - "Y_hat and Y must have the same number of elements"); - CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_in.size()); - CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_out.size()); - CAFFE_ENFORCE_EQ(d_avg_loss.size(), 1); - - auto* d_Y_hat = Output(0, Y_hat.sizes(), at::dtype()); // gradient of net w.r.t. Y_hat ("gradInput") - buff_.ResizeLike(Y); - - // Difference - // d := y_hat - y - math::Sub( - Y.size(), Y_hat.data(), Y.data(), - buff_.mutable_data(), &context_); - // Element-wise weighted difference (can be used to ignore or reweight - // specific components) - // d := alpha_in * (y_hat - y) - math::Mul( - buff_.size(), buff_.data(), alpha_in.data(), - buff_.mutable_data(), &context_); - // d_Y_hat := d_avg_loss / N * SmoothL1'(alpha_in * (y_hat - y)) - SmoothL1GradientKernel - <<>>( - buff_.size(), buff_.data(), d_Y_hat->mutable_data(), - d_avg_loss.data(), scale_ / N, beta_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - // Element-wise scale by alpha_in and alpha_out - math::Mul( - d_Y_hat->size(), d_Y_hat->data(), alpha_in.data(), - d_Y_hat->mutable_data(), &context_); - math::Mul( - d_Y_hat->size(), d_Y_hat->data(), alpha_out.data(), - d_Y_hat->mutable_data(), &context_); - return true; -} - - -REGISTER_CUDA_OPERATOR(SmoothL1Loss, - SmoothL1LossOp); -REGISTER_CUDA_OPERATOR(SmoothL1LossGradient, - SmoothL1LossGradientOp); -} // namespace caffe2 diff --git a/modules/detectron/smooth_l1_loss_op.h b/modules/detectron/smooth_l1_loss_op.h deleted file mode 100644 index 5e5cfd882930e..0000000000000 --- a/modules/detectron/smooth_l1_loss_op.h +++ /dev/null @@ -1,75 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef SMOOTH_L1_LOSS_OP_H_ -#define SMOOTH_L1_LOSS_OP_H_ - -#include "caffe2/core/context.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/operator.h" -#include "caffe2/utils/math.h" - -namespace caffe2 { - -template -class SmoothL1LossOp final : public Operator { - public: - SmoothL1LossOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws), - beta_(this->template GetSingleArgument("beta", 1.)), - scale_(this->template GetSingleArgument("scale", 1.)) { - CAFFE_ENFORCE(beta_ > 0); - CAFFE_ENFORCE(scale_ >= 0); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float beta_; // Transition point from L1 to L2 loss - float scale_; // Scale the loss by scale_ - Tensor buff_{Context::GetDeviceType()}; // Buffer for element-wise differences -}; - -template -class SmoothL1LossGradientOp final : public Operator { - public: - SmoothL1LossGradientOp(const OperatorDef& def, Workspace* ws) - : Operator(def, ws), - beta_(this->template GetSingleArgument("beta", 1.)), - scale_(this->template GetSingleArgument("scale", 1.)) { - CAFFE_ENFORCE(beta_ > 0); - CAFFE_ENFORCE(scale_ >= 0); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float beta_; // Transition point from L1 to L2 loss - float scale_; // Scale the loss by scale_ - Tensor buff_{Context::GetDeviceType()}; // Buffer for element-wise differences -}; - -} // namespace caffe2 - -#endif // SMOOTH_L1_LOSS_OP_H_ diff --git a/modules/detectron/softmax_focal_loss_op.cc b/modules/detectron/softmax_focal_loss_op.cc deleted file mode 100644 index 7bc44571f7a5e..0000000000000 --- a/modules/detectron/softmax_focal_loss_op.cc +++ /dev/null @@ -1,104 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "modules/detectron/softmax_focal_loss_op.h" - -#include "caffe2/operators/softmax_utils.h" - -namespace caffe2 { - -REGISTER_CPU_OPERATOR(SoftmaxFocalLoss, SoftmaxFocalLossOp); -REGISTER_CPU_OPERATOR( - SoftmaxFocalLossGradient, - SoftmaxFocalLossGradientOp); - -OPERATOR_SCHEMA(SoftmaxFocalLoss) - .NumInputs(3) - .NumOutputs(2) - .SetDoc(R"DOC( -A multiclass form of Focal Loss designed for use in RetinaNet-like models. -The input is assumed to be unnormalized scores (sometimes called 'logits') -arranged in a 4D tensor with shape (N, C, H, W), where N is the number of -elements in the batch, H and W are the height and width, and C = num_anchors * -num_classes. The softmax is applied num_anchors times along the C axis. - -The softmax version of focal loss is: - - FL(p_t) = -alpha * (1 - p_t)**gamma * log(p_t), - -where p_i = exp(s_i) / sum_j exp(s_j), t is the target (ground truth) class, and -s_j is the unnormalized score for class j. - -See: https://arxiv.org/abs/1708.02002 for details. 
-)DOC") - .Arg( - "scale", - "(float) default 1.0; multiply the loss by this scale factor.") - .Arg("alpha", "(float) default 0.25; Focal Loss's alpha hyper-parameter.") - .Arg("gamma", "(float) default 1.0; Focal Loss's gamma hyper-parameter.") - .Arg( - "num_classes", - "(int) default 81; number of classes in each softmax group.") - .Input( - 0, - "scores", - "4D tensor of softmax inputs (called 'scores' or 'logits') with shape " - "(N, C, H, W), where C = num_anchors * num_classes defines num_anchors " - "groups of contiguous num_classes softmax inputs.") - .Input( - 1, - "labels", - "4D tensor of labels with shape (N, num_anchors, H, W). Each entry is " - "a class label in [0, num_classes - 1] (inclusive).") - .Input( - 2, - "normalizer", - "Scalar; the loss is normalized by 1 / max(1, normalizer).") - .Output(0, "loss", "Scalar loss.") - .Output( - 1, - "probabilities", - "4D tensor of softmax probabilities with shape (N, C, H, W), where " - "C = num_anchors * num_classes, and softmax was applied to each of the " - "num_anchors groups; within a group the num_classes values sum to 1."); - -OPERATOR_SCHEMA(SoftmaxFocalLossGradient) - .NumInputs(5) - .NumOutputs(1) - .Input(0, "scores", "See SoftmaxFocalLoss.") - .Input(1, "labels", "See SoftmaxFocalLoss.") - .Input(2, "normalizer", "See SoftmaxFocalLoss.") - .Input( - 3, - "probabilities", - "Output 1 from SoftmaxFocalLoss; See SoftmaxFocalLoss.") - .Input(4, "d_loss", "Gradient of forward output 0 (loss)") - .Output(0, "d_scores", "Gradient of forward input 0 (scores)"); - -class GetSoftmaxFocalLossGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - return SingleGradientDef( - "SoftmaxFocalLossGradient", - "", - vector{I(0), I(1), I(2), O(1), GO(0)}, - vector{GI(0)}); - } -}; - -REGISTER_GRADIENT(SoftmaxFocalLoss, GetSoftmaxFocalLossGradient); - -} // namespace caffe2 diff --git a/modules/detectron/softmax_focal_loss_op.cu b/modules/detectron/softmax_focal_loss_op.cu deleted file mode 100644 index 0612ef7edcc8c..0000000000000 --- a/modules/detectron/softmax_focal_loss_op.cu +++ /dev/null @@ -1,256 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include "caffe2/core/context_gpu.h" -#include "modules/detectron/softmax_focal_loss_op.h" - -namespace caffe2 { - -namespace { - -__global__ void SpatialSoftmaxKernel(const int N, const int A, - const int H, const int W, const float* Xdata, float* Pdata, - const int num_classes) { - CUDA_1D_KERNEL_LOOP(index, N * A * H * W) { - int D = num_classes * A; - int x = index % W; - int y = (index / W) % H; - int a = (index / (W * H)) % A; - int i = index / W / H / A; - - // Subtract max on each cell for numerical reasons - float max_val = -FLT_MAX; - for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) { - int idx = i * (H * W * D) + c * (H * W) + y * W + x; - max_val = max(max_val, Xdata[idx]); - } - // Exponentiate - float expsum = 0.0f; - for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) { - int idx = i * (H * W * D) + c * (H * W) + y * W + x; - float expx = exp(Xdata[idx] - max_val); - Pdata[idx] = expx; - expsum += expx; - } - // Normalize - for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) { - int idx = i * (H * W * D) + c * (H * W) + y * W + x; - Pdata[idx] /= expsum; - } - } -} - - -__global__ void SoftmaxFocalLossKernel( - const int N, const int A, const int H, const int W, - const float* Pdata, const int* targets, float* losses, - const float* weight_pos, const float gamma, const float alpha, - const int num_classes) { - CUDA_1D_KERNEL_LOOP(i, N * A * H * W) { - int D = A * num_classes; - int x = i % W; - int y = (i / W) % H; - int a = (i / (W * H)) % A; - int n = i / (W * H * A); - const int label = static_cast(targets[i]); - - float Np = c10::cuda::compat::max(weight_pos[0], static_cast(1.0)); - float z = (label == 0) * (1 - alpha) / Np + - (label >= 1) * alpha / Np; - - losses[i] = 0.0; - if (label >= 0) { - int offset = a * num_classes; - int idx = n * (H * W * D) + (offset + label) * (H * W) + y * W + x; - losses[i] = - -(pow(1.0f - Pdata[idx], gamma) * - log(c10::cuda::compat::max(Pdata[idx], FLT_MIN))) * z; - } - } -} - - -__global__ void SoftmaxFocalLossGradientWeightKernel( - const int N, const int A, const int H, const int W, - const float* Pdata, const int* targets, float* buff, - const float* weight_pos, const float gamma, const float alpha, - const int num_classes) { - CUDA_1D_KERNEL_LOOP(i, N * A * H * W) { - int D = A * num_classes; - int x = i % W; - int y = (i / W) % H; - int a = (i / (W * H)) % A; - int n = i / (W * H * A); - const int label = static_cast(targets[i]); - float Np = c10::cuda::compat::max(weight_pos[0], static_cast(1.0)); - float z = (label == 0) * (1 - alpha) / Np + - (label >= 1) * alpha / Np; - - buff[i] = 0.0; - if (label >= 0) { - int offset = a * num_classes; - int idx = n * (H * W * D) + (offset + label) * (H * W) + y * W + x; - float onemp = 1. 
- Pdata[idx]; - float p = Pdata[idx]; - buff[i] = - (-pow(onemp, gamma) + - gamma * pow(onemp, gamma - 1) * p * log(c10::cuda::compat::max(p, FLT_MIN))) * z; - } - } -} - - -__global__ void SoftmaxFocalLossGradientKernel( - const int N, const int D, const int H, const int W, - const float* Pdata, const int* targets, const float* buff, - const float* d_loss_data, float* dX, const int num_classes) { - CUDA_1D_KERNEL_LOOP(i, N * D * H * W) { - int A = D / num_classes; - int x = i % W; - int y = (i / W) % H; - int d = (i / (W * H)) % D; - int a = d / num_classes; - int c = d % num_classes; - int n = i / (W * H * D); - float d_loss = *d_loss_data; - - int ind = n * (H * W * A) + a * (H * W) + y * W + x; - const int label = static_cast(targets[ind]); - - float c1 = (label >= 0) * 1.0; - float c2 = (label == c) * 1.0; - dX[i] = 0.0; - dX[i] = c1 * d_loss * buff[ind] * (c2 - Pdata[i]); - } -} - -} // namespace - - -template <> -bool SoftmaxFocalLossOp::RunOnDevice() { - auto& X = Input(0); // Logits - auto& T = Input(1); // Labels - auto& wp = Input(2); // num of foreground - // average loss as output - // softmax probability, going to be re-used in gradient - - int N = X.dim32(0); - int D = X.dim32(1); - int H = X.dim32(2); - int W = X.dim32(3); - int A = D / num_classes_; - - ReinitializeTensor(&losses_, {N * A * H * W}, at::dtype().device(CUDA)); - auto* P = Output(1, {N * D * H * W}, at::dtype()); - auto* avg_loss = Output(0, vector(), at::dtype()); - math::Set( - avg_loss->size(), 0.f, avg_loss->mutable_data(), &context_); - math::Set( - P->size(), 0.f, P->mutable_data(), &context_); - math::Set( - losses_.size(), 0.f, losses_.mutable_data(), &context_); - TORCH_DCHECK_EQ(X.ndim(), 4); - - const float* Xdata = X.data(); - const float* Wdata = wp.data(); - - - // Spatial Softmax Kernel - SpatialSoftmaxKernel - <<>>( - N, A, H, W, Xdata, P->mutable_data(), num_classes_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - // Compute loss for each x,y location - const int* Tdata = T.data(); - SoftmaxFocalLossKernel - <<>>( - N, A, H, W, P->data(), Tdata, losses_.mutable_data(), - Wdata, gamma_, alpha_, num_classes_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - // sum the losses - float* avg_loss_data = avg_loss->mutable_data(); - math::Sum( - losses_.size(), losses_.data(), avg_loss_data, &context_); - math::Scale( - 1, scale_, avg_loss_data, avg_loss_data, &context_); - - return true; -} - - -template<> -bool SoftmaxFocalLossGradientOp::RunOnDevice() { - auto& X = Input(0); // Logits - auto& T = Input(1); // Label - auto& wp = Input(2); // num of foreground example - auto& P = Input(3); // Softmax Probability - auto& d_avg_loss = Input(4); - - - int N = X.dim32(0); - int D = X.dim32(1); - int H = X.dim32(2); - int W = X.dim32(3); - int A = D / num_classes_; - - ReinitializeTensor(&buff_, {N * A * H * W}, at::dtype().device(CUDA)); - - auto* dX = Output(0, X.sizes(), at::dtype()); // gradient wrt logits - - const float* Xdata = X.data(); - const int* Tdata = T.data(); - const float* Pdata = P.data(); - const float* Wdata = wp.data(); - - - // Compute the weight for gradients - SoftmaxFocalLossGradientWeightKernel - <<>>( - N, A, H, W, Pdata, Tdata, buff_.mutable_data(), - Wdata, gamma_, alpha_, num_classes_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - // Compute the gradient with the weights - const float* Bdata = buff_.data(); - SoftmaxFocalLossGradientKernel - <<>>( - N, D, H, W, Pdata, Tdata, Bdata, d_avg_loss.data(), - dX->mutable_data(), num_classes_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - math::Scale( - dX->size(), 
- scale_, - dX->data(), - dX->mutable_data(), - &context_); - return true; -} - - -REGISTER_CUDA_OPERATOR(SoftmaxFocalLoss, - SoftmaxFocalLossOp); -REGISTER_CUDA_OPERATOR(SoftmaxFocalLossGradient, - SoftmaxFocalLossGradientOp); -} // namespace caffe2 diff --git a/modules/detectron/softmax_focal_loss_op.h b/modules/detectron/softmax_focal_loss_op.h deleted file mode 100644 index 413c5bd6d7054..0000000000000 --- a/modules/detectron/softmax_focal_loss_op.h +++ /dev/null @@ -1,91 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef SOFTMAX_FOCAL_LOSS_OP_H_ -#define SOFTMAX_FOCAL_LOSS_OP_H_ - -#include "caffe2/core/context.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/operator.h" -#include "caffe2/utils/math.h" - -namespace caffe2 { - -template -class SoftmaxFocalLossOp final : public Operator { - public: - SoftmaxFocalLossOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws), - scale_(this->template GetSingleArgument("scale", 1.)), - gamma_(this->template GetSingleArgument("gamma", 1.)), - alpha_(this->template GetSingleArgument("alpha", 0.25)), - num_classes_(this->template GetSingleArgument("num_classes", 81)), - order_(StringToStorageOrder( - this->template GetSingleArgument("order", "NCHW"))) { - CAFFE_ENFORCE(scale_ >= 0); - CAFFE_ENFORCE_EQ( - order_, StorageOrder::NCHW, "Only NCHW order is supported right now."); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float scale_; - float gamma_; - float alpha_; - int num_classes_; - StorageOrder order_; - Tensor losses_; -}; - -template -class SoftmaxFocalLossGradientOp final : public Operator { - public: - SoftmaxFocalLossGradientOp(const OperatorDef& def, Workspace* ws) - : Operator(def, ws), - scale_(this->template GetSingleArgument("scale", 1.)), - gamma_(this->template GetSingleArgument("gamma", 1.)), - alpha_(this->template GetSingleArgument("alpha", 0.25)), - num_classes_(this->template GetSingleArgument("num_classes", 81)), - order_(StringToStorageOrder( - this->template GetSingleArgument("order", "NCHW"))) { - CAFFE_ENFORCE(scale_ >= 0); - CAFFE_ENFORCE_EQ( - order_, StorageOrder::NCHW, "Only NCHW order is supported right now."); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float scale_; - float gamma_; - float alpha_; - int num_classes_; - StorageOrder order_; - Tensor buff_; -}; - -} // namespace caffe2 - -#endif // SOFTMAX_FOCAL_LOSS_OP_H_ diff --git a/modules/detectron/spatial_narrow_as_op.cc b/modules/detectron/spatial_narrow_as_op.cc deleted file mode 100644 index 363aa63a8f122..0000000000000 --- a/modules/detectron/spatial_narrow_as_op.cc +++ /dev/null @@ -1,79 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "spatial_narrow_as_op.h" - -namespace caffe2 { - -REGISTER_CPU_OPERATOR(SpatialNarrowAs, SpatialNarrowAsOp); -REGISTER_CPU_OPERATOR( - SpatialNarrowAsGradient, - SpatialNarrowAsGradientOp); - -OPERATOR_SCHEMA(SpatialNarrowAs) - .NumInputs(2) - .NumOutputs(1) - .SetDoc(R"DOC( -Reduces ("narrows") the spatial extent of A to that of B by removing rows and -columns from the bottom and right. -)DOC") - .Input( - 0, - "A", - "3D or 4D input of shape (N, H0, W0) or (N, C, H0, W0).") - .Input( - 1, - "B", - "3D or 4D input of shape (N, H1, W1) or (N, C, H1, W1), where H1 <= H0 " - "and W1 <= W0.") - .Output( - 0, - "C", - "Sub window of A containing rows [0, H1 - 1] (inclusive) and columns " - "[0, W1 - 1] (inclusive)."); - -OPERATOR_SCHEMA(SpatialNarrowAsGradient) - .NumInputs(3) - .NumOutputs(1) - .Input( - 0, - "A", - "See SpatialNarrowAs.") - .Input( - 1, - "B", - "See SpatialNarrowAs.") - .Input( - 2, - "dC", - "Gradient of forward output 0 (C).") - .Output( - 0, - "dA", - "Gradient of forward input 0 (A)"); - -class SpatialNarrowAsGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - return SingleGradientDef( - "SpatialNarrowAsGradient", "", - vector{I(0), I(1), GO(0)}, - vector{GI(0)}); - } -}; -REGISTER_GRADIENT(SpatialNarrowAs, SpatialNarrowAsGradient); - -} // namespace caffe2 diff --git a/modules/detectron/spatial_narrow_as_op.cu b/modules/detectron/spatial_narrow_as_op.cu deleted file mode 100644 index ff8b5632e80a8..0000000000000 --- a/modules/detectron/spatial_narrow_as_op.cu +++ /dev/null @@ -1,165 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
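The SoftmaxFocalLoss CUDA operator and its header are removed above with no successor in this patch. For anyone still depending on them, here is a simplified libtorch sketch of the same loss; it is illustrative only, assumes int64 targets with -1 as the ignore label, applies a single alpha weight to every class, and omits the per-anchor reshaping and the foreground-count normalizer (the wp input) of the original operator.

    // Simplified stand-in for the removed Caffe2 SoftmaxFocalLoss (forward only).
    // logits: (N, C, H, W) float; targets: (N, H, W) int64, -1 = ignore.
    #include <torch/torch.h>

    torch::Tensor softmax_focal_loss(
        const torch::Tensor& logits,
        const torch::Tensor& targets,
        double gamma = 2.0,
        double alpha = 0.25,
        double scale = 1.0) {
      auto logp = torch::log_softmax(logits, /*dim=*/1);         // (N, C, H, W)
      auto valid = targets >= 0;                                 // mask out ignore labels
      auto idx = targets.clamp_min(0).unsqueeze(1).to(torch::kLong);
      auto logpt = logp.gather(1, idx).squeeze(1);               // log p_t per pixel
      auto pt = logpt.exp();
      auto loss = -alpha * torch::pow(1.0 - pt, gamma) * logpt;  // focal modulation
      return scale * (loss * valid).sum();
    }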
- */ - -#include "caffe2/core/context_gpu.h" -#include "caffe2/core/operator.h" -#include "modules/detectron/spatial_narrow_as_op.h" - -namespace caffe2 { - -namespace { -template -__global__ void CopyKernel( - const int N, - const int C, - const int in_H, - const int in_W, - const int out_H, - const int out_W, - const T* in_data, - T* out_data) { - CUDA_1D_KERNEL_LOOP(index, N * C * out_H * out_W) { - int w = index % out_W; - int h = (index / out_W) % out_H; - int c = (index / out_W / out_H) % C; - int n = (index / out_W / out_H / C); - int in_index = n * C * in_H * in_W + c * in_H * in_W + h * in_W + w; - int out_index = n * C * out_H * out_W + c * out_H * out_W + h * out_W + w; - out_data[out_index] = in_data[in_index]; - } -} - -template -__global__ void CopyGradientKernel( - const int N, - const int C, - const int in_H, - const int in_W, - const int out_H, - const int out_W, - const T* in_data, - T* out_data) { - CUDA_1D_KERNEL_LOOP(index, N * C * in_H * in_W) { - int w = index % in_W; - int h = (index / in_W) % in_H; - int c = (index / in_W / in_H) % C; - int n = (index / in_W / in_H / C); - int in_index = n * C * in_H * in_W + c * in_H * in_W + h * in_W + w; - int out_index = n * C * out_H * out_W + c * out_H * out_W + h * out_W + w; - out_data[out_index] = in_data[in_index]; - } -} -} // namespace - - -template <> -bool SpatialNarrowAsOp::RunOnDevice() { - return DispatchHelper>::call(this, Input(0)); -} - -template <> -template -bool SpatialNarrowAsOp::DoRunWithType() { - // Narrows input 0 (A) spatially to match input 1 (B) - auto& A = Input(0); - auto& B = Input(1); - - - CAFFE_ENFORCE_EQ(A.dim32(0), B.dim32(0), "Input dim 0 must be equal."); - std::vector sizes; - if (A.ndim() == B.ndim()) { - CAFFE_ENFORCE_EQ(A.dim32(1), B.dim32(1), "Input dim 1 must be equal."); - CAFFE_ENFORCE_GE( - A.dim32(2), B.dim32(2), "Input 0 height must be >= input 1 height."); - CAFFE_ENFORCE_GE( - A.dim32(3), B.dim32(3), "Input 0 width must be >= input 1 width."); - sizes = B.sizes().vec(); - } else { - // For (N, H, W) case - CAFFE_ENFORCE_EQ(A.ndim() - 1, B.ndim(), "Dimension mismatch."); - CAFFE_ENFORCE_GE( - A.dim32(2), B.dim32(1), "Input 0 height must be >= input 1 height."); - CAFFE_ENFORCE_GE( - A.dim32(3), B.dim32(2), "Input 0 width must be >= input 1 width."); - sizes = {A.dim32(0), A.dim32(1), B.dim32(1), B.dim32(2)}; - } - auto* C = Output(0, sizes, at::dtype()); - int out_width = C->dim32(3); - int out_height = C->dim32(2); - int in_width = A.dim32(3); - int in_height = A.dim32(2); - - CopyKernel<<< - CAFFE_GET_BLOCKS(C->size()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>( - C->dim32(0), - C->dim32(1), - in_height, - in_width, - out_height, - out_width, - A.template data(), - C->template mutable_data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - return true; -} - -template <> -bool SpatialNarrowAsGradientOp::RunOnDevice() { - return DispatchHelper>::call(this, Input(0)); -} - -template <> -template -bool SpatialNarrowAsGradientOp::DoRunWithType() { - auto& A = Input(0); - auto& B = Input(1); - auto& dC = Input(2); // Gradient of net w.r.t. output of forward op - auto* dA = Output(0, A.sizes(), at::dtype()); // Gradient of net w.r.t. 
input to forward op - - math::Set( - dA->size(), 0.f, dA->template mutable_data(), &context_); - int out_width = dA->dim32(3); - int out_height = dA->dim32(2); - int in_width = dC.dim32(3); - int in_height = dC.dim32(2); - - CopyGradientKernel<<< - CAFFE_GET_BLOCKS(dC.size()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>( - dA->dim32(0), - dA->dim32(1), - in_height, - in_width, - out_height, - out_width, - dC.template data(), - dA->template mutable_data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - return true; -} - -REGISTER_CUDA_OPERATOR(SpatialNarrowAs, SpatialNarrowAsOp); -REGISTER_CUDA_OPERATOR( - SpatialNarrowAsGradient, - SpatialNarrowAsGradientOp); -} // namespace caffe2 diff --git a/modules/detectron/spatial_narrow_as_op.h b/modules/detectron/spatial_narrow_as_op.h deleted file mode 100644 index a1fca861f1c26..0000000000000 --- a/modules/detectron/spatial_narrow_as_op.h +++ /dev/null @@ -1,63 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef SPATIAL_NARROW_AS_OP_H_ -#define SPATIAL_NARROW_AS_OP_H_ - -#include "caffe2/core/context.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/operator.h" -#include "caffe2/utils/math.h" - -namespace caffe2 { - -template -class SpatialNarrowAsOp final : public Operator { - public: - SpatialNarrowAsOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws) {} - USE_OPERATOR_CONTEXT_FUNCTIONS; - USE_DISPATCH_HELPER; - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - template - bool DoRunWithType(); -}; - -template -class SpatialNarrowAsGradientOp final : public Operator { - public: - SpatialNarrowAsGradientOp(const OperatorDef& def, Workspace* ws) - : Operator(def, ws) {} - USE_OPERATOR_CONTEXT_FUNCTIONS; - USE_DISPATCH_HELPER; - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - template - bool DoRunWithType(); -}; - -} // namespace caffe2 - -#endif // SPATIAL_NARROW_AS_OP_H_ diff --git a/modules/detectron/upsample_nearest_op.cc b/modules/detectron/upsample_nearest_op.cc deleted file mode 100644 index 631e17b231f91..0000000000000 --- a/modules/detectron/upsample_nearest_op.cc +++ /dev/null @@ -1,83 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
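SpatialNarrowAs is likewise removed without a replacement, but the same cropping is easy to express with tensor views. The sketch below is a minimal equivalent; it reads only B's last two dimensions, so it covers both the 4-D and the mixed 4-D/3-D signatures the old op accepted, and it returns a view rather than a copy (call clone() if the copy semantics of the deleted CopyKernel are needed).

    // Minimal stand-in for the removed SpatialNarrowAs: crop A to the spatial
    // extent of B by dropping rows and columns at the bottom and right.
    #include <torch/torch.h>

    torch::Tensor spatial_narrow_as(const torch::Tensor& A, const torch::Tensor& B) {
      const int64_t H1 = B.size(-2);
      const int64_t W1 = B.size(-1);
      TORCH_CHECK(A.size(-2) >= H1 && A.size(-1) >= W1,
                  "Input 0 must be >= input 1 in both spatial dimensions.");
      return A.narrow(-2, 0, H1).narrow(-1, 0, W1);  // rows [0, H1), cols [0, W1)
    }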
- */ - -#include "upsample_nearest_op.h" -#ifdef USE_MKLDNN -#include "caffe2/ideep/operators/operator_fallback_ideep.h" -#include "caffe2/ideep/utils/ideep_operator.h" -#endif - -namespace caffe2 { -#ifdef USE_MKLDNN -REGISTER_IDEEP_OPERATOR( - UpsampleNearest, - IDEEPFallbackOp>); -#endif - -REGISTER_CPU_OPERATOR(UpsampleNearest, UpsampleNearestOp); -REGISTER_CPU_OPERATOR( - UpsampleNearestGradient, - UpsampleNearestGradientOp); - -OPERATOR_SCHEMA(UpsampleNearest) - .NumInputs(1) - .NumOutputs(1) - .SetDoc(R"DOC( -Nearest neighbor upsampling operation. Implementation taken from THCUNN. -)DOC") - .Arg( - "scale", - "(int) default 2; integer upsampling factor.") - .Input( - 0, - "X", - "4D feature map input of shape (N, C, H, W).") - .Output( - 0, - "Y", - "4D feature map of shape (N, C, scale * H, scale * W); Values are " - "neareast neighbor samples from X."); - -OPERATOR_SCHEMA(UpsampleNearestGradient) - .NumInputs(2) - .NumOutputs(1) - .Input( - 0, - "X", - "See UpsampleNearest.") - .Input( - 1, - "dY", - "Gradient of forward output 0 (Y).") - .Output( - 0, - "dX", - "Gradient of forward input 0 (X)."); - -class GetUpsampleNearestGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - return SingleGradientDef( - "UpsampleNearestGradient", - "", - vector{I(0), GO(0)}, - vector{GI(0)}); - } -}; - -REGISTER_GRADIENT(UpsampleNearest, GetUpsampleNearestGradient); - -} // namespace caffe2 diff --git a/modules/detectron/upsample_nearest_op.cu b/modules/detectron/upsample_nearest_op.cu deleted file mode 100644 index 0ea32e348c0b3..0000000000000 --- a/modules/detectron/upsample_nearest_op.cu +++ /dev/null @@ -1,223 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Adapted from https://github.com/torch/cunn/blob/master/lib/THCUNN/SpatialUpSamplingNearest.cu - * - * Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) - * Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) - * Copyright (c) 2011-2013 NYU (Clement Farabet) - * Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, - * Leon Bottou, Iain Melvin, Jason Weston) - * Copyright (c) 2006 Idiap Research Institute (Samy Bengio) - * Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, - * Samy Bengio, Johnny Mariethoz) - * - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. 
Neither the names of NEC Laboratories American and IDIAP Research - * Institute nor the names of its contributors may be used to endorse or - * promote products derived from this software without specific prior - * written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - - -#include "caffe2/core/context_gpu.h" -#include "modules/detectron/upsample_nearest_op.h" - -namespace caffe2 { - -namespace { -__device__ int translate_idx(int ii, int d1, int d2, int d3, int scale_factor) { - int x, y, z, w; - w = ii % d3; - ii = ii/d3; - z = ii % d2; - ii = ii/d2; - y = ii % d1; - ii = ii/d1; - x = ii; - w = w/scale_factor; - z = z/scale_factor; - d2 /= scale_factor; - d3 /= scale_factor; - return (((x*d1+y)*d2)+z)*d3+w; -} - -__device__ int translate_idx_inv( - int ii, int d1, int d2, int d3, int scale_factor, int off_x, int off_y) { - int x, y, z, w; - w = ii % d3; - ii = ii/d3; - z = ii % d2; - ii = ii/d2; - y = ii % d1; - ii = ii/d1; - x = ii; - w = w*scale_factor+off_x; - z = z*scale_factor+off_y; - d2 *= scale_factor; - d3 *= scale_factor; - return (((x*d1+y)*d2)+z)*d3+w; -} - -__global__ void upscale(const float *input, float *output, long no_elements, - int scale_factor, int d1, int d2, int d3) { - long ii = threadIdx.x + blockDim.x * blockIdx.x; - ii += threadIdx.y + blockDim.y * (blockDim.x * gridDim.x) * blockIdx.y; - if (ii >= no_elements) return; - int ipidx = translate_idx(ii, d1, d2, d3, scale_factor); - output[ii]=input[ipidx]; -} - -__global__ void downscale(float *gradInput_data, const float *gradOutput_data, - long no_elements, int scale_factor, int d1, int d2, - int d3) { - long ii = threadIdx.x + blockDim.x * blockIdx.x; - ii += threadIdx.y + blockDim.y * (blockDim.x * gridDim.x) * blockIdx.y; - if (ii >= no_elements) return; - for (int i=0; i < scale_factor; i++){ - for(int j=0; j < scale_factor; j++){ - int ipidx = translate_idx_inv(ii, d1, d2, d3, scale_factor, i, j); - gradInput_data[ii] += gradOutput_data[ipidx]; - } - } -} -} // namespace - -template<> -bool UpsampleNearestOp::RunOnDevice() { - auto& X = Input(0); - auto* Y = Output(0); - - vector out_shape; - for (int i = 0; i < X.ndim(); ++i) { - out_shape.push_back(X.dim32(i)); - } - out_shape[X.ndim() - 1] *= scale_; - out_shape[X.ndim() - 2] *= scale_; - Y->Resize(out_shape); - - int d1; - int d2; - int d3; - if (X.ndim() == 3) { - d1 = Y->dim32(0); - d2 = Y->dim32(1); - d3 = Y->dim32(2); - } else { - d1 = Y->dim32(1); - d2 = Y->dim32(2); - d3 = Y->dim32(3); - } - long no_elements = Y->size(); - - const float *input_data = X.data(); - float *output_data = Y->mutable_data(); - - // cuda blocks & threads: - long nthreads = 256; - // Max number of blocks: http://en.wikipedia.org/wiki/CUDA - // 65535 for SM 2.x, 2^32 -1 for >= 3.0 - // TODO: When we move to SM 3.5 
we should update this - long n_xblocks = min(max((int)ceil((float)no_elements / nthreads), 1), 65535); - long n_yblocks = (long)ceil( - (float)no_elements / (float)(n_xblocks * nthreads)); - CAFFE_ENFORCE(n_yblocks <= 65535); - dim3 blocks(n_xblocks, n_yblocks); - dim3 threads(nthreads); - - upscale<<>>( - input_data, output_data, no_elements, scale_, d1, d2, d3); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - return true; -} - - -template<> -bool UpsampleNearestGradientOp::RunOnDevice() { - auto& X = Input(0); // Original input to "forward" op - auto& dY = Input(1); // Gradient of net w.r.t. output of "forward" op - // (aka "gradOutput") - auto* dX = Output(0); // Gradient of net w.r.t. input to "forward" op - // (aka "gradInput") - - dX->ResizeLike(X); - float *gradInput_data = dX->mutable_data(); - const float *gradOutput_data = dY.data(); - - int d1; - int d2; - int d3; - if (dX->ndim() == 3) { - d1 = dX->dim32(0); - d2 = dX->dim32(1); - d3 = dX->dim32(2); - } else { - d1 = dX->dim32(1); - d2 = dX->dim32(2); - d3 = dX->dim32(3); - } - long no_elements = dX->size(); - - // cuda blocks & threads: - long nthreads = 256; - // Max number of blocks: http://en.wikipedia.org/wiki/CUDA - // 65535 for SM 2.x, 2^32 -1 for >= 3.0 - // TODO: When we move to SM 3.5 we should update this - long n_xblocks = min(max((int)ceil((float)no_elements / nthreads), 1), 65535); - long n_yblocks = (long)ceil( - (float)no_elements / (float)(n_xblocks * nthreads)); - CAFFE_ENFORCE(n_yblocks <= 65535); - dim3 blocks(n_xblocks, n_yblocks); - dim3 threads(nthreads); - - math::Set(no_elements, 0.f, gradInput_data, &context_); - downscale<<>>( - gradInput_data, gradOutput_data, no_elements, scale_, d1, d2, d3); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - return true; -} - -REGISTER_CUDA_OPERATOR(UpsampleNearest, - UpsampleNearestOp); -REGISTER_CUDA_OPERATOR(UpsampleNearestGradient, - UpsampleNearestGradientOp); -} // namespace caffe2 diff --git a/modules/detectron/upsample_nearest_op.h b/modules/detectron/upsample_nearest_op.h deleted file mode 100644 index f850f0381a1e8..0000000000000 --- a/modules/detectron/upsample_nearest_op.h +++ /dev/null @@ -1,106 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
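The removed UpsampleNearest kernels map output pixel (i, j) to input pixel (i / scale, j / scale) for an integer scale factor. repeat_interleave along the last two dimensions reproduces that exactly, and torch::nn::functional::interpolate with torch::kNearest is the more general replacement. A minimal sketch, assuming the NCHW (or N, H, W) layouts the old operator accepted:

    // Minimal stand-in for the removed UpsampleNearest (integer scale factor).
    #include <torch/torch.h>

    torch::Tensor upsample_nearest(const torch::Tensor& X, int64_t scale = 2) {
      TORCH_CHECK(scale >= 1, "scale must be >= 1");
      // Repeat every row and column `scale` times; output[i] comes from input[i / scale].
      return X.repeat_interleave(scale, /*dim=*/-2).repeat_interleave(scale, /*dim=*/-1);
    }

Autograd of repeat_interleave sums the incoming gradient over each scale-by-scale block, which is the same reduction the deleted downscale kernel computed by hand.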
- */ - -#ifndef UPSAMPLE_NEAREST_OP_H_ -#define UPSAMPLE_NEAREST_OP_H_ - -#include "caffe2/core/context.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/operator.h" -#include "caffe2/utils/math.h" - -namespace caffe2 { - -template -class UpsampleNearestOp final : public Operator { - public: - UpsampleNearestOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws), - scale_(this->template GetSingleArgument("scale", 2)) { - TORCH_DCHECK_GE(scale_, 1); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - auto& X = Input(0); - - auto out_shape = X.sizes().vec(); - out_shape[X.dim() - 1] *= scale_; - out_shape[X.dim() - 2] *= scale_; - auto* Y = Output(0, out_shape, at::dtype()); - - int d1; - int d2; - int d3; - if (X.dim() == 3) { - d1 = Y->dim32(0); - d2 = Y->dim32(1); - d3 = Y->dim32(2); - } else { - d1 = Y->dim32(0) * Y->dim32(1); - d2 = Y->dim32(2); - d3 = Y->dim32(3); - } - - const T *input_data = X.template data(); - T *output_data = Y->template mutable_data(); - int scaled_d2 = d2 / scale_; - int scaled_d3 = d3 / scale_; - -#ifdef _OPENMP -#pragma omp parallel for -#endif - for (int i = 0; i < d1; ++i) { - for (int j = 0; j < d2; ++j) { - for (int u = 0; u < d3; ++u) { - int ii = (i * d2 + j) * d3 + u; - int scaled_u = u / scale_; - int scaled_j = j / scale_; - int ipidx = ((i * scaled_d2) + scaled_j) * scaled_d3 + scaled_u; - output_data[ii] = input_data[ipidx]; - } - } - } - - return true; - } - - protected: - int scale_; -}; - -template -class UpsampleNearestGradientOp final : public Operator { - public: - UpsampleNearestGradientOp(const OperatorDef& def, Workspace* ws) - : Operator(def, ws), - scale_(this->template GetSingleArgument("scale", 2)) { - TORCH_DCHECK_GE(scale_, 1); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - int scale_; -}; - -} // namespace caffe2 - -#endif // UPSAMPLE_NEAREST_OP_H_ diff --git a/modules/detectron/upsample_nearest_op_test.py b/modules/detectron/upsample_nearest_op_test.py deleted file mode 100644 index 276d50474d1fe..0000000000000 --- a/modules/detectron/upsample_nearest_op_test.py +++ /dev/null @@ -1,42 +0,0 @@ -import unittest - -import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st -import numpy as np -from caffe2.python import core, dyndep -from hypothesis import given, settings - - -dyndep.InitOpsLibrary("@/caffe2/modules/detectron:detectron_ops") - - -class TestUpsampleNearestOp(hu.HypothesisTestCase): - @given( - N=st.integers(1, 3), - H=st.integers(10, 300), - W=st.integers(10, 300), - scale=st.integers(1, 3), - **hu.gcs, - ) - @settings(deadline=None, max_examples=20) - def test_upsample_nearest_op(self, N, H, W, scale, gc, dc): - C = 32 - X = np.random.randn(N, C, H, W).astype(np.float32) - op = core.CreateOperator("UpsampleNearest", ["X"], ["Y"], scale=scale) - - def ref(X): - outH = H * scale - outW = W * scale - outH_idxs, outW_idxs = np.meshgrid( - np.arange(outH), np.arange(outW), indexing="ij" - ) - inH_idxs = (outH_idxs / scale).astype(np.int32) - inW_idxs = (outW_idxs / scale).astype(np.int32) - Y = X[:, :, inH_idxs, inW_idxs] - return [Y] - - self.assertReferenceChecks(device_option=gc, op=op, inputs=[X], reference=ref) - - -if __name__ == "__main__": - unittest.main() diff --git a/modules/module_test/CMakeLists.txt b/modules/module_test/CMakeLists.txt deleted file mode 100644 index f72120d535f30..0000000000000 --- 
a/modules/module_test/CMakeLists.txt +++ /dev/null @@ -1,23 +0,0 @@ -if(NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - # If we are building the standalone module, we set the proper cmake variables. - cmake_minimum_required(VERSION 3.0 FATAL_ERROR) - find_package(Caffe2 REQUIRED) - set(BUILD_TEST ON) - option(BUILD_SHARED_LIBS "Build shared libs." ON) -endif() - -if(BUILD_TEST AND NOT BUILD_LITE_INTERPRETER) - add_library( - caffe2_module_test_dynamic - ${CMAKE_CURRENT_SOURCE_DIR}/module_test_dynamic.cc) - - if(HAVE_SOVERSION) - set_target_properties(caffe2_module_test_dynamic PROPERTIES - VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION}) - endif() - target_link_libraries(caffe2_module_test_dynamic torch_library) - install(TARGETS caffe2_module_test_dynamic DESTINATION lib) - if(MSVC AND BUILD_SHARED_LIBS) - install(FILES $ DESTINATION lib OPTIONAL) - endif() -endif() diff --git a/modules/module_test/module_test_dynamic.cc b/modules/module_test/module_test_dynamic.cc deleted file mode 100644 index 32596167a3761..0000000000000 --- a/modules/module_test/module_test_dynamic.cc +++ /dev/null @@ -1,41 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "caffe2/core/module.h" -#include "caffe2/core/operator.h" - -// An explicitly defined module, testing correctness when we dynamically link a -// module -CAFFE2_MODULE(caffe2_module_test_dynamic, "Dynamic module only used for testing."); - -namespace caffe2 { - -class Caffe2ModuleTestDynamicDummyOp : public OperatorBase { - public: - using OperatorBase::OperatorBase; - bool Run(int /* unused */ /*stream_id*/) override { - return true; - } - virtual string type() { - return "base"; - } -}; - -REGISTER_CPU_OPERATOR( - Caffe2ModuleTestDynamicDummy, Caffe2ModuleTestDynamicDummyOp); -OPERATOR_SCHEMA(Caffe2ModuleTestDynamicDummy); - -} // namespace caffe2 diff --git a/modules/observers/CMakeLists.txt b/modules/observers/CMakeLists.txt deleted file mode 100644 index 050b8a1461e32..0000000000000 --- a/modules/observers/CMakeLists.txt +++ /dev/null @@ -1,32 +0,0 @@ -if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - if(NOT USE_OBSERVERS) - return() - endif() -else() - cmake_minimum_required(VERSION 3.0 FATAL_ERROR) - project(caffe2_observers CXX) - find_package(Caffe2 REQUIRED) - option(BUILD_SHARED_LIBS "Build shared libs." ON) -endif() - -add_library(caffe2_observers - "${CMAKE_CURRENT_SOURCE_DIR}/net_observer_reporter_print.cc" - "${CMAKE_CURRENT_SOURCE_DIR}/observer_config.cc" - "${CMAKE_CURRENT_SOURCE_DIR}/perf_observer.cc" - ) -if(HAVE_SOVERSION) - set_target_properties(caffe2_observers PROPERTIES - VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION}) -endif() -target_link_libraries(caffe2_observers PUBLIC torch_library) -target_include_directories(caffe2_observers PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/..) 
-target_compile_options(caffe2_observers PRIVATE "-DCAFFE2_BUILD_OBSERVER_LIB") -install(TARGETS caffe2_observers DESTINATION lib) -caffe2_interface_library(caffe2_observers caffe2_observers_library) -if(MSVC AND BUILD_SHARED_LIBS) - install(FILES $ DESTINATION lib OPTIONAL) -endif() - -if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - set(Caffe2_MODULES ${Caffe2_MODULES} caffe2_observers_library PARENT_SCOPE) -endif() diff --git a/modules/observers/macros.h b/modules/observers/macros.h deleted file mode 100644 index e69b055d2a1d5..0000000000000 --- a/modules/observers/macros.h +++ /dev/null @@ -1,7 +0,0 @@ -#include "c10/macros/Macros.h" - -#ifdef CAFFE2_BUILD_OBSERVER_LIB -#define CAFFE2_OBSERVER_API C10_EXPORT -#else -#define CAFFE2_OBSERVER_API C10_IMPORT -#endif diff --git a/modules/observers/net_observer_reporter.h b/modules/observers/net_observer_reporter.h deleted file mode 100644 index bfccef64cee2b..0000000000000 --- a/modules/observers/net_observer_reporter.h +++ /dev/null @@ -1,38 +0,0 @@ -#pragma once - -#include - -#include "caffe2/core/common.h" -#include "caffe2/core/net.h" -#include "observers/macros.h" - -namespace caffe2 { - -struct PerformanceInformation { - // Analytic - int64_t flops = 0; - int64_t bytes_written = 0; - int64_t bytes_read = 0; - std::vector tensor_shapes = {}; - std::vector args = {}; - std::string engine = ""; // the engine used - std::string type = ""; // the type of the operator - // Measured - double latency = 0; - double cpuMilliseconds = 0; -}; - -class CAFFE2_OBSERVER_API NetObserverReporter { - public: - virtual ~NetObserverReporter() = default; - - /* - Report the delay metric collected by the observer. - The delays are saved in a map. The key is an identifier associated - with the reported delay. The value is the delay value in float - */ - virtual void report( - NetBase* net, - std::map&) = 0; -}; -} diff --git a/modules/observers/net_observer_reporter_print.cc b/modules/observers/net_observer_reporter_print.cc deleted file mode 100644 index dca9cbba44bf1..0000000000000 --- a/modules/observers/net_observer_reporter_print.cc +++ /dev/null @@ -1,158 +0,0 @@ -#include "observers/net_observer_reporter_print.h" - -#include -#include -#include "caffe2/core/init.h" -#include "observers/observer_config.h" - -#include - -namespace caffe2 { - -const std::string NetObserverReporterPrint::IDENTIFIER = "Caffe2Observer "; -static std::string get_op_args(PerformanceInformation p); -static std::string get_tensor_shapes(PerformanceInformation p); -static std::string sanatize(std::string json_s); - -void NetObserverReporterPrint::report( - NetBase* net, - std::map& info) { - // Not allowed to use json library - std::vector> caffe2_perf; - - for (auto& p : info) { - if ((p.first == "NET_DELAY") && (info.size() == 1)) { - // for Net_delay perf - caffe2_perf.push_back({{"type", "NET"}, - {"value", c10::to_string(p.second.latency * 1000)}, - {"unit", "us"}, - {"metric", "latency"}}); - caffe2_perf.push_back({{"type", "NET_"}, - { - "value", - c10::to_string( - p.second.cpuMilliseconds / - p.second.latency * - 100), - }, - {"unit", "percent"}, - {"metric", "cpu_percent"}}); - } else if (p.first != "NET_DELAY") { - // for operator perf - std::string shape_str = get_tensor_shapes(p.second); - std::string args_str = get_op_args(p.second); - std::string type = p.first; - caffe2_perf.push_back({{"type", type}, - {"value", c10::to_string(p.second.latency * 1000)}, - {"unit", "us"}, - {"metric", "latency"}}); - caffe2_perf.push_back({{"type", type}, - { - "value", - 
c10::to_string( - p.second.cpuMilliseconds / - p.second.latency * - 100), - }, - {"unit", "percent"}, - {"metric", "cpu_percent"}}); - if (p.second.flops > 0) { - caffe2_perf.push_back({{"type", type}, - {"value", c10::to_string(p.second.flops)}, - {"unit", "flop"}, - {"metric", "flops"}}); - } - if (shape_str != "") { - caffe2_perf.push_back({{"type", type}, - {"info_string", shape_str}, - {"unit", ""}, - {"metric", "tensor_shapes"}}); - } - if (args_str != "") { - caffe2_perf.push_back({{"type", type}, - {"info_string", args_str}, - {"unit", ""}, - {"metric", "op_args"}}); - } - } - } - - // NOLINTNEXTLINE(modernize-loop-convert) - for (auto it = caffe2_perf.begin(); it != caffe2_perf.end(); it++) { - std::stringstream buffer; - auto entry = *it; - buffer << IDENTIFIER << "{"; - // NOLINTNEXTLINE(modernize-raw-string-literal) - buffer << "\"type\": \"" << sanatize(entry["type"]) << "\"," - // NOLINTNEXTLINE(modernize-raw-string-literal) - << "\"unit\": \"" << sanatize(entry["unit"]) << "\"," - // NOLINTNEXTLINE(modernize-raw-string-literal) - << "\"metric\": \"" << sanatize(entry["metric"]) << "\","; - if (entry.find("value") != entry.end()) { - // NOLINTNEXTLINE(modernize-raw-string-literal) - buffer << "\"value\": \"" << sanatize(entry["value"]) << "\""; - } else if (entry.find("info_string") != entry.end()) { - // NOLINTNEXTLINE(modernize-raw-string-literal) - buffer << "\"info_string\": \"" << sanatize(entry["info_string"]) << "\""; - } - buffer << "}"; - LOG(INFO) << buffer.str(); - } -} - -static std::string get_tensor_shapes(PerformanceInformation p) { - std::string shape_str; - std::stringstream shape_stream; - if (!p.tensor_shapes.empty()) { - shape_stream << "["; - for (const auto i : c10::irange(p.tensor_shapes.size())) { - shape_stream << "["; - for (int j = 0; j < p.tensor_shapes[i].dims_size(); j++) { - shape_stream << p.tensor_shapes[i].dims(j) << ", "; - } - shape_stream << "], "; - } - shape_stream << "]"; - shape_str = shape_stream.str(); - } else { - shape_str = ""; - } - return shape_str; -} - -static std::string get_op_args(PerformanceInformation p) { - std::string args_str; - if (!p.args.empty()) { - std::stringstream args; - args << "["; - for (const auto i : c10::irange(p.args.size())) { - args << "{" << p.args[i].name() << ": "; - if (p.args[i].has_i()) { - args << p.args[i].i(); - } else if (p.args[i].has_s()) { - args << p.args[i].s(); - } else if (p.args[i].has_n()) { - args << &p.args[i].n(); - } else if (p.args[i].has_f()) { - args << p.args[i].f(); - } else { - args << "None"; - } - args << "}, "; - } - args << "]"; - args_str = args.str(); - } else { - args_str = ""; - } - return args_str; -} - -static std::string sanatize(std::string json_s) { - // Remove illegal characters from the name that would cause json string to - // become invalid - json_s.erase(std::remove(json_s.begin(), json_s.end(), '"'), json_s.end()); - json_s.erase(std::remove(json_s.begin(), json_s.end(), '\\'), json_s.end()); - return json_s; -} -} diff --git a/modules/observers/net_observer_reporter_print.h b/modules/observers/net_observer_reporter_print.h deleted file mode 100644 index 5d4640c24c994..0000000000000 --- a/modules/observers/net_observer_reporter_print.h +++ /dev/null @@ -1,16 +0,0 @@ -#pragma once - -#include "observers/macros.h" -#include "observers/net_observer_reporter.h" - -#include "caffe2/core/common.h" - -namespace caffe2 { - -class CAFFE2_OBSERVER_API NetObserverReporterPrint : public NetObserverReporter { - public: - static const std::string IDENTIFIER; - void 
report(NetBase* net, std::map&) override; -}; - -} // namespace caffe2 diff --git a/modules/observers/observer_config.cc b/modules/observers/observer_config.cc deleted file mode 100644 index c6ba6a2d370c0..0000000000000 --- a/modules/observers/observer_config.cc +++ /dev/null @@ -1,12 +0,0 @@ -#include "observers/observer_config.h" - -namespace caffe2 { - -int ObserverConfig::netInitSampleRate_ = 0; -int ObserverConfig::netFollowupSampleRate_ = 0; -int ObserverConfig::netFollowupSampleCount_ = 0; -int ObserverConfig::operatorNetSampleRatio_ = 0; -int ObserverConfig::skipIters_ = 0; -unique_ptr ObserverConfig::reporter_ = nullptr; -int ObserverConfig::marker_ = -1; -} diff --git a/modules/observers/observer_config.h b/modules/observers/observer_config.h deleted file mode 100644 index cc967263a66b9..0000000000000 --- a/modules/observers/observer_config.h +++ /dev/null @@ -1,99 +0,0 @@ -#pragma once - -#include "observers/macros.h" -#include "observers/net_observer_reporter.h" - -#include "caffe2/core/common.h" - -namespace caffe2 { - -/* - netInitSampleRate_ == 1 && operatorNetSampleRatio_ == 1 : - Log operator metrics in every iteration - netInitSampleRate_ == 1 && operatorNetSampleRatio_ == 0 : - Log net metrics in every iterationn - netInitSampleRate_ == n && netFollowupSampleRate_ == m && - netFollowupSampleCount == c && operatorNetSampleRatio_ == 1 : - Log operator metrics first at odds of 1 / n. Once first logged, - the following c logs are at odds of 1 / min(n, m). Then repeat - netInitSampleRate_ == n && netFollowupSampleRate_ == m && - netFollowupSampleCount == c && operatorNetSampleRatio_ == 0 : - Log net metrics first at odds of 1 / n. Once first logged, - the following c logs are at odds of 1 / min(n, m). Then repeat - netInitSampleRate_ == n && netFollowupSampleRate_ == m && - netFollowupSampleCount == c && operatorNetSampleRatio_ == o : - Log net metrics first at odds of 1 / n. Once first logged, - the following c logs are at odds of 1 / min(n, m), if the random number - is multiples of o, log operator metrics instead. Then repeat - skipIters_ == n: skip the first n iterations of the net. 
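To make the sampling rules above concrete, a call such as the one below (values are illustrative only) asks for a first net-delay sample at odds of 1/1000, ten follow-up samples at odds of 1/100, operator-level metrics on roughly one in ten of the sampled runs, and no sampling during the first 50 iterations; the aiBench_* flags defined in perf_observer.cc further down feed exactly these parameters.

    caffe2::ObserverConfig::initSampleRate(
        /*netInitSampleRate=*/1000,
        /*netFollowupSampleRate=*/100,  // must be <= netInitSampleRate
        /*netFollowupSampleCount=*/10,
        /*operatorNetSampleRatio=*/10,
        /*skipIters=*/50);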
-*/ -class CAFFE2_OBSERVER_API ObserverConfig { - public: - static void initSampleRate( - int netInitSampleRate, - int netFollowupSampleRate, - int netFollowupSampleCount, - int operatorNetSampleRatio, - int skipIters) { - CAFFE_ENFORCE(netFollowupSampleRate <= netInitSampleRate); - CAFFE_ENFORCE(netFollowupSampleRate >= 1 || netInitSampleRate == 0); - netInitSampleRate_ = netInitSampleRate; - netFollowupSampleRate_ = netFollowupSampleRate; - netFollowupSampleCount_ = netFollowupSampleCount; - operatorNetSampleRatio_ = operatorNetSampleRatio; - skipIters_ = skipIters; - } - static int getNetInitSampleRate() { - return netInitSampleRate_; - } - static int getNetFollowupSampleRate() { - return netFollowupSampleRate_; - } - static int getNetFollowupSampleCount() { - return netFollowupSampleCount_; - } - static int getOpoeratorNetSampleRatio() { - return operatorNetSampleRatio_; - } - static int getSkipIters() { - return skipIters_; - } - static void setReporter(unique_ptr reporter) { - reporter_ = std::move(reporter); - } - static NetObserverReporter* getReporter() { - CAFFE_ENFORCE(reporter_); - return reporter_.get(); - } - static void setMarker(int marker) { - marker_ = marker; - } - static int getMarker() { - return marker_; - } - - private: - /* The odds of log net metric initially or immediately after reset */ - static int netInitSampleRate_; - - /* The odds of log net metric after log once after start of reset */ - static int netFollowupSampleRate_; - - /* The number of follow up logs to be collected for odds of - netFollowupSampleRate_ */ - static int netFollowupSampleCount_; - - /* The odds to log the operator metric instead of the net metric. - When the operator is logged the net is not logged. */ - static int operatorNetSampleRatio_; - - /* skip the first few iterations */ - static int skipIters_; - - static unique_ptr reporter_; - - /* marker used in identifying the metrics in certain reporters */ - static int marker_; -}; - -} diff --git a/modules/observers/perf_observer.cc b/modules/observers/perf_observer.cc deleted file mode 100644 index cfd6130f7255e..0000000000000 --- a/modules/observers/perf_observer.cc +++ /dev/null @@ -1,330 +0,0 @@ -#include "observers/perf_observer.h" -#include "observers/observer_config.h" -#ifndef C10_MOBILE -#include "caffe2/core/flags.h" -#include "observers/net_observer_reporter_print.h" -#endif - -#include -// NOLINTNEXTLINE(modernize-deprecated-headers) -#include -#include "caffe2/core/common.h" -#include "caffe2/core/init.h" -#include "caffe2/core/operator.h" - -#if defined(TARGET_OS_MAC) || \ -defined(TARGET_OS_IPHONE) || \ -defined(TARGET_IPHONE_SIMULATOR) -#define _APPLE 1 -#endif - -#ifdef _WIN32 -#ifndef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN -#endif -#include -#endif - -#ifdef _APPLE -#include -#include -#include -#endif - -#ifndef C10_MOBILE -C10_DEFINE_int64( - aiBench_netInitSampleRate, - 0, - "One in N sampling rate for net delay"); - -C10_DEFINE_int64( - aiBench_netFollowupSampleRate, - 0, - "One in N sampling rate for net delay"); - -C10_DEFINE_int64( - aiBench_netFollowupSampleCount, - 0, - "control the following c logs"); - -C10_DEFINE_int64( - aiBench_operatorNetSampleRatio, - 0, - "One in N sampling rate for operator delay"); - -C10_DEFINE_int64( - aiBench_skipIters, - 0, - "skip the first N iterations of the net run"); -#endif - -namespace caffe2 { -namespace { - -bool registerGlobalPerfNetObserverCreator(int* /*pargc*/, char*** /*pargv*/) { - AddGlobalNetObserverCreator([](NetBase* subject) { - return 
std::make_unique(subject); - }); - -#if !defined(C10_MOBILE) - // for aibench usage - caffe2::ObserverConfig::setReporter( - std::make_unique()); - - caffe2::ObserverConfig::initSampleRate( - FLAGS_aiBench_netInitSampleRate, - FLAGS_aiBench_netFollowupSampleRate, - FLAGS_aiBench_netFollowupSampleCount, - FLAGS_aiBench_operatorNetSampleRatio, - FLAGS_aiBench_skipIters); -#endif - - return true; -} -} // namespace - -#ifdef _WIN32 -double getTicksPerMillisecond() { - static LARGE_INTEGER ticks_per_sec; - if (!ticks_per_sec.QuadPart) { - QueryPerformanceFrequency(&ticks_per_sec); - if (!ticks_per_sec.QuadPart) { - return 0.0; - } - } - - return static_cast(ticks_per_sec.QuadPart) / 1000.0; -} -#elif !defined _APPLE -double getClockTimeMilliseconds(clockid_t clk_id) { - int result; - struct timespec tp; - result = clock_gettime(clk_id, &tp); - if (result == -1) { - return 0.0; - } else { - return tp.tv_sec * 1000.0 + tp.tv_nsec / 1000000.0; - } -} -#endif - -double getWallClockTimeMilliseconds() { -#ifdef _WIN32 - double ticks_per_ms = getTicksPerMillisecond(); - if (ticks_per_ms) { - LARGE_INTEGER ticks; - if (QueryPerformanceCounter(&ticks)) { - return static_cast(ticks.QuadPart) / ticks_per_ms; - } - } - - return 0.0; -#elif defined _APPLE - static mach_timebase_info_data_t info; - if (info.denom == 0) { - mach_timebase_info(&info); - } - - uint64_t now = mach_absolute_time(); - now = now * info.numer / info.denom; // convert to nanoseconds - return now / 1000000.0; -#else - return getClockTimeMilliseconds(CLOCK_MONOTONIC); -#endif -} - -double getCpuTimeMilliseconds() { -#ifdef _WIN32 - FILETIME creation_time; - FILETIME exit_time; - FILETIME kernel_time; - FILETIME user_time; - if (GetProcessTimes( - GetCurrentProcess(), - &creation_time, - &exit_time, - &kernel_time, - &user_time)) { - ULARGE_INTEGER kernel; - ULARGE_INTEGER user; - kernel.HighPart = kernel_time.dwHighDateTime; - kernel.LowPart = kernel_time.dwLowDateTime; - user.HighPart = user_time.dwHighDateTime; - user.LowPart = user_time.dwLowDateTime; - return (static_cast(kernel.QuadPart) + - static_cast(user.QuadPart)) / 10000.0; - } - - return 0.0; -#elif defined _APPLE - // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) - struct rusage ru; - if (getrusage(RUSAGE_SELF, &ru)) { - return 0.0; - } - - return ru.ru_utime.tv_sec * 1000.0 - + ru.ru_utime.tv_usec / 1000.0 - + ru.ru_stime.tv_sec * 1000.0 - + ru.ru_stime.tv_usec / 1000.0; -#else - return getClockTimeMilliseconds(CLOCK_PROCESS_CPUTIME_ID); -#endif -} - -REGISTER_CAFFE2_EARLY_INIT_FUNCTION( - registerGlobalPerfNetObserverCreator, - ®isterGlobalPerfNetObserverCreator, - "Caffe2 net global observer creator"); - -// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) -PerfNetObserver::PerfNetObserver(NetBase* subject_) - : NetObserver(subject_), numRuns_(0) {} - -// NOLINTNEXTLINE(modernize-use-equals-default) -PerfNetObserver::~PerfNetObserver() {} - -void PerfNetObserver::Start() { - static int visitCount = 0; - // Select whether to log the operator or the net. - // We have one sample rate for the entire app. - int netInitSampleRate = ObserverConfig::getNetInitSampleRate(); - int netFollowupSampleRate = ObserverConfig::getNetFollowupSampleRate(); - int netFollowupSampleCount = ObserverConfig::getNetFollowupSampleCount(); - int operatorNetSampleRatio = ObserverConfig::getOpoeratorNetSampleRatio(); - int skipIters = ObserverConfig::getSkipIters(); - int sampleRate = visitCount > 0 ? 
netFollowupSampleRate : netInitSampleRate; - // NOLINTNEXTLINE(clang-analyzer-security.insecureAPI.rand) - if (skipIters <= static_cast(numRuns_) && sampleRate > 0 && rand() % sampleRate == 0) { - visitCount++; - if (visitCount == netFollowupSampleCount) { - visitCount = 0; - } - // NOLINTNEXTLINE(clang-analyzer-security.insecureAPI.rand) - if (operatorNetSampleRatio > 0 && rand() % operatorNetSampleRatio == 0) { - logType_ = PerfNetObserver::OPERATOR_DELAY; - } else { - logType_ = PerfNetObserver::NET_DELAY; - } - } else { - logType_ = PerfNetObserver::NONE; - } - numRuns_++; - - if (logType_ == PerfNetObserver::OPERATOR_DELAY) { - /* Always recreate new operator observers - whenever we measure operator delay */ - const auto& operators = subject_->GetOperators(); - for (auto* op : operators) { - observerMap_[op] = op->AttachObserver( - std::make_unique(op, this)); - } - } - - wallMilliseconds_ = getWallClockTimeMilliseconds(); - cpuMilliseconds_ = getCpuTimeMilliseconds(); -} - -void PerfNetObserver::Stop() { - if (logType_ == PerfNetObserver::NONE) { - return; - } - std::map info; - PerformanceInformation net_perf; - net_perf.cpuMilliseconds = - getCpuTimeMilliseconds() - cpuMilliseconds_; - net_perf.latency = - getWallClockTimeMilliseconds() - wallMilliseconds_; - - if (logType_ == PerfNetObserver::OPERATOR_DELAY) { - const auto& operators = subject_->GetOperators(); - for (unsigned idx = 0; idx < operators.size(); ++idx) { - const auto* op = operators[idx]; - auto name = getObserverName(op, static_cast(idx)); - PerformanceInformation p; - const PerfOperatorObserver* opObserver = - static_cast(observerMap_[op]); - p.latency = opObserver->getWallMilliseconds(); - p.cpuMilliseconds = opObserver->getCpuMilliseconds(); - p.engine = op->engine(); - p.type = op->type(); - p.tensor_shapes = - static_cast(observerMap_[op]) - ->getTensorShapes(); - - if (op->has_debug_def()) { - // NOLINTNEXTLINE(performance-for-range-copy) - for (auto arg : op->debug_def().arg()) { - p.args.emplace_back(arg); - } - } - - info.insert({name, p}); - } - - /* clear all operator delay after use so that we don't spent time - collecting the operator delay info in later runs */ - for (auto* op : operators) { - op->DetachObserver(observerMap_[op]); - } - observerMap_.clear(); - } - info.insert({"NET_DELAY", net_perf}); - ObserverConfig::getReporter()->report(subject_, info); -} - -caffe2::string PerfNetObserver::getObserverName(const OperatorBase* op, int idx) - const { - string opType = op->has_debug_def() ? op->debug_def().type() : "NO_TYPE"; - string displayName = - (op->has_debug_def() ? op->debug_def().name().size() - ? op->debug_def().name() - : (op->debug_def().output_size() ? 
op->debug_def().output(0) - : "NO_OUTPUT") - : "NO_DEF"); - caffe2::string name = - "ID_" + c10::to_string(idx) + "_" + opType + "_" + displayName; - return name; -} - -PerfOperatorObserver::PerfOperatorObserver( - OperatorBase* op, - PerfNetObserver* netObserver) - : ObserverBase(op), - netObserver_(netObserver), - wallMilliseconds_(0), - cpuMilliseconds_(0) { - CAFFE_ENFORCE(netObserver_, "Observers can't operate outside of the net"); -} - -// NOLINTNEXTLINE(modernize-use-equals-default) -PerfOperatorObserver::~PerfOperatorObserver() {} - -void PerfOperatorObserver::Start() { - wallMilliseconds_ = getWallClockTimeMilliseconds(); - cpuMilliseconds_ = getCpuTimeMilliseconds(); -} - -void PerfOperatorObserver::Stop() { - /* Time from the start of the net minus the time spent on all other - operators is the time spent on this operator */ - cpuMilliseconds_ = - getCpuTimeMilliseconds() - cpuMilliseconds_; - wallMilliseconds_ = - getWallClockTimeMilliseconds() - wallMilliseconds_; - tensor_shapes_ = subject_->InputTensorShapes(); -} - -double PerfOperatorObserver::getWallMilliseconds() const { - return wallMilliseconds_; -} - -double PerfOperatorObserver::getCpuMilliseconds() const { - return cpuMilliseconds_; -} - -std::vector PerfOperatorObserver::getTensorShapes() const { - return tensor_shapes_; -} - -} // namespace caffe2 diff --git a/modules/observers/perf_observer.h b/modules/observers/perf_observer.h deleted file mode 100644 index 71e1190e840ba..0000000000000 --- a/modules/observers/perf_observer.h +++ /dev/null @@ -1,66 +0,0 @@ -#pragma once - -#include "caffe2/core/common.h" -#include "caffe2/core/net.h" -#include "caffe2/core/observer.h" -#include "caffe2/core/timer.h" -#include "observers/macros.h" - -#include - -namespace caffe2 { - -double getClockTimeMilliseconds(); - -class CAFFE2_OBSERVER_API PerfNetObserver : public NetObserver { - public: - explicit PerfNetObserver(NetBase* subject_); - virtual ~PerfNetObserver(); - - private: - void Start() override; - void Stop() override; - - caffe2::string getObserverName(const OperatorBase* op, int idx) const; - - private: - enum LogType { - NONE, - OPERATOR_DELAY, - NET_DELAY, - }; - LogType logType_; - unsigned int numRuns_; - std::unordered_map*> - observerMap_; - - double wallMilliseconds_; - double cpuMilliseconds_; -}; - -class PerfOperatorObserver : public ObserverBase { - public: - PerfOperatorObserver(OperatorBase* op, PerfNetObserver* netObserver); - virtual ~PerfOperatorObserver(); - - double getWallMilliseconds() const; - double getCpuMilliseconds() const; - std::vector getTensorShapes() const; - - private: - void Start() override; - void Stop() override; - - private: - // Observer of a net that owns corresponding op. We make sure net is never - // destructed while operator observer is still alive. First operator observer - // gets destructed, then the op, then the net and its observer. - // We do this trick in order to get access to net's name and other fields - // without storing inside the operator observer. 
Each field is memory - // costly here and a raw pointer is a cheapest sholution - PerfNetObserver* netObserver_; - double wallMilliseconds_; - double cpuMilliseconds_; - std::vector tensor_shapes_; -}; -} // namespace caffe2 diff --git a/setup.py b/setup.py index d774446780b48..84f3d48c958e8 100644 --- a/setup.py +++ b/setup.py @@ -88,12 +88,6 @@ # disables use of system-wide nccl (we will use our submoduled # copy in third_party/nccl) # -# BUILD_CAFFE2_OPS=0 -# disable Caffe2 operators build -# -# BUILD_CAFFE2=0 -# disable Caffe2 build -# # USE_IBVERBS # toggle features related to distributed support # @@ -1317,6 +1311,7 @@ def main(): "include/torch/csrc/onnx/*.h", "include/torch/csrc/profiler/*.h", "include/torch/csrc/profiler/orchestration/*.h", + "include/torch/csrc/profiler/standalone/*.h", "include/torch/csrc/profiler/stubs/*.h", "include/torch/csrc/profiler/unwind/*.h", "include/torch/csrc/profiler/python/*.h", diff --git a/test/cpp/api/autograd.cpp b/test/cpp/api/autograd.cpp index 3d1604752dbc4..4d6bb485be518 100644 --- a/test/cpp/api/autograd.cpp +++ b/test/cpp/api/autograd.cpp @@ -1265,7 +1265,7 @@ int64_t ret_single_non_tensor( torch::Tensor opt_op( const torch::Tensor& self, - const c10::optional& other) { + const std::optional& other) { if (other.has_value()) { return self + other.value(); } else { @@ -1461,11 +1461,11 @@ TEST(TestAutogradNotImplementedFallback, OptOp) { auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::opt_op", ""); auto op = [&](const torch::Tensor& _1, - const c10::optional& _2) { + const std::optional& _2) { return callOpUnboxed< torch::Tensor, const torch::Tensor&, - const c10::optional&>(opHandle, _1, _2); + const std::optional&>(opHandle, _1, _2); }; auto a = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true); diff --git a/test/cpp/api/memory.cpp b/test/cpp/api/memory.cpp index d9f44ea3f7a40..a3adc81406b7b 100644 --- a/test/cpp/api/memory.cpp +++ b/test/cpp/api/memory.cpp @@ -6,8 +6,8 @@ struct TestValue { explicit TestValue(const int& x) : lvalue_(x) {} explicit TestValue(int&& x) : rvalue_(x) {} - c10::optional lvalue_; - c10::optional rvalue_; + std::optional lvalue_; + std::optional rvalue_; }; TEST(MakeUniqueTest, ForwardRvaluesCorrectly) { diff --git a/test/cpp/c10d/ProcessGroupNCCLTest.cpp b/test/cpp/c10d/ProcessGroupNCCLTest.cpp index d1c2380274278..edf4f03c2d692 100644 --- a/test/cpp/c10d/ProcessGroupNCCLTest.cpp +++ b/test/cpp/c10d/ProcessGroupNCCLTest.cpp @@ -42,7 +42,7 @@ class NCCLTestBase { void initialize( int rank, int size, - c10::optional<::std::shared_ptr<::c10d::ProcessGroupNCCL>> split_from = + std::optional<::std::shared_ptr<::c10d::ProcessGroupNCCL>> split_from = c10::nullopt) { store_ = c10::make_intrusive<::c10d::FileStore>(path_, size); diff --git a/test/cpp/jit/test_argument_spec.cpp b/test/cpp/jit/test_argument_spec.cpp index 6ffe167c58768..71785d889952a 100644 --- a/test/cpp/jit/test_argument_spec.cpp +++ b/test/cpp/jit/test_argument_spec.cpp @@ -111,7 +111,7 @@ TEST(ArgumentSpecTest, CompleteArgumentSpec_CUDA) { // } // TEST(ArgumentSpecTest, VaryingShape) { -// c10::VaryingShape vs(c10::optional{}); +// c10::VaryingShape vs(std::optional{}); // auto ptt_empty1 = TensorType::create({}, {}, vs, vs, false); // auto ptt_empty2 = TensorType::create({}, {}, vs, vs, false); // ASSERT_EQ(hashCode(ptt_empty1), hashCode(ptt_empty2)); diff --git a/test/cpp/jit/test_custom_class_registrations.cpp b/test/cpp/jit/test_custom_class_registrations.cpp index 2595c64c9b170..819d5495b06c3 100644 --- 
a/test/cpp/jit/test_custom_class_registrations.cpp +++ b/test/cpp/jit/test_custom_class_registrations.cpp @@ -27,7 +27,7 @@ struct DefaultArgs : torch::CustomClassHolder { x = scale * x + add; return x; } - int64_t divide(c10::optional factor) { + int64_t divide(std::optional factor) { if (factor) { // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) x = x / *factor; @@ -334,7 +334,7 @@ struct ElementwiseInterpreter : torch::CustomClassHolder { // collection types like vector, optional, and dict. using SerializationType = std::tuple< std::vector /*input_names_*/, - c10::optional /*output_name_*/, + std::optional /*output_name_*/, c10::Dict /*constants_*/, std::vector /*instructions_*/ >; @@ -360,7 +360,7 @@ struct ElementwiseInterpreter : torch::CustomClassHolder { // Class members std::vector input_names_; - c10::optional output_name_; + std::optional output_name_; c10::Dict constants_; std::vector instructions_; }; diff --git a/test/cpp/jit/test_exception.cpp b/test/cpp/jit/test_exception.cpp index be23548e16d15..a4932e76b3e24 100644 --- a/test/cpp/jit/test_exception.cpp +++ b/test/cpp/jit/test_exception.cpp @@ -31,7 +31,7 @@ TEST(TestException, TestAssertion) { bool is_jit_exception = false; std::string message; - c10::optional exception_class; + std::optional exception_class; try { cu_ptr->run_method("foo"); } catch (JITException& e) { @@ -140,7 +140,7 @@ TEST(TestException, TestCustomException) { (torch::jit::GraphFunction*)&cu->get_function("foo"); std::cerr << "Graph is\n" << *gf->graph() << std::endl; bool is_jit_exception = false; - c10::optional exception_class; + std::optional exception_class; std::string message; try { cu->run_method("foo"); diff --git a/test/cpp/jit/test_ir.cpp b/test/cpp/jit/test_ir.cpp index e9a0edabaaf0f..19910cbf31f00 100644 --- a/test/cpp/jit/test_ir.cpp +++ b/test/cpp/jit/test_ir.cpp @@ -194,17 +194,17 @@ TEST(IRTest, OperatorMap) { ASSERT_FALSE(op_map.contains(*op6)); op_map.insert(op1, 1); ASSERT_TRUE(op_map.contains(*op1)); - c10::optional o1 = op_map.find(*op1); + std::optional o1 = op_map.find(*op1); ASSERT_TRUE(o1.has_value()); - c10::optional o2 = op_map.find(*op2); + std::optional o2 = op_map.find(*op2); ASSERT_TRUE(o2.has_value()); - c10::optional o3 = op_map.find(*op3); + std::optional o3 = op_map.find(*op3); ASSERT_FALSE(o3.has_value()); - c10::optional o4 = op_map.find(*op4); + std::optional o4 = op_map.find(*op4); ASSERT_TRUE(o4.has_value()); - c10::optional o5 = op_map.find(*op5); + std::optional o5 = op_map.find(*op5); ASSERT_TRUE(o5.has_value()); - c10::optional o6 = op_map.find(*op6); + std::optional o6 = op_map.find(*op6); ASSERT_FALSE(o6.has_value()); } diff --git a/test/cpp/jit/test_jit_type.cpp b/test/cpp/jit/test_jit_type.cpp index 606c1b0fa36e0..08f7f360731b7 100644 --- a/test/cpp/jit/test_jit_type.cpp +++ b/test/cpp/jit/test_jit_type.cpp @@ -12,7 +12,7 @@ TEST(JitTypeTest, IsComplete) { auto tt = c10::TensorType::create( at::kFloat, at::kCPU, - c10::SymbolicShape(std::vector>({1, 49})), + c10::SymbolicShape(std::vector>({1, 49})), std::vector( {c10::Stride{2, true, 1}, c10::Stride{1, true, 1}, diff --git a/test/cpp/jit/test_misc.cpp b/test/cpp/jit/test_misc.cpp index efe377aad72ce..9c74eb45e535f 100644 --- a/test/cpp/jit/test_misc.cpp +++ b/test/cpp/jit/test_misc.cpp @@ -1302,7 +1302,7 @@ TEST(RecordFunctionTest, OperatorNameOverload) { at::addGlobalCallback(at::RecordFunctionCallback( [](const at::RecordFunction& fn) -> std::unique_ptr { - c10::optional op_name = + std::optional op_name = 
fn.operator_name(); if (op_name.has_value()) { operator_names.insert(c10::toString(*op_name)); diff --git a/test/cpp/jit/test_shape_analysis.cpp b/test/cpp/jit/test_shape_analysis.cpp index 4940d277ce043..0ff3908d639a5 100644 --- a/test/cpp/jit/test_shape_analysis.cpp +++ b/test/cpp/jit/test_shape_analysis.cpp @@ -296,7 +296,7 @@ TEST(ShapeAnalysisTest, MovingConstantOutOfFusionGroups) { namespace { -c10::optional sym_dim = c10::nullopt; +std::optional sym_dim = c10::nullopt; // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) void assertShapeEqual(c10::SymbolicShape& a, c10::SymbolicShape& e) { @@ -306,8 +306,8 @@ void assertShapeEqual(c10::SymbolicShape& a, c10::SymbolicShape& e) { } void assertShapeEqual( - c10::optional>& actual, - std::vector> expected) { + std::optional>& actual, + std::vector> expected) { ASSERT_TRUE(actual.has_value()); ASSERT_EQ(actual->size(), 1); @@ -332,12 +332,12 @@ TEST(ShapeAnalysisTest, SymbolicShapeAPI) { // Check vector initializer list syntax c10::SymbolicShape ss_concrete = - std::vector>{1, 56, 56}; - c10::SymbolicShape ss1 = std::vector>{sym_dim, 56, 56}; + std::vector>{1, 56, 56}; + c10::SymbolicShape ss1 = std::vector>{sym_dim, 56, 56}; c10::SymbolicShape ss2 = - std::vector>{64, sym_dim, sym_dim}; + std::vector>{64, sym_dim, sym_dim}; c10::SymbolicShape ss3 = - std::vector>{sym_dim, sym_dim, sym_dim, sym_dim}; + std::vector>{sym_dim, sym_dim, sym_dim, sym_dim}; auto res = calculateSymbolicShapesOnOp( schema, std::vector{const_size_1, const_size_1}); @@ -484,7 +484,7 @@ TEST(ShapeAnalysisTest, TestShapeMultipleReturns) { auto res = calculateSymbolicShapesOnOp(max_dim_op, {ss1, const_int, false_ival}); c10::SymbolicShape expected_res = - c10::SymbolicShape(std::vector>{sym_dim}); + c10::SymbolicShape(std::vector>{sym_dim}); assertShapeEqual(res->at(0), expected_res); // res0 and res1 should share the same symbolic symbol EXPECT_EQ(res->at(0), res->at(1)); diff --git a/test/cpp/lazy/test_lazy_ops.cpp b/test/cpp/lazy/test_lazy_ops.cpp index aa31ffc59bb51..745f40729f02d 100644 --- a/test/cpp/lazy/test_lazy_ops.cpp +++ b/test/cpp/lazy/test_lazy_ops.cpp @@ -475,7 +475,7 @@ TEST_F(LazyOpsTest, TestDiv) { } TEST_F(LazyOpsTest, TestDivWithRoundingMode) { - c10::optional rounding_modes[] = { + std::optional rounding_modes[] = { "trunc", "floor", c10::nullopt}; for (const auto& rounding_mode : rounding_modes) { for (torch::ScalarType scalar_type1 : @@ -535,7 +535,7 @@ TEST_F(LazyOpsTest, TestDivInPlace) { } TEST_F(LazyOpsTest, TestDivInPlaceWithRoundingMode) { - c10::optional rounding_modes[] = { + std::optional rounding_modes[] = { "trunc", "floor", c10::nullopt}; for (const auto& rounding_mode : rounding_modes) { for (torch::ScalarType scalar_type1 : {torch::kFloat}) { @@ -1553,7 +1553,7 @@ TEST_F(LazyOpsTest, TestStdWithCorrection) { torch::Tensor a = torch::rand( {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); // int rank = a.dim(); - c10::optional corrections[] = {1, 2, c10::nullopt}; + std::optional corrections[] = {1, 2, c10::nullopt}; for (const auto& correction : corrections) { for (auto keepdim : {true, false}) { for (const auto& dim : @@ -1573,7 +1573,7 @@ TEST_F(LazyOpsTest, TestStdMeanWithCorrection) { torch::Tensor a = torch::rand( {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); // int rank = a.dim(); - c10::optional corrections[] = {1, 2, c10::nullopt}; + std::optional corrections[] = {1, 2, c10::nullopt}; for (const auto& correction : corrections) { for (auto keepdim : {true, false}) { for 
(const auto& dim : @@ -1710,7 +1710,7 @@ TEST_F(LazyOpsTest, TestVarWithDim) { TEST_F(LazyOpsTest, TestVarWithCorrection) { torch::Tensor a = torch::rand( {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); - c10::optional corrections[] = {1, 2, c10::nullopt}; + std::optional corrections[] = {1, 2, c10::nullopt}; for (const auto& dim : std::vector>{{0, 1}, {-3, -2}}) { for (bool keepDim : {true, false}) { for (const auto& correction : corrections) { @@ -1730,7 +1730,7 @@ TEST_F(LazyOpsTest, TestVarWithCorrection) { TEST_F(LazyOpsTest, TestVarMeanWithCorrection) { torch::Tensor a = torch::rand( {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); - c10::optional corrections[] = {1, 2, c10::nullopt}; + std::optional corrections[] = {1, 2, c10::nullopt}; for (const auto& dim : std::vector>{{0, 1}, {-3, -2}}) { for (const auto& correction : corrections) { for (auto keepdim : {true, false}) { diff --git a/test/cpp/lazy/test_misc.cpp b/test/cpp/lazy/test_misc.cpp index aa4cd1b7e798e..441e5c41eee13 100644 --- a/test/cpp/lazy/test_misc.cpp +++ b/test/cpp/lazy/test_misc.cpp @@ -63,10 +63,10 @@ TEST(HashTest, Sanity) { test_hash_repeatable_sensitive(c10::Scalar(true), c10::Scalar(false)); test_hash_repeatable_sensitive(c10::Scalar(12345), c10::Scalar(12354)); - // c10::optional + // std::optional test_hash_repeatable_sensitive( - c10::optional("I have value!"), - c10::optional(c10::nullopt)); + std::optional("I have value!"), + std::optional(c10::nullopt)); // Containers auto a = std::vector({0, 1, 1, 2, 3, 5, 8}); diff --git a/test/cpp/tensorexpr/test_external_calls.cpp b/test/cpp/tensorexpr/test_external_calls.cpp index 7a4291f0ba447..c26c800a16bf6 100644 --- a/test/cpp/tensorexpr/test_external_calls.cpp +++ b/test/cpp/tensorexpr/test_external_calls.cpp @@ -507,11 +507,11 @@ TEST(ExternalCall, Prepacked_Linear_float) { .findSchemaOrThrow("prepacked::linear_clamp_prepack", "") .typed( at::Tensor, - c10::optional, - const c10::optional&, - const c10::optional&)>(); + std::optional, + const std::optional&, + const std::optional&)>(); auto prepacked = linear_clamp_prepack_op.call( - weight, bias, c10::optional(), c10::optional()); + weight, bias, std::optional(), c10::optional()); BufHandle DummyPrepacked("DummyPrepacked", {1}, kFloat); Tensor Result = Tensor( @@ -581,13 +581,13 @@ TEST(ExternalCall, Prepacked_Conv2d_float) { .findSchemaOrThrow("prepacked::conv2d_clamp_prepack", "") .typed( at::Tensor, - c10::optional, + std::optional, std::vector, std::vector, std::vector, int64_t, - const c10::optional&, - const c10::optional&)>(); + const std::optional&, + const std::optional&)>(); auto prepacked = conv2d_clamp_prepack_op.call( weight, bias, @@ -595,8 +595,8 @@ TEST(ExternalCall, Prepacked_Conv2d_float) { {pad, pad}, {dilation, dilation}, groups, - c10::optional(), - c10::optional()); + std::optional(), + std::optional()); BufHandle DummyPrepacked("DummyPrepacked", {1}, kFloat); Tensor Result = Tensor( @@ -945,7 +945,7 @@ TEST(ExternalCall, JitCustomFusionOp) { const std::vector& inputs, const std::vector& output_shape, const std::vector& output_strides, - const c10::optional& output_type, + const std::optional& output_type, at::Device device) { auto output_dtype = Dtype(*output_type); torch::jit::tensorexpr::BufHandle result_buf( diff --git a/test/cpp/tensorexpr/test_kernel.cpp b/test/cpp/tensorexpr/test_kernel.cpp index 21b86e9b00707..22f6b64efe1a8 100644 --- a/test/cpp/tensorexpr/test_kernel.cpp +++ b/test/cpp/tensorexpr/test_kernel.cpp @@ -888,7 +888,7 
@@ TEST_F(Kernel, SumAllAxes) { parseIR(graph_string, &*graph); auto o = at::empty({}, TensorOptions(kCPU)); - c10::optional dtype; + std::optional dtype; if (scalar_type != ScalarType::Undefined) { dtype = static_cast(scalar_type); } @@ -947,7 +947,7 @@ TEST_F(Kernel, SumOneAxis) { env.d("dim", dim); env.d("keepdim", keepdim); env.s("dtype", dtypeConstant(scalar_type)); - c10::optional dtype; + std::optional dtype; if (scalar_type != ScalarType::Undefined) { dtype = static_cast(scalar_type); } @@ -1665,7 +1665,7 @@ Tensor lowerNanToNum( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { auto input_buf = std::get(inputs[0]); auto e = Compute( diff --git a/test/cpp/tensorexpr/test_quantization.cpp b/test/cpp/tensorexpr/test_quantization.cpp index a689358276f2c..af6b539ff33e9 100644 --- a/test/cpp/tensorexpr/test_quantization.cpp +++ b/test/cpp/tensorexpr/test_quantization.cpp @@ -390,8 +390,8 @@ at::Tensor quantized_cat( .typed const&, int64_t, - c10::optional, - c10::optional)>(); + std::optional, + std::optional)>(); return op.redispatch( DispatchKeySet({DispatchKey::QuantizedCPU}), xs, dim, scale, zero); } diff --git a/test/cpp_extensions/extension.cpp b/test/cpp_extensions/extension.cpp index f476a983b14c2..1de9e03971115 100644 --- a/test/cpp_extensions/extension.cpp +++ b/test/cpp_extensions/extension.cpp @@ -23,7 +23,7 @@ struct MatrixMultiplier { torch::Tensor tensor_; }; -bool function_taking_optional(c10::optional tensor) { +bool function_taking_optional(std::optional tensor) { return tensor.has_value(); } diff --git a/test/cpp_extensions/maia_extension.cpp b/test/cpp_extensions/maia_extension.cpp index 13315810f54c4..8dbc64f82076d 100644 --- a/test/cpp_extensions/maia_extension.cpp +++ b/test/cpp_extensions/maia_extension.cpp @@ -20,8 +20,8 @@ Tensor get_tensor(caffe2::TypeMeta dtype, IntArrayRef size) { return Tensor(std::move(tensor_impl)); } -Tensor empty_override(IntArrayRef size, c10::optional dtype, c10::optional layout, c10::optional device, - c10::optional pin_memory, c10::optional optional_memory_format) { +Tensor empty_override(IntArrayRef size, std::optional dtype, c10::optional layout, c10::optional device, + std::optional pin_memory, c10::optional optional_memory_format) { test_int = 0; return get_tensor(scalarTypeToTypeMeta(dtype_or_default(dtype)), size); } @@ -32,7 +32,7 @@ Tensor& add_out_override(const Tensor & a, const Tensor & b , const Scalar& c, T } Tensor fake_convolution( - const Tensor& input, const Tensor& weight, const c10::optional& bias, + const Tensor& input, const Tensor& weight, const std::optional& bias, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool transposed, IntArrayRef output_padding, int64_t groups) { test_int = 2; diff --git a/test/cpp_extensions/open_registration_extension.cpp b/test/cpp_extensions/open_registration_extension.cpp index f5b61102af7b2..df46d827339b4 100644 --- a/test/cpp_extensions/open_registration_extension.cpp +++ b/test/cpp_extensions/open_registration_extension.cpp @@ -277,11 +277,11 @@ REGISTER_ALLOCATOR(c10::DeviceType::PrivateUse1, &global_custom_alloc); // basic dummy empty function, so we can directly construct tensors on the custom device // This dummy test device will just use the CPU allocator, and ignores pinned memory. 
at::Tensor custom_empty_memory_format(at::IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional memory_format) { constexpr c10::DispatchKeySet private_use_ks(c10::DispatchKey::PrivateUse1); return at::detail::empty_generic(size, &global_custom_alloc, @@ -290,11 +290,11 @@ at::Tensor custom_empty_memory_format(at::IntArrayRef size, memory_format); } at::Tensor custom_empty_symint(c10::IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional memory_format) { constexpr c10::DispatchKeySet private_use_ks(c10::DispatchKey::PrivateUse1); return at::detail::empty_generic(size, &global_custom_alloc, private_use_ks, c10::dtype_or_default(dtype), memory_format); @@ -368,10 +368,10 @@ at::Tensor custom__copy_from_and_resize(const at::Tensor& self, const at::Tensor at::Tensor custom_empty_strided(c10::IntArrayRef size, c10::IntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt) { + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt) { constexpr c10::DispatchKeySet private_use_ks(c10::DispatchKey::PrivateUse1); auto dtype = c10::dtype_or_default(dtype_opt); return at::detail::empty_strided_generic(size, stride, &global_custom_alloc, private_use_ks, dtype); @@ -406,7 +406,7 @@ at::Tensor& custom_set_source_Storage_storage_offset(at::Tensor& result, // basic dummy functions related to pin_memory. 
std::vector custom_pinned_data_ptr; -at::Tensor custom__pin_memory(const at::Tensor& self, c10::optional device) { +at::Tensor custom__pin_memory(const at::Tensor& self, std::optional device) { TORCH_CHECK( self.device().is_cpu(), "cannot pin '", @@ -420,7 +420,7 @@ at::Tensor custom__pin_memory(const at::Tensor& self, c10::optional return dump_pinned_tensor; } -bool custom_is_pinned(const at::Tensor& self, c10::optional device) { +bool custom_is_pinned(const at::Tensor& self, std::optional device) { // Only CPU tensors can be pinned if (!self.is_cpu()) { return false; @@ -436,7 +436,7 @@ bool custom_is_pinned(const at::Tensor& self, c10::optional device) } const at::Tensor& custom_resize_(const at::Tensor& self, at::IntArrayRef size, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { at::TensorImpl* tensor_impl = self.unsafeGetTensorImpl(); tensor_impl->set_sizes_contiguous(size); const auto itemsize = tensor_impl->dtype().itemsize(); diff --git a/test/cpp_extensions/rng_extension.cpp b/test/cpp_extensions/rng_extension.cpp index 2e657d15a3979..3fc62ee69f752 100644 --- a/test/cpp_extensions/rng_extension.cpp +++ b/test/cpp_extensions/rng_extension.cpp @@ -33,15 +33,15 @@ struct TestCPUGenerator : public c10::GeneratorImpl { uint64_t value_; }; -Tensor& random_(Tensor& self, c10::optional generator) { +Tensor& random_(Tensor& self, std::optional generator) { return at::native::templates::random_impl(self, generator); } -Tensor& random_from_to(Tensor& self, int64_t from, optional to, c10::optional generator) { +Tensor& random_from_to(Tensor& self, int64_t from, optional to, std::optional generator) { return at::native::templates::random_from_to_impl(self, from, to, generator); } -Tensor& random_to(Tensor& self, int64_t to, c10::optional generator) { +Tensor& random_to(Tensor& self, int64_t to, std::optional generator) { return random_from_to(self, 0, to, generator); } diff --git a/test/custom_operator/op.cpp b/test/custom_operator/op.cpp index c9389713428bc..ab0506a822f61 100644 --- a/test/custom_operator/op.cpp +++ b/test/custom_operator/op.cpp @@ -29,7 +29,7 @@ struct CustomOpAutogradFunction : public torch::autograd::Function var3) { + std::optional var3) { ctx->saved_data["mul"] = mul; ctx->saved_data["var3_has_value"] = var3.has_value(); ctx->save_for_backward({var1, var2}); @@ -59,7 +59,7 @@ torch::Tensor custom_op_with_autograd( torch::Tensor var1, int64_t mul, torch::Tensor var2, - c10::optional var3) { + std::optional var3) { return CustomOpAutogradFunction::apply(var1, mul, var2, var3); } diff --git a/test/custom_operator/test_custom_ops.cpp b/test/custom_operator/test_custom_ops.cpp index b1e830f7b65c7..a526bebd26144 100644 --- a/test/custom_operator/test_custom_ops.cpp +++ b/test/custom_operator/test_custom_ops.cpp @@ -57,7 +57,7 @@ void get_autograd_operator_from_registry_and_execute() { torch::Tensor z = torch::randn({5,5}, torch::requires_grad()); torch::Tensor output = - helpers::get_operator_from_registry_and_execute("custom::op_with_autograd", x, 2, y, c10::optional()); + helpers::get_operator_from_registry_and_execute("custom::op_with_autograd", x, 2, y, std::optional()); TORCH_INTERNAL_ASSERT(output.allclose(x + 2*y + x*y)); auto go = torch::ones({}, torch::requires_grad()); @@ -88,7 +88,7 @@ void get_autograd_operator_from_registry_and_execute_in_nograd_mode() { torch::Tensor y = torch::randn({5,5}, torch::requires_grad()); torch::Tensor output = - helpers::get_operator_from_registry_and_execute("custom::op_with_autograd", x, 2, y, 
c10::optional()); + helpers::get_operator_from_registry_and_execute("custom::op_with_autograd", x, 2, y, std::optional()); TORCH_INTERNAL_ASSERT(output.allclose(x + 2*y + x*y)); } diff --git a/test/distributed/_composable/fsdp/test_fully_shard_training.py b/test/distributed/_composable/fsdp/test_fully_shard_training.py index e826ca7a000d9..eec060d3004cc 100644 --- a/test/distributed/_composable/fsdp/test_fully_shard_training.py +++ b/test/distributed/_composable/fsdp/test_fully_shard_training.py @@ -19,6 +19,7 @@ register_fsdp_forward_method, ) from torch.distributed._tensor import DTensor, init_device_mesh +from torch.distributed._tensor.debug.comm_mode import CommDebugMode from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( _CHECKPOINT_PREFIX, apply_activation_checkpointing, @@ -29,11 +30,6 @@ get_optimizer_state_dict, ) from torch.distributed.device_mesh import DeviceMesh -from torch.distributed.tensor.parallel import ( - ColwiseParallel, - parallelize_module, - RowwiseParallel, -) from torch.testing._internal.common_cuda import TEST_CUDA from torch.testing._internal.common_distributed import skip_if_lt_x_gpu from torch.testing._internal.common_fsdp import ( @@ -41,8 +37,8 @@ FSDPTest, FSDPTestMultiThread, MLP, + MLPStack, patch_all_gather, - patch_all_reduce, patch_reduce_scatter, test_compiled_fsdp, ) @@ -59,6 +55,8 @@ ) from torch.testing._internal.distributed.checkpoint_utils import with_temp_dir +c10d_ops = torch.ops.c10d + class TestFullyShardForwardInputs(FSDPTestMultiThread): @property @@ -696,8 +694,7 @@ def _test_gradient_accumulation( return # skip since not common torch.manual_seed(42) - local_batch_size, lin_dim, num_mlps, num_microbatches = (2, 32, 3, 3) - global_batch_size = local_batch_size * self.world_size + batch_size, lin_dim, num_mlps, num_microbatches = (2, 32, 3, 3) if mode == "some_mlps": num_mlps_to_disable_reduce_scatter = 2 modules = [nn.Linear(lin_dim, lin_dim)] @@ -716,32 +713,9 @@ def _test_gradient_accumulation( ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2) optim = torch.optim.Adam(model.parameters(), lr=1e-2) - # TODO: Migrate to `CommDebugMode` once it supports c10d collectives. 
- orig_all_gather = dist.all_gather_into_tensor - orig_reduce_scatter = dist.reduce_scatter_tensor - orig_all_reduce = dist.all_reduce - all_gather_count, reduce_scatter_count, all_reduce_count = 0, 0, 0 - - def all_gather_with_count(*args, **kwargs): - nonlocal all_gather_count - all_gather_count += 1 - return orig_all_gather(*args, **kwargs) - - def reduce_scatter_with_count(*args, **kwargs): - nonlocal reduce_scatter_count - reduce_scatter_count += 1 - return orig_reduce_scatter(*args, **kwargs) - - def all_reduce_with_count(*args, **kwargs): - nonlocal all_reduce_count - all_reduce_count += 1 - return orig_all_reduce(*args, **kwargs) - - torch.manual_seed(1) # same on all ranks + torch.manual_seed(42 + self.rank + 1) for iter_idx in range(5): - with patch_all_gather(all_gather_with_count), patch_reduce_scatter( - reduce_scatter_with_count - ), patch_all_reduce(all_reduce_with_count): + with CommDebugMode() as comm_mode: for microbatch_idx in range(num_microbatches): is_last_microbatch = microbatch_idx == num_microbatches - 1 if mode == "all": @@ -762,19 +736,18 @@ def all_reduce_with_count(*args, **kwargs): is_last_microbatch, recurse=False ) - global_inp = torch.rand((global_batch_size, lin_dim), device="cuda") - local_inp = global_inp[ - self.rank - * local_batch_size : (self.rank + 1) - * local_batch_size - ].detach() + inp = torch.randn(batch_size, lin_dim, device="cuda") losses: List[torch.Tensor] = [] - for _model, inp in ((ref_model, global_inp), (model, local_inp)): + for _model in (ref_model, model): losses.append(_model(inp).sum()) losses[-1].backward() - dist.all_reduce(losses[1]) # partial -> replicated self.assertEqual(losses[0], losses[1]) + comm_counts = comm_mode.get_comm_counts() + all_gather_count = comm_counts[c10d_ops._allgather_base_] + reduce_scatter_count = comm_counts[c10d_ops._reduce_scatter_base_] + all_reduce_count = comm_counts[c10d_ops.allreduce_] + # Expect one reduce-scatter per MLP plus one for the root's linear # on the last microbatch expected_reduce_scatter_count = num_mlps + 1 @@ -788,13 +761,10 @@ def all_reduce_with_count(*args, **kwargs): # Expect additional reduce-scatters for all MLPs expected_reduce_scatter_count += (num_mlps) * (num_microbatches - 1) self.assertEqual(reduce_scatter_count, expected_reduce_scatter_count) - # Exclude the loss all-reduce per microbatch in our training loop - all_reduce_count -= num_microbatches - if mesh.ndim == 2: - self.assertEqual(all_reduce_count, expected_reduce_scatter_count) - else: - self.assertEqual(all_reduce_count, 0) - reduce_scatter_count = all_reduce_count = 0 + expected_all_reduce_count = ( + expected_reduce_scatter_count if mesh.ndim == 2 else 0 + ) + self.assertEqual(all_reduce_count, expected_all_reduce_count) # Expect one all-gather per MLP plus one for the root's linear in # the first microbatch's forward @@ -817,13 +787,10 @@ def all_reduce_with_count(*args, **kwargs): # microbatch forward expected_all_gather_count += num_mlps * (num_microbatches - 1) self.assertEqual(all_gather_count, expected_all_gather_count) - all_gather_count = 0 - # Average the ref model's gradients over the world size to match - # data parallel semantics for param in ref_model.parameters(): if param.grad is not None: - param.grad.div_(self.world_size) + dist.all_reduce(param.grad, op=dist.ReduceOp.AVG) check_sharded_parity(self, ref_model, model) for _optim in (optim, ref_optim): _optim.step() @@ -933,38 +900,14 @@ def _test_train_parity_2d_mlp( dp_pg = dp_mesh.get_group() # used for `replicate()` torch.manual_seed(42) 
- model = nn.Sequential( - nn.LayerNorm(mlp_dim, bias=False), - # Use multiplier of 3 to exercise uneven case - MLP(mlp_dim, dim_multiplier=3), - MLP(mlp_dim), - MLP(mlp_dim, dim_multiplier=3), - ) + model = MLPStack(mlp_dim) ref_model = copy.deepcopy(model).cuda() replicate(ref_model, device_ids=[self.rank], process_group=dp_pg) - ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2) - - model = parallelize_module( - model, - device_mesh=tp_mesh, - # Leave the layer norm as implicitly replicated - parallelize_plan={ - # Pass `use_local_output=False` to keep as DTensor to preserve - # uneven activation dims - "1.in_proj": ColwiseParallel(use_local_output=False), - "1.out_proj": RowwiseParallel(use_local_output=False), - "2.in_proj": ColwiseParallel(use_local_output=False), - "2.out_proj": RowwiseParallel(use_local_output=False), - "3.in_proj": ColwiseParallel(use_local_output=False), - "3.out_proj": RowwiseParallel(), - }, + ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2, foreach=False) + model.parallelize( + tp_mesh, dp_mesh, use_activation_checkpointing, reshard_after_forward ) - for mlp in model: - if use_activation_checkpointing: - checkpoint(mlp) - fully_shard(mlp, mesh=dp_mesh, reshard_after_forward=reshard_after_forward) - fully_shard(model, mesh=dp_mesh, reshard_after_forward=reshard_after_forward) - optim = torch.optim.Adam(model.parameters(), lr=1e-2) + optim = torch.optim.Adam(model.parameters(), lr=1e-2, foreach=False) torch.manual_seed(42 + dp_pg.rank() + 1) device = torch.device("cuda") @@ -992,6 +935,8 @@ def test_train_parity_2d_transformer_checkpoint_resume(self): # else construct new ones (requiring eager optim state init) "reuse_model_optim": [False, True], "optimizer_class": [torch.optim.Adam, torch.optim.AdamW], + # TODO: need to update `parallelize` before including foreach=True for testing + "foreach": [False], }, self._test_train_parity_2d_transformer_checkpoint_resume, ) @@ -1001,6 +946,7 @@ def _test_train_parity_2d_transformer_checkpoint_resume( use_seq_parallel: bool, reuse_model_optim: bool, optimizer_class: Type[torch.optim.Optimizer], + foreach: bool, ): def train_step( _model: nn.Module, _optim: torch.optim.Optimizer, _inp: torch.Tensor @@ -1026,7 +972,9 @@ def parallelize(_model: Transformer, mesh: DeviceMesh, use_seq_parallel: bool): model_no_cp = parallelize( Transformer(model_args), global_mesh, use_seq_parallel ) - optim_no_cp = optimizer_class(model_no_cp.parameters(), lr=1e-2) + optim_no_cp = optimizer_class( + model_no_cp.parameters(), lr=1e-2, foreach=foreach + ) torch.manual_seed(42 + global_mesh["dp"].get_local_rank() + 1) inp = torch.randint(0, model_args.vocab_size, (3, 16), device="cuda") @@ -1037,7 +985,7 @@ def parallelize(_model: Transformer, mesh: DeviceMesh, use_seq_parallel: bool): # model/optimizer, load checkpoint, and run another iteration torch.manual_seed(seed) model_cp = parallelize(Transformer(model_args), global_mesh, use_seq_parallel) - optim_cp = optimizer_class(model_cp.parameters(), lr=1e-2) + optim_cp = optimizer_class(model_cp.parameters(), lr=1e-2, foreach=foreach) loss_cp1 = train_step(model_cp, optim_cp, inp) self.assertEqual(loss_no_cp1, loss_cp1) @@ -1066,7 +1014,7 @@ def parallelize(_model: Transformer, mesh: DeviceMesh, use_seq_parallel: bool): model_cp = parallelize( Transformer(model_args), global_mesh, use_seq_parallel ) - optim_cp = optimizer_class(model_cp.parameters(), lr=1e-2) + optim_cp = optimizer_class(model_cp.parameters(), lr=1e-2, foreach=foreach) self.assertNotEqual(loss_no_cp2, 
train_step(model_cp, optim_cp, inp)) sharded_sd = { @@ -1106,6 +1054,7 @@ def test_2d_mlp_with_nd_mesh(self): "reshard_after_forward": [False, True], "use_activation_checkpointing": [False, True], "mlp_dim": [3, 16, 17], + "foreach": [False], }, functools.partial(self._test_2d_mlp_with_nd_mesh, global_mesh), ) @@ -1116,6 +1065,7 @@ def _test_2d_mlp_with_nd_mesh( reshard_after_forward: bool, use_activation_checkpointing: bool, mlp_dim: int, + foreach: bool, ): global_mesh = self.init_global_mesh() pp_mesh, dp_mesh, tp_mesh = ( @@ -1126,38 +1076,14 @@ def _test_2d_mlp_with_nd_mesh( dp_pg = dp_mesh.get_group() # used for `replicate()` torch.manual_seed(42) - model = nn.Sequential( - nn.LayerNorm(mlp_dim, bias=False), - # Use multiplier of 3 to exercise uneven case - MLP(mlp_dim, dim_multiplier=3), - MLP(mlp_dim), - MLP(mlp_dim, dim_multiplier=3), - ) + model = MLPStack(mlp_dim) ref_model = copy.deepcopy(model).cuda() replicate(ref_model, device_ids=[self.rank], process_group=dp_pg) - ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2) - - model = parallelize_module( - model, - device_mesh=tp_mesh, - # Leave the layer norm as implicitly replicated - parallelize_plan={ - # Pass `use_local_output=False` to keep as DTensor to preserve - # uneven activation dims - "1.in_proj": ColwiseParallel(use_local_output=False), - "1.out_proj": RowwiseParallel(use_local_output=False), - "2.in_proj": ColwiseParallel(use_local_output=False), - "2.out_proj": RowwiseParallel(use_local_output=False), - "3.in_proj": ColwiseParallel(use_local_output=False), - "3.out_proj": RowwiseParallel(), - }, + ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2, foreach=foreach) + model.parallelize( + tp_mesh, dp_mesh, use_activation_checkpointing, reshard_after_forward ) - for mlp in model: - if use_activation_checkpointing: - checkpoint(mlp) - fully_shard(mlp, mesh=dp_mesh, reshard_after_forward=reshard_after_forward) - fully_shard(model, mesh=dp_mesh, reshard_after_forward=reshard_after_forward) - optim = torch.optim.Adam(model.parameters(), lr=1e-2) + optim = torch.optim.Adam(model.parameters(), lr=1e-2, foreach=foreach) torch.manual_seed(42 + dp_pg.rank() + 1) device = torch.device("cuda") diff --git a/test/distributed/_tensor/test_dtensor.py b/test/distributed/_tensor/test_dtensor.py index 2f7f522b9e7b0..e2a9d33241e7c 100644 --- a/test/distributed/_tensor/test_dtensor.py +++ b/test/distributed/_tensor/test_dtensor.py @@ -819,7 +819,7 @@ def test_split_tensor_1D(self) -> None: ) if size == 0: # when tensor size is 0, there is no padding needed for all the ranks. 
- expected_pad_sizes = [0] * self.world_size + expected_pad_sizes = [] assert_array_equal(expected_pad_sizes, pad_sizes) is_tensor_empty = [ diff --git a/test/distributed/_tensor/test_optimizers.py b/test/distributed/_tensor/test_optimizers.py index e7ce18eefa634..512b5c97ce6a2 100644 --- a/test/distributed/_tensor/test_optimizers.py +++ b/test/distributed/_tensor/test_optimizers.py @@ -84,23 +84,26 @@ def _assert_optimizer( # Default 'rtol' and 'atol' for attr:`~torch.float32` are ``1.3e-6`` and ``1e-5`` self.assertEqual(p1, p2, atol=atol, rtol=rtol) + def test_optimizer_foreach_supported_types_include_DTensor(self): + from torch.optim.optimizer import _foreach_supported_types + + self.assertTrue(DTensor in _foreach_supported_types) + @with_comms def test_adam_1d_sharding(self): mesh = DeviceMesh(self.device_type, list(range(self.world_size))) # TODO: add fused_adam support adam_configs = [ - {"lr": 0.1}, + {"lr": 0.1, "foreach": False}, + {"lr": 0.1, "weight_decay": 0.05, "foreach": False}, {"lr": 0.1, "weight_decay": 0.05}, - {"lr": 0.1, "foreach": True}, - {"lr": 0.1, "weight_decay": 0.05, "foreach": True}, - {"lr": 0.1, "weight_decay": 0.05, "amsgrad": True, "foreach": True}, + {"lr": 0.1, "weight_decay": 0.05, "amsgrad": True}, { "lr": 0.1, "weight_decay": 0.05, "maximize": True, "amsgrad": True, - "foreach": True, }, {"lr": 0.1, "fused": True}, {"lr": 0.1, "weight_decay": 0.05, "amsgrad": True, "fused": True}, @@ -132,16 +135,15 @@ def test_adamw_1d_sharding(self): mesh = DeviceMesh(self.device_type, list(range(self.world_size))) adamw_configs = [ - {"lr": 0.1}, + {"lr": 0.1, "foreach": False}, + {"lr": 0.1, "weight_decay": 0.05, "foreach": False}, {"lr": 0.1, "weight_decay": 0.05}, - {"lr": 0.1, "weight_decay": 0.05, "foreach": True}, { "lr": 0.1, "betas": (0.6, 0.66), "eps": 1e-6, "weight_decay": 0.05, "amsgrad": True, - "foreach": True, }, { "lr": 0.1, @@ -150,7 +152,6 @@ def test_adamw_1d_sharding(self): "weight_decay": 0.05, "maximize": True, "amsgrad": True, - "foreach": True, }, {"lr": 0.1, "weight_decay": 0.05, "fused": True}, { @@ -191,16 +192,17 @@ def test_sgd_1d_sharding(self): mesh = DeviceMesh(self.device_type, list(range(self.world_size))) sgd_configs = [ - {"lr": 0.1}, + {"lr": 0.1, "foreach": False}, + {"lr": 0.1, "momentum": 0.05, "foreach": False}, {"lr": 0.1, "momentum": 0.05}, - {"lr": 0.1, "momentum": 0.05, "foreach": True}, - {"lr": 0.1, "momentum": 0.06, "dampening": 0.07, "foreach": True}, + {"lr": 0.1, "momentum": 0.06, "dampening": 0.07}, { "lr": 0.1, "momentum": 0.08, "weight_decay": 0.05, "nesterov": True, "maximize": True, + "foreach": False, }, { "lr": 0.1, @@ -208,7 +210,6 @@ def test_sgd_1d_sharding(self): "weight_decay": 0.05, "nesterov": True, "maximize": True, - "foreach": True, }, ] @@ -231,14 +232,15 @@ def test_adagrad_1d_sharding(self): mesh = DeviceMesh(self.device_type, list(range(self.world_size))) adagrad_configs = [ - {"lr": 0.1}, - {"lr": 0.1, "lr_decay": 0.05}, - {"lr": 0.1, "lr_decay": 0.02, "weight_decay": 0.05}, + {"lr": 0.1, "foreach": False}, + {"lr": 0.1, "lr_decay": 0.05, "foreach": False}, + {"lr": 0.1, "lr_decay": 0.02, "weight_decay": 0.05, "foreach": False}, { "lr": 0.1, "lr_decay": 0.02, "weight_decay": 0.05, "initial_accumulator_value": 0.03, + "foreach": False, }, { "lr": 0.1, @@ -246,6 +248,7 @@ def test_adagrad_1d_sharding(self): "weight_decay": 0.05, "initial_accumulator_value": 0.03, "eps": 1e-6, + "foreach": False, }, { "lr": 0.1, @@ -254,6 +257,7 @@ def test_adagrad_1d_sharding(self): "initial_accumulator_value": 
0.03, "eps": 1e-6, "maximize": True, + "foreach": False, }, { "lr": 0.1, @@ -262,7 +266,6 @@ def test_adagrad_1d_sharding(self): "initial_accumulator_value": 0.03, "eps": 1e-6, "maximize": True, - "foreach": True, }, ] @@ -285,16 +288,23 @@ def test_RMSprop_1d_sharding(self): mesh = DeviceMesh(self.device_type, list(range(self.world_size))) RMSprop_configs = [ - {"lr": 0.1}, - {"lr": 0.1, "alpha": 0.85}, - {"lr": 0.1, "alpha": 0.88, "eps": 1e-6}, - {"lr": 0.1, "alpha": 0.88, "eps": 1e-6, "weight_decay": 0.05}, + {"lr": 0.1, "foreach": False}, + {"lr": 0.1, "alpha": 0.85, "foreach": False}, + {"lr": 0.1, "alpha": 0.88, "eps": 1e-6, "foreach": False}, + { + "lr": 0.1, + "alpha": 0.88, + "eps": 1e-6, + "weight_decay": 0.05, + "foreach": False, + }, { "lr": 0.1, "alpha": 0.88, "eps": 1e-6, "weight_decay": 0.05, "momentum": 0.9, + "foreach": False, }, { "lr": 0.1, @@ -303,6 +313,7 @@ def test_RMSprop_1d_sharding(self): "weight_decay": 0.05, "momentum": 0.9, "centered": True, + "foreach": False, }, { "lr": 0.1, @@ -312,6 +323,7 @@ def test_RMSprop_1d_sharding(self): "momentum": 0.9, "centered": True, "maximize": True, + "foreach": False, }, { "lr": 0.1, @@ -321,7 +333,6 @@ def test_RMSprop_1d_sharding(self): "momentum": 0.9, "centered": True, "maximize": True, - "foreach": True, }, ] @@ -344,23 +355,27 @@ def test_adadelta_1d_sharding(self): mesh = DeviceMesh(self.device_type, list(range(self.world_size))) adadelta_configs = [ - {"lr": 0.1}, - {"lr": 0.1, "rho": 0.85}, - {"lr": 0.1, "rho": 0.88, "eps": 1e-5}, - {"lr": 0.1, "rho": 0.88, "eps": 1e-6, "weight_decay": 0.05}, + {"lr": 0.1, "foreach": False}, + {"lr": 0.1, "rho": 0.85, "foreach": False}, + {"lr": 0.1, "rho": 0.88, "eps": 1e-5, "foreach": False}, + { + "lr": 0.1, + "rho": 0.88, + "eps": 1e-6, + "weight_decay": 0.05, + "foreach": False, + }, { "lr": 0.1, "rho": 0.88, "eps": 1e-6, "weight_decay": 0.05, - "foreach": True, }, { "lr": 0.1, "rho": 0.88, "eps": 1e-6, "weight_decay": 0.05, - "foreach": True, "maximize": True, }, ] @@ -384,15 +399,14 @@ def test_nadam_1d_sharding(self): mesh = DeviceMesh(self.device_type, list(range(self.world_size))) nadam_configs = [ - {"lr": 0.1}, + {"lr": 0.1, "foreach": False}, + {"lr": 0.1, "weight_decay": 0.05, "foreach": False}, {"lr": 0.1, "weight_decay": 0.05}, - {"lr": 0.1, "weight_decay": 0.05, "foreach": True}, { "lr": 0.1, "betas": (0.6, 0.66), "eps": 1e-6, "weight_decay": 0.05, - "foreach": True, }, { "lr": 0.1, @@ -400,7 +414,6 @@ def test_nadam_1d_sharding(self): "eps": 1e-6, "weight_decay": 0.05, "decoupled_weight_decay": True, - "foreach": True, }, ] @@ -423,15 +436,17 @@ def test_radam_1d_sharding(self): mesh = DeviceMesh(self.device_type, list(range(self.world_size))) radam_configs = [ - {"lr": 0.1}, - {"lr": 0.1, "weight_decay": 0.05}, - {"lr": 0.1, "weight_decay": 0.05, "foreach": True}, + {"lr": 0.1, "foreach": False}, + {"lr": 0.1, "weight_decay": 0.05, "foreach": False}, + { + "lr": 0.1, + "weight_decay": 0.05, + }, { "lr": 0.1, "betas": (0.6, 0.66), "eps": 1e-6, "weight_decay": 0.05, - "foreach": True, }, { "lr": 0.1, @@ -439,7 +454,6 @@ def test_radam_1d_sharding(self): "eps": 1e-6, "weight_decay": 0.05, "decoupled_weight_decay": True, - "foreach": True, }, ] @@ -462,23 +476,27 @@ def test_adamax_1d_sharding(self): mesh = DeviceMesh(self.device_type, list(range(self.world_size))) adamax_configs = [ - {"lr": 0.1}, - {"lr": 0.1, "betas": (0.6, 0.66)}, - {"lr": 0.1, "betas": (0.6, 0.66), "eps": 1e-6}, - {"lr": 0.1, "betas": (0.6, 0.66), "eps": 1e-6, "weight_decay": 0.05}, + {"lr": 0.1, 
"foreach": False}, + {"lr": 0.1, "betas": (0.6, 0.66), "foreach": False}, + {"lr": 0.1, "betas": (0.6, 0.66), "eps": 1e-6, "foreach": False}, + { + "lr": 0.1, + "betas": (0.6, 0.66), + "eps": 1e-6, + "weight_decay": 0.05, + "foreach": False, + }, { "lr": 0.1, "betas": (0.6, 0.66), "eps": 1e-6, "weight_decay": 0.05, - "foreach": True, }, { "lr": 0.1, "betas": (0.6, 0.66), "eps": 1e-6, "weight_decay": 0.05, - "foreach": True, "maximize": True, }, ] @@ -502,11 +520,18 @@ def test_asgd_1d_sharding(self): mesh = DeviceMesh(self.device_type, list(range(self.world_size))) asgd_configs = [ - {"lr": 0.1}, - {"lr": 0.1, "lambd": 0.001}, - {"lr": 0.1, "lambd": 0.001, "alpha": 0.85}, - {"lr": 0.1, "lambd": 0.001, "alpha": 0.85, "t0": 1e5}, - {"lr": 0.1, "lambd": 0.001, "alpha": 0.85, "t0": 1e5, "weight_decay": 0.05}, + {"lr": 0.1, "foreach": False}, + {"lr": 0.1, "lambd": 0.001, "foreach": False}, + {"lr": 0.1, "lambd": 0.001, "alpha": 0.85, "foreach": False}, + {"lr": 0.1, "lambd": 0.001, "alpha": 0.85, "t0": 1e5, "foreach": False}, + { + "lr": 0.1, + "lambd": 0.001, + "alpha": 0.85, + "t0": 1e5, + "weight_decay": 0.05, + "foreach": False, + }, { "lr": 0.1, "lambd": 0.001, diff --git a/test/distributed/elastic/agent/server/test/api_test.py b/test/distributed/elastic/agent/server/test/api_test.py index e1dd16bcf9650..5b5ae3e3cb620 100644 --- a/test/distributed/elastic/agent/server/test/api_test.py +++ b/test/distributed/elastic/agent/server/test/api_test.py @@ -126,7 +126,9 @@ def __init__(self, spec): self.stop_workers_call_count = 0 self.start_workers_call_count = 0 - def _stop_workers(self, worker_group: WorkerGroup) -> None: + def _stop_workers( + self, worker_group: WorkerGroup, is_restart: bool = False + ) -> None: # workers are fake, nothing to stop; just clear the rdzv info worker_group.group_rank = None worker_group.group_world_size = None diff --git a/test/distributed/elastic/multiprocessing/api_test.py b/test/distributed/elastic/multiprocessing/api_test.py index 9658ed087ab05..75e903807ff9b 100644 --- a/test/distributed/elastic/multiprocessing/api_test.py +++ b/test/distributed/elastic/multiprocessing/api_test.py @@ -465,6 +465,30 @@ def test_function_raise(self): self.assertTrue(pc._stderr_tail.stopped()) self.assertTrue(pc._stdout_tail.stopped()) + def test_wait_for_all_child_procs_to_exit(self): + """ + Tests that MultiprocessingContext actually waits for + the child process to exit (not just that the entrypoint fn has + finished running). 
+ """ + + mpc = MultiprocessContext( + name="echo", + entrypoint=echo0, + args={}, + envs={}, + start_method="spawn", + logs_specs=DefaultLogsSpecs(log_dir=self.log_dir()), + ) + + with mock.patch.object( + mpc, "_is_done", return_value=True + ), mock.patch.object(mpc, "_pc"), mock.patch.object( + mpc._pc, "join", side_effect=[True, False, False, True] + ) as mock_join: + mpc._poll() + self.assertEqual(4, mock_join.call_count) + ######################################## # start_processes as binary tests ######################################## diff --git a/test/dynamo_expected_failures/TestAOTAutograd.test_set__and_data_mutation_good b/test/distributed/pipelining/__init__.py similarity index 100% rename from test/dynamo_expected_failures/TestAOTAutograd.test_set__and_data_mutation_good rename to test/distributed/pipelining/__init__.py diff --git a/test/distributed/pipelining/model_registry.py b/test/distributed/pipelining/model_registry.py new file mode 100644 index 0000000000000..f88bebd3a5598 --- /dev/null +++ b/test/distributed/pipelining/model_registry.py @@ -0,0 +1,61 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# Owner(s): ["oncall: distributed"] +# This file is a model zoo for testing torch.distributed.pipelining. +import torch +from torch.distributed.pipelining import pipe_split + + +class ExampleCode(torch.nn.Module): + default_dhid = 512 + default_batch_size = 256 + + def __init__(self, d_hid: int = default_dhid): + super().__init__() + self.mm_param0 = torch.nn.Parameter(torch.randn(d_hid, d_hid)) + self.mm_param1 = torch.nn.Parameter(torch.randn(d_hid, d_hid)) + self.register_buffer("cval", torch.randn((d_hid,), requires_grad=False)) + self.lin0 = torch.nn.Linear(d_hid, d_hid) + self.lin1 = torch.nn.Linear(d_hid, d_hid) + + def forward(self, x, y=torch.zeros(default_batch_size, default_dhid)): + x = torch.mm(x, self.mm_param0) + x = x + y + x = torch.relu(x) + # try passing a value that doesn't require_grad across skip boundaries + a_constant = self.cval.clone() + x = self.lin0(x) + pipe_split() + x = torch.relu(x) + a_constant + x = torch.mm(x, self.mm_param1) + x = self.lin1(x) + x = torch.relu(x) + return x + + +# MLP Layer +class MLPModule(torch.nn.Module): + def __init__(self, d_hid): + super().__init__() + self.net1 = torch.nn.Linear(d_hid, d_hid) + self.relu = torch.nn.ReLU() + self.net2 = torch.nn.Linear(d_hid, d_hid) + + def forward(self, x): + x = self.net1(x) + x = self.relu(x) + x = self.net2(x) + return x + + +# Multi-MLP model +class MultiMLP(torch.nn.Module): + def __init__(self, d_hid): + super().__init__() + self.mlp0 = MLPModule(d_hid) + self.mlp1 = MLPModule(d_hid) + + def forward(self, x): + x = self.mlp0(x) + pipe_split() + x = self.mlp1(x) + return x diff --git a/test/distributed/pipelining/test_pipe.py b/test/distributed/pipelining/test_pipe.py index c966a20b3cbc0..74d13111bec73 100644 --- a/test/distributed/pipelining/test_pipe.py +++ b/test/distributed/pipelining/test_pipe.py @@ -1,8 +1,15 @@ # Copyright (c) Meta Platforms, Inc. 
and affiliates # Owner(s): ["oncall: distributed"] import torch + +from model_registry import MLPModule from torch.distributed.pipelining import pipe_split, pipeline -from torch.testing._internal.common_utils import run_tests, TestCase +from torch.testing._internal.common_utils import ( + instantiate_parametrized_tests, + parametrize, + run_tests, + TestCase, +) d_hid = 512 @@ -39,21 +46,6 @@ def forward(self, x, y): return x -# MLP example -class MLPModule(torch.nn.Module): - def __init__(self, d_hid): - super().__init__() - self.net1 = torch.nn.Linear(d_hid, d_hid) - self.relu = torch.nn.ReLU() - self.net2 = torch.nn.Linear(d_hid, d_hid) - - def forward(self, x): - x = self.net1(x) - x = self.relu(x) - x = self.net2(x) - return x - - class MultiMLP(torch.nn.Module): def __init__(self): super().__init__() @@ -74,8 +66,9 @@ def forward(self, x, y): class PipeTests(TestCase): - def _test_model_split(self, model_class): - mod = model_class() + @parametrize("ModelClass", [ExampleCode, MultiMLP]) + def test_model_split(self, ModelClass): + mod = ModelClass() x = torch.randn(batch_size, d_hid) y = torch.randn(batch_size, d_hid) @@ -108,12 +101,8 @@ def _test_model_split(self, model_class): """ print("Qualname check passed") - def test_example_code(self): - self._test_model_split(ExampleCode) - - def test_multi_mlp(self): - self._test_model_split(MultiMLP) +instantiate_parametrized_tests(PipeTests) if __name__ == "__main__": run_tests() diff --git a/test/distributed/pipelining/test_schedule.py b/test/distributed/pipelining/test_schedule.py index e8e37bcf208f8..8357f3b66108d 100644 --- a/test/distributed/pipelining/test_schedule.py +++ b/test/distributed/pipelining/test_schedule.py @@ -1,13 +1,15 @@ # Copyright (c) Meta Platforms, Inc. and affiliates # Owner(s): ["oncall: distributed"] +import copy import os import sys import tempfile import torch import torch.distributed as dist + +from model_registry import ExampleCode, MultiMLP from torch.distributed.pipelining import ( - pipe_split, pipeline, PipelineStage, Schedule1F1B, @@ -32,30 +34,6 @@ torch.manual_seed(0) -class ExampleCode(torch.nn.Module): - def __init__(self): - super().__init__() - self.mm_param0 = torch.nn.Parameter(torch.randn(d_hid, d_hid)) - self.mm_param1 = torch.nn.Parameter(torch.randn(d_hid, d_hid)) - self.register_buffer("cval", torch.randn((d_hid,), requires_grad=False)) - self.lin0 = torch.nn.Linear(d_hid, d_hid) - self.lin1 = torch.nn.Linear(d_hid, d_hid) - - def forward(self, x, y=torch.zeros(batch_size, d_hid)): - x = torch.mm(x, self.mm_param0) - x = x + y - x = torch.relu(x) - # try passing a value that doesn't require_grad across skip boundaries - a_constant = self.cval.clone() - x = self.lin0(x) - pipe_split() - x = torch.relu(x) + a_constant - x = torch.mm(x, self.mm_param1) - x = self.lin1(x) - x = torch.relu(x) - return x - - class ScheduleTest(MultiProcContinousTest): @classmethod def backend_str(cls) -> str: @@ -78,7 +56,7 @@ def test_ec_forward(self): # Setting this flag for numerical stability torch.distributed.pipelining.microbatch._debug_mask_minibatches = True - mod = ExampleCode() + mod = ExampleCode(d_hid) mod.to(self.device) x = torch.randn(batch_size, d_hid, device=self.device) @@ -125,7 +103,7 @@ def test_ec_forward(self): @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") @parametrize("ScheduleClass", [ScheduleGPipe, Schedule1F1B]) def test_ec_backward(self, ScheduleClass): - mod = ExampleCode() + mod = ExampleCode(d_hid) mod.to(self.device) x = torch.randn(batch_size, 
d_hid, device=self.device) @@ -168,6 +146,79 @@ def test_ec_backward(self, ScheduleClass): torch.testing.assert_close(out, ref_out, rtol=1e-2, atol=5e-3) torch.testing.assert_close(pipe_loss, ref_loss) + @requires_nccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") + @parametrize("ScheduleClass", [ScheduleGPipe, Schedule1F1B]) + def test_grad(self, ScheduleClass): + mod = MultiMLP(d_hid) + mod.to(self.device) + + ref_mod = copy.deepcopy(mod) + x = torch.randn(batch_size, d_hid, device=self.device) + with torch.no_grad(): + y = ref_mod(x) + # Add a small perturbation + target = y + torch.randn(batch_size, d_hid, device=self.device) + + loss_fn = torch.nn.MSELoss(reduction="sum") + + # Run reference + for _ in range(2): + ref_mod.zero_grad() + ref_out = ref_mod(x) + ref_loss = loss_fn(ref_out, target) + ref_loss.backward() + + # Create a pipeline + pipe = pipeline( + mod, + chunks, + example_args=(x,), + ) + + stage = PipelineStage( + pipe, + self.rank, + device=self.device, + ) + + # Attach to a schedule + schedule = ScheduleClass(stage, chunks, loss_fn=loss_fn) + + # Run + stage_module = pipe.get_stage_module(self.rank) + for _ in range(2): + # Zero gradients + stage_module.zero_grad() + if self.rank == 0: + schedule.step(x) + elif self.rank == self.world_size - 1: + losses = [] + out = schedule.step(target=target, losses=losses) + else: + schedule.step() + + dist.barrier() + + # Last rank checks result + if self.rank == self.world_size - 1: + # Check output + torch.testing.assert_close(out, ref_out) + # Check loss + # Since the reduction used in the loss function above is "sum", we use + # "sum" here to reduce microbatch losses into a single value too. + pipe_loss = sum(losses) + torch.testing.assert_close(pipe_loss, ref_loss) + + # Every rank checks gradients + for name, p in stage_module.named_parameters(): + ref_p = ref_mod.get_parameter(name) + try: + torch.testing.assert_close(p.grad, ref_p.grad, rtol=1e-5, atol=4e-5) + except AssertionError: + print(f"Gradient test failed for {name}: {p.grad} vs {ref_p.grad}") + raise + instantiate_parametrized_tests(ScheduleTest) diff --git a/test/distributed/pipelining/test_stage_backward.py b/test/distributed/pipelining/test_stage_backward.py index 358607ab91c3f..5791f40c6102a 100644 --- a/test/distributed/pipelining/test_stage_backward.py +++ b/test/distributed/pipelining/test_stage_backward.py @@ -3,6 +3,8 @@ import copy import torch + +from model_registry import MLPModule from torch.distributed.pipelining._backward import stage_backward from torch.testing._internal.common_utils import run_tests, TestCase @@ -11,20 +13,6 @@ batch_size = 256 -class MLPModule(torch.nn.Module): - def __init__(self, d_hid): - super().__init__() - self.net1 = torch.nn.Linear(d_hid, d_hid) - self.relu = torch.nn.ReLU() - self.net2 = torch.nn.Linear(d_hid, d_hid) - - def forward(self, x): - x = self.net1(x) - x = self.relu(x) - x = self.net2(x) - return x - - class StageBackwardTests(TestCase): def test_stage_backward(self): # MLP as a stage module @@ -65,8 +53,6 @@ def test_stage_backward(self): print(f"Gradient test failed for {name}: {p.grad} vs {ref_p.grad}") raise - print("Stage backward test passed") - if __name__ == "__main__": run_tests() diff --git a/test/distributed/tensor/parallel/test_tp_examples.py b/test/distributed/tensor/parallel/test_tp_examples.py index cd038dbbb2737..eb5e6b5e5a1df 100644 --- a/test/distributed/tensor/parallel/test_tp_examples.py +++ b/test/distributed/tensor/parallel/test_tp_examples.py @@ 
-262,8 +262,11 @@ def test_transformer_training(self, is_seq_parallel=False): # Ensure model weights are still the same after update. optim.step() - with CommDebugMode() as comm_mode: - optim_tp.step() + from torch.distributed._tensor.experimental import implicit_replication + + with implicit_replication(): + with CommDebugMode() as comm_mode: + optim_tp.step() self._check_module(model, model_tp) if is_seq_parallel: self.assertDictEqual( diff --git a/test/dynamo/test_functions.py b/test/dynamo/test_functions.py index 393f81cb5e7ca..fdb23e3f590f3 100644 --- a/test/dynamo/test_functions.py +++ b/test/dynamo/test_functions.py @@ -642,6 +642,13 @@ def test_get_autocast_gpu_dtype(x): dtype = torch.get_autocast_gpu_dtype() return x.type(dtype) + @make_test + def test_is_any_autocast_enabled(x): + if torch._C._is_any_autocast_enabled(): + return x + 1 + else: + return x - 1 + @make_test def test_list_compare_polyfill(x): for a, b, c in [ @@ -1170,6 +1177,19 @@ def test_set_contains(a, b): y = a - b return x, y + def test_set_isdisjoint(self): + x = {"apple", "banana", "cherry"} + y = {"google", "microsoft", "apple"} + + def fn(a): + if x.isdisjoint(y): + return a + 1 + else: + return a - 1 + + test = make_test(fn) + test(self) + @make_test def test_tuple_iadd(a, b): output = (a, b) @@ -1317,6 +1337,13 @@ def isinstance_namedtuple(obj) -> bool: else: return a - b + @make_test + def test_torch_size_hasattr(x): + if hasattr(x.shape, "_fields"): + return x + 1 + else: + return x - 1 + @make_test def test_is_quantized(a, b): if not a.is_quantized: diff --git a/test/dynamo/test_higher_order_ops.py b/test/dynamo/test_higher_order_ops.py index c0b5bfc595363..880e761037cd9 100644 --- a/test/dynamo/test_higher_order_ops.py +++ b/test/dynamo/test_higher_order_ops.py @@ -5894,7 +5894,7 @@ def wrapper_fn(x, in_dims): actual = opt(x, 0), opt(x, 1), opt(x, 2) self.assertEqual(expected, actual) self.assertEqual(cnt.frame_count, 3) - self.assertEqual(cnt.op_count, 33) + self.assertEqual(cnt.op_count, 27) def test_vmap_multiple_invocation_out_dims(self): counters.clear() @@ -5910,7 +5910,7 @@ def wrapper_fn(x, out_dims): actual = opt(x, 0), opt(x, 1), opt(x, 2) self.assertEqual(expected, actual) self.assertEqual(cnt.frame_count, 3) - self.assertEqual(cnt.op_count, 30) + self.assertEqual(cnt.op_count, 27) def test_vmap_new_tensor_in_body(self): def fn(x): diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py index b46ab432831dc..a70e5767f3d64 100644 --- a/test/dynamo/test_misc.py +++ b/test/dynamo/test_misc.py @@ -32,6 +32,7 @@ import torch.onnx.operators import torch.utils._pytree as pytree +from torch import Tensor from torch._C import FileCheck from torch._dynamo import allow_in_graph, bytecode_analysis, bytecode_transformation from torch._dynamo.eval_frame import _debug_get_cache_entry_list @@ -864,7 +865,7 @@ def fn(x): return x + y torch._dynamo.testing.standard_test( - self, fn, 1, expected_ops=1, expected_ops_dynamic=ifdynstaticdefault(1, 10) + self, fn, 1, expected_ops=1, expected_ops_dynamic=ifdynstaticdefault(1, 4) ) def test_int_int_comparisons(self): @@ -907,10 +908,8 @@ def fn(x): out = 1 return x + out - # expect for dynamic: size, index, 6 comparison ops, add - torch._dynamo.testing.standard_test( - self, fn, 1, expected_ops=1, expected_ops_dynamic=ifdynstaticdefault(1, 9) - ) + # TODO: Test the guards maybe? 
+ torch._dynamo.testing.standard_test(self, fn, 1, expected_ops=1) def test_int_shape_comparisons(self): def fn(x): @@ -932,10 +931,8 @@ def fn(x): out = 1 return x + out - # expect for dynamic: size, index, 6 comparison ops, add - torch._dynamo.testing.standard_test( - self, fn, 1, expected_ops=1, expected_ops_dynamic=ifdynstaticdefault(1, 9) - ) + # TODO: Test the guards maybe? + torch._dynamo.testing.standard_test(self, fn, 1, expected_ops=1) def test_param_shape_binops(self): class MyModule(torch.nn.Module): @@ -1262,13 +1259,11 @@ def fn(x): y.add_(1.0) return y - # expect extra size node for dynamic torch._dynamo.testing.standard_test( self, fn, 1, expected_ops=20, - expected_ops_dynamic=ifdynstaticdefault(20, 21), ) def test_empty_list(self): @@ -1658,7 +1653,7 @@ def fn(a, b): opt_fn = torch._dynamo.optimize(cnts)(fn) self.assertEqual(opt_fn(v1, v2), correct) self.assertEqual(cnts.frame_count, 1) - self.assertEqual(cnts.op_count, 3) + self.assertEqual(cnts.op_count, 4) @patch.object(torch._dynamo.config, "capture_scalar_outputs", False) def test_tensor_item_no_capture(self): @@ -1738,13 +1733,11 @@ def fn(a): a += 1 return a - # expect 1 more op (size call) for dynamic return torch._dynamo.testing.standard_test( self, fn=fn, nargs=1, expected_ops=9, - expected_ops_dynamic=ifdynstaticdefault(9, 10), ) def test_build_tuple_unpack(self): @@ -4337,7 +4330,7 @@ def forward(self, x, ref_id): if torch._dynamo.config.assume_static_by_default: self.assertExpectedInline(cnts.op_count, """2""") else: - self.assertExpectedInline(cnts.op_count, """3""") + self.assertExpectedInline(cnts.op_count, """2""") torch._dynamo.reset() cnts = torch._dynamo.testing.CompileCounter() @@ -4347,7 +4340,7 @@ def forward(self, x, ref_id): if torch._dynamo.config.assume_static_by_default: self.assertExpectedInline(cnts.op_count, """1""") else: - self.assertExpectedInline(cnts.op_count, """2""") + self.assertExpectedInline(cnts.op_count, """1""") def test_inline_func_jump_on_tensor_condition(self): def f1(input): @@ -8518,6 +8511,28 @@ def f(lengths, values): f(torch.tensor([2, 3, 4]), torch.randn(9)) + @torch._dynamo.config.patch( + capture_scalar_outputs=True, capture_dynamic_output_shape_ops=True + ) + def test_unbacked_auto_functionalize_op(self): + @torch.library.custom_op( + "mylib::mk_image", mutates_args=("decoder",), device_types=["cpu"] + ) + def mk_image(decoder: Tensor) -> Tensor: + return torch.randn(2, 3, 4, 5) + + @torch.library.register_fake("mylib::mk_image") + def _(decoder: Tensor) -> Tensor: + image_size = [torch.library.get_ctx().new_dynamic_size() for _ in range(4)] + return torch.empty(image_size) + + @torch.compile(fullgraph=True) + def f(x): + return torch.ops.mylib.mk_image.default(x) + + x = torch.zeros(100, dtype=torch.int64) + f(x) + @torch._dynamo.config.patch(capture_scalar_outputs=True) def test_runtime_assert_replacement(self): @torch.compile(backend="aot_eager") @@ -9614,7 +9629,7 @@ def test_shape_env_equal_unbacked(self): ShapeEnv not equal: field values don't match: ==> name_to_node: values don't match. - > Left: {f0, u0, u1} + > Left: {u0, u1, zuf0} > Right: {} ==> unbacked_symfloat_counter: values don't match. > Left: 1 @@ -9623,7 +9638,7 @@ def test_shape_env_equal_unbacked(self): > Left: 2 > Right: 0 ==> var_to_range: values don't match. 
- > Left: {f0: ValueRanges(lower=-oo, upper=oo, is_bool=False), u0: ValueRanges(lower=-9223372036854775808, upper=9223372036854775807, is_bool=False), u1: ValueRanges(lower=0, upper=1, is_bool=False)} + > Left: {u0: ValueRanges(lower=-9223372036854775808, upper=9223372036854775807, is_bool=False), u1: ValueRanges(lower=0, upper=1, is_bool=False), zuf0: ValueRanges(lower=-oo, upper=oo, is_bool=False)} > Right: {} """, ) diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py index d4233ac8e0e3b..8ecfe493650d7 100644 --- a/test/dynamo/test_repros.py +++ b/test/dynamo/test_repros.py @@ -962,7 +962,7 @@ def test_do_paste_mask(self): ) # (dynamic shapes, static shapes) self.assertIn(cnt.frame_count, (5, 7)) - self.assertIn(cnt.op_count, (106, 127)) + self.assertIn(cnt.op_count, (104, 106, 127)) def test_convert_boxes_to_pooler_format(self): boxes1 = [ @@ -989,7 +989,7 @@ def test_convert_boxes_to_pooler_format(self): self.assertExpectedInline(cnt.op_count, """10""") else: self.assertExpectedInline(cnt.frame_count, """4""") - self.assertExpectedInline(cnt.op_count, """16""") + self.assertExpectedInline(cnt.op_count, """14""") def test_boxes_len(self): def fn(boxes): @@ -1194,7 +1194,7 @@ def test_hf_t5_forward(self): self.assertExpectedInline(cnt.op_count, """11""") else: self.assertExpectedInline(cnt.frame_count, """1""") - self.assertExpectedInline(cnt.op_count, """12""") + self.assertExpectedInline(cnt.op_count, """11""") def test_module_in_skipfiles(self): model = nn.Linear(10, 10) @@ -4540,29 +4540,10 @@ def f(x): """\ def forward(self, s0 : torch.SymInt, s1 : torch.SymInt, L_x_ : torch.Tensor): l_x_ = L_x_ - size = l_x_.size() - getitem = size[0]; size = None - gt = getitem > 3; getitem = None getitem_2 = l_x_[0] sum_1 = getitem_2.sum(); getitem_2 = None gt_1 = sum_1 > 0; sum_1 = None _assert_async = torch._assert_async(gt_1, 'assertion error'); gt_1 = None - size_1 = l_x_.size() - getitem_3 = size_1[0]; size_1 = None - floordiv = getitem_3 // 2; getitem_3 = None - mod = 1 % floordiv; floordiv = None - ne = mod != 0; mod = None - size_2 = l_x_.size() - getitem_5 = size_2[0]; size_2 = None - floordiv_1 = getitem_5 // 2; getitem_5 = None - pow_1 = floordiv_1 ** 2; floordiv_1 = None - mul = 32 * pow_1; pow_1 = None - size_3 = l_x_.size() - getitem_7 = size_3[0]; size_3 = None - floordiv_2 = getitem_7 // 2; getitem_7 = None - mul_1 = 16 * floordiv_2; floordiv_2 = None - sub = mul - mul_1; mul = mul_1 = None - ne_1 = sub != 0; sub = None cos = l_x_.cos(); l_x_ = None return (cos,)""", ) @@ -4933,6 +4914,40 @@ def ladder(x): opt_ladder = torch.compile(ladder, fullgraph=True, backend="eager") self.assertEqual(opt_ladder(data), ladder(data)) + @unittest.expectedFailure + def test_trace_functional_tensor_with_error(self): + from torch._subclasses.fake_tensor import FakeTensorMode + from torch._subclasses.functional_tensor import ( + FunctionalTensor, + FunctionalTensorMode, + ) + + def f(a, tmp): + a_view = a.view(-1) + with torch.no_grad(): + a.set_(tmp) + a_view.mul_(2) + return a + tmp + + fake_mode = FakeTensorMode() + with FunctionalTensorMode(): + inp = torch.ones(3, 3, requires_grad=True) + inp = fake_mode.from_tensor(inp, static_shapes=True) + inp = FunctionalTensor.to_functional(inp) + + tmp = torch.ones(3, 3, requires_grad=True) + tmp = fake_mode.from_tensor(tmp, static_shapes=True) + tmp = FunctionalTensor.to_functional(tmp) + + opt_f = torch.compile(f, backend="eager") + with self.assertRaisesRegex( + RuntimeError, "cannot mutate tensors with frozen storage" + ): + 
opt_f(inp, tmp) + + # grad state may not be properly reset after the error + self.assertTrue(torch.is_grad_enabled()) + def test_const_dict_keyerror(self): d = {} diff --git a/test/dynamo/test_subgraphs.py b/test/dynamo/test_subgraphs.py index babc33d29a96d..aae13c92e0586 100644 --- a/test/dynamo/test_subgraphs.py +++ b/test/dynamo/test_subgraphs.py @@ -439,7 +439,7 @@ def fn(a, b): x = x / (a + b) return x - self._common(fn, 1, 6) + self._common(fn, 1, 5) # item gets DCE'd @patch.object(torch._dynamo.config, "capture_scalar_outputs", False) def test_graph_break_on_item(self): diff --git a/test/dynamo/test_unspec.py b/test/dynamo/test_unspec.py index 83443a5a55763..9b49f5ff8bb6a 100644 --- a/test/dynamo/test_unspec.py +++ b/test/dynamo/test_unspec.py @@ -11,6 +11,7 @@ from torch._dynamo.comptime import comptime from torch._dynamo.testing import CompileCounter, same +from torch.testing._internal.logging_utils import logs_to_string # The intention of this test file is you should put test cases specifically @@ -485,6 +486,41 @@ def fn(x): compl_fn = torch.compile(fn, dynamic=True, backend="eager") self.assertEqual(compl_fn(inputs), fn(inputs)) + @torch._dynamo.config.patch(specialize_float=False, assume_static_by_default=True) + def test_unspec_float_input(self): + cnts = torch._dynamo.testing.CompileCounter() + + def f(x, y): + if y == 5.0: + return x + 2 + else: + return x + y + + cf = torch.compile(backend=cnts, fullgraph=True)(f) + + x = torch.randn(3) + self.assertEqual(f(x, 3.0), cf(x, 3.0)) + self.assertEqual(f(x, 4.0), cf(x, 4.0)) + self.assertExpectedInline(cnts.frame_count, """1""") # no recompile + self.assertEqual(f(x, 5.0), cf(x, 5.0)) + self.assertExpectedInline(cnts.frame_count, """2""") # guard worked + self.assertEqual(f(x, math.nan), cf(x, math.nan)) + self.assertExpectedInline(cnts.frame_count, """3""") # nan always recompiles + + @torch._dynamo.config.patch(specialize_float=False, assume_static_by_default=True) + def test_unspec_float_output(self): + cnts = torch._dynamo.testing.CompileCounter() + + def f(x, y): + return x + 1, y * 2 + + cf = torch.compile(backend=cnts, fullgraph=True)(f) + x = torch.randn(3) + + self.assertEqual(f(x, 3.0), cf(x, 3.0)) + self.assertEqual(f(x, 4.0), cf(x, 4.0)) + self.assertEqual(f(x, 5.0), cf(x, 5.0)) + @torch._dynamo.config.patch(capture_scalar_outputs=True) def test_data_dependent_evaluate_expr_graph_break(self): cnts = torch._dynamo.testing.CompileCounter() @@ -511,7 +547,26 @@ def fn(x): fn(x) self.assertExpectedInline(cnts.frame_count, """2""") - self.assertExpectedInline(cnts.op_count, """3""") + self.assertExpectedInline(cnts.op_count, """4""") + + def test_prune_torch_check(self): + log_stream, ctx = logs_to_string("torch._dynamo.output_graph", "graph_code") + + @torch.compile(fullgraph=True, dynamic=True, backend="eager") + def f(x, y): + torch._check(y + 5 == 85) + torch._check(x.size(0) == 80) + + with ctx(): + f(torch.randn(80, 100), 80) + + out = "\n".join(log_stream.getvalue().strip().split("\n")[3:]).strip() + self.assertExpectedInline( + out, + """\ +def forward(self): + return ()""", + ) @torch._dynamo.config.patch(capture_scalar_outputs=True) def test_split_aot_autograd(self): diff --git a/test/dynamo_expected_failures/TestOldSerialization.test_serialization_filelike_api_requirements b/test/dynamo_expected_failures/TestOldSerialization.test_serialization_filelike_api_requirements new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/test/dynamo_expected_failures/TestProfiler.test_profiler_strides 
b/test/dynamo_expected_failures/TestProfiler.test_profiler_strides new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/test/dynamo_expected_failures/TestSerialization.test_serialization_filelike_api_requirements b/test/dynamo_expected_failures/TestSerialization.test_serialization_filelike_api_requirements new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/test/expect/HasDecompTest.test_has_decomposition.expect b/test/expect/HasDecompTest.test_has_decomposition.expect index 98c8fa664cb98..bbd8475fd6802 100644 --- a/test/expect/HasDecompTest.test_has_decomposition.expect +++ b/test/expect/HasDecompTest.test_has_decomposition.expect @@ -335,9 +335,6 @@ aten::_functional_assert_async.msg aten::_functional_assert_scalar aten::_functional_sym_constrain_range aten::_functional_sym_constrain_range_for_size -aten::_fused_adagrad -aten::_fused_adagrad.out -aten::_fused_adagrad_ aten::_fused_adam aten::_fused_adam.out aten::_fused_adam.tensor_lr @@ -934,8 +931,6 @@ aten::min.dim_min aten::min.unary_out aten::miopen_batch_norm aten::miopen_batch_norm.out -aten::miopen_batch_norm_backward -aten::miopen_batch_norm_backward.out aten::miopen_convolution aten::miopen_convolution.out aten::miopen_convolution_add_relu diff --git a/test/export/opinfo_schema.py b/test/export/opinfo_schema.py new file mode 100644 index 0000000000000..06e0445a5fa2a --- /dev/null +++ b/test/export/opinfo_schema.py @@ -0,0 +1,108 @@ +# Owner(s): ["oncall: export"] + +import torch +from torch._dispatch.python import enable_python_dispatcher +from torch._subclasses.schema_check_mode import SchemaCheckMode +from torch.fx.operator_schemas import normalize_function +from torch.testing._internal.common_device_type import ( + instantiate_device_type_tests, + ops, +) +from torch.testing._internal.common_methods_invocations import op_db +from torch.testing._internal.common_utils import TestCase +from torch.utils._pytree import tree_map + +# Simplified naming for C++ classes +SchemaArgument = torch._C._SchemaArgument +SchemaArgType = torch._C._SchemaArgType +SchemaInfo = torch._C._SchemaInfo + +test_classes = {} + + +class PreDispatchSchemaCheckMode(SchemaCheckMode): + """ + Dispatch mode built on top of SchemaCheckMode that checks for incorrect op schemas + for PreDispatch IR. This is meant to run ops in eager mode on concrete inputs, to + see if they incorrectly claim to be functional (aliasing or mutating). + + If an op is claimed to be functional and either is detected, an error is raised. + Errors will be silenced if the schema admits aliasing or mutation - the op may + later decompose and become functional. 
+ """ + + def __init__(self): + self._dispatch_key = torch._C.DispatchKey.PreDispatch + super().__init__() + + def _may_alias_or_mutate(self, func, types, args, kwargs): + def unwrap(e): + if isinstance(e, torch.Tensor) and not type(e) == torch.Tensor: + try: + return e.elem + except AttributeError as t: + return e + return e + + # get arguments, outputs + schema_info = SchemaInfo(func._schema) + pre_arguments = normalize_function( + func, args, kwargs, normalize_to_only_use_kwargs=True + ).kwargs + schema_info.add_argument_values(pre_arguments) + out = func(*args, **kwargs) + tuple_out = out if isinstance(out, tuple) else (out,) + tuple_out = tree_map(unwrap, tuple_out) + + # check schema + for i in range(len(func._schema.arguments)): + for j in range(len(tuple_out)): + if schema_info.may_contain_alias( + SchemaArgument(SchemaArgType.output, j), + SchemaArgument(SchemaArgType.input, i), + ): + return True + if schema_info.is_mutable( + SchemaArgument(SchemaArgType.input, i), + ): + return True + + return False + + # creating this just so we have access to the offending op + def __torch_dispatch__(self, func, types, args=(), kwargs=None): + try: + return super().__torch_dispatch__(func, types, args=args, kwargs=kwargs) + except RuntimeError as e: + # check if schema claims to be either aliasing or mutating + alias_or_mutate = self._may_alias_or_mutate(func, types, args, kwargs) + if ( + not alias_or_mutate + ): # if schema is aliasing or mutating, will decompose further + msg = e.args[0] + e.args = ( + f"""SchemaCheckMode failed with the following error on op <{func}>, meaning + this op contains aliasing or mutations, despite claiming to be functional:\n\n""" + + msg, + ) + raise e + + +class TestOpInfo(TestCase): + @ops(op_db, allowed_dtypes=(torch.float, torch.int)) + def test_schema_check_op(self, device, dtype, op): + sample_inputs_itr = op.sample_inputs(device, dtype, requires_grad=False) + inputs = next(sample_inputs_itr) + args = [inputs.input] + list(inputs.args) + kwargs = inputs.kwargs + with enable_python_dispatcher(): + with PreDispatchSchemaCheckMode(): + op.op(*args, **kwargs) + + +instantiate_device_type_tests(TestOpInfo, globals()) + +if __name__ == "__main__": + from torch._dynamo.test_case import run_tests + + run_tests() diff --git a/test/export/test_export.py b/test/export/test_export.py index 586fc403da9a9..cec463fa3dc0e 100644 --- a/test/export/test_export.py +++ b/test/export/test_export.py @@ -631,7 +631,13 @@ def forward(self, x, weight, bias): self.assertEqual(actual_result, expected_result) # TODO(yidi) - @unittest.expectedFailure + # Expected failure for test cases that calls run_decomposition(). + # The top-level cond node has pre-existing metadata, + # which overrides the metadata for operators in subgraph due to interpreter.run(), + # where cond is a single node in the interpreter.run(). And we preserve metadata + # by copying current node's metadata for all nodes created during interpreting. 
+ @testing.expectedFailurePreDispatchRunDecomp + @testing.expectedFailureRetraceability def test_export_cond_preserve_torch_fn_for_subgraphs(self): class MySubModule(torch.nn.Module): def foo(self, x): @@ -2091,6 +2097,32 @@ def forward(self, x): ): export(Module(), (torch.tensor(1, device="cpu"),)) + def test_float_conversion(self): + class Module(torch.nn.Module): + def forward(self, x): + return x.float() + + ep = export(Module(), (torch.tensor(1, dtype=torch.float),)) + ops = [] + for node in ep.graph.nodes: + if node.op == "call_function": + ops.append(node.target) + self.assertGreater(len(ops), 0) + for op in ops: + self.assertIn(op, (torch.ops.aten._to_copy.default,)) + + def test_device_to_mutation_float(self): + class Module(torch.nn.Module): + def forward(self, x): + y = x.float() + y.add_(1) + return y, x + + with self.assertRaisesRegex( + RuntimeError, "cannot mutate tensors with frozen storage" + ): + export(Module(), (torch.tensor(1, dtype=torch.float),)) + def test_module(self): class MyLinear(torch.nn.Module): def __init__(self): @@ -3301,6 +3333,23 @@ def forward(self, x): test_inp = torch.ones(8, 4) self.assertTrue(torch.allclose(ep.module()(test_inp), Foo().forward(test_inp))) + @testing.expectedFailureRetraceability + def test_runtime_assert_with_size(self): + class M(torch.nn.Module): + def forward(self, x, y): + a = x.item() + torch._check_is_size(a) + torch._check(a <= y.size(0)) + return y[:a] + + ep = export( + M(), + (torch.tensor(5), torch.ones(10)), + dynamic_shapes={"x": None, "y": {0: torch.export.Dim("t")}}, + ) + inp = (torch.tensor(6), torch.randn(13)) + self.assertTrue(torch.allclose(ep.module()(*inp), M()(*inp))) + def test_issue_113041(self): class TestModule(torch.nn.Module): def __init__(self): @@ -4873,6 +4922,31 @@ def forward(self, x): unflattened = unflatten(ep) self.assertTrue(torch.allclose(m1(*inps), unflattened(*inps))) + @testing.expectedFailureRetraceability + def test_unused_aliases(self): + class Foo(torch.nn.Module): + def __init__(self): + super().__init__() + # param + self.alpha = torch.nn.Parameter(torch.randn(4)) + self.beta = self.alpha + self.gamma = self.alpha + + def forward(self, x): + return x + self.gamma + + inps = (torch.randn(4),) + ep = export(Foo(), inps) + # placeholder nodes will be deduplicated in strict-mode, + # but check that all params still appear in state dict + for param in ["alpha", "beta", "gamma"]: + self.assertTrue(param in ep.state_dict) + + # check that they also appear in unflattened state dict + unep = unflatten(ep) + for param in ["alpha", "beta", "gamma"]: + self.assertTrue(param in unep.state_dict()) + @unittest.skipIf(not torchdynamo.is_dynamo_supported(), "dynamo isn't support") class TestOneOffModelExportResult(TestCase): diff --git a/test/export/test_unflatten.py b/test/export/test_unflatten.py index b8ff48334f011..19c55982d590d 100644 --- a/test/export/test_unflatten.py +++ b/test/export/test_unflatten.py @@ -708,6 +708,44 @@ def forward(self, input_): umod = unflatten(ep_non_strict) self.assertTrue(torch.allclose(umod(input_), mod(input_))) + def test_simple_alias(self): + # handle weight sharing, check tensor ids after unflattening + class Foo(torch.nn.Module): + def __init__(self): + super().__init__() + # alias param + self.bias = torch.nn.Parameter(torch.randn(4)) + self.m = torch.nn.Linear(4, 4) + self.m.bias = self.bias + + def forward(self, x): + return self.m(x) + self.bias + + m = Foo() + inps = (torch.randn(4, 4),) + ep = export(m, inps) + unep = unflatten(ep) + 
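The new test_runtime_assert_with_size above leans on the data-dependent-size idiom: a scalar read off a tensor with .item() is opaque to the tracer, so torch._check_is_size and torch._check supply the facts needed to slice with it. A small eager-mode sketch of the idiom (illustrative only; under export or torch.compile the same calls become assertions on the unbacked symbol rather than plain runtime checks):

import torch


def take_prefix(length: torch.Tensor, values: torch.Tensor) -> torch.Tensor:
    n = length.item()                  # data-dependent scalar; an unbacked SymInt when traced
    torch._check_is_size(n)            # promises n is a valid size (n >= 0)
    torch._check(n <= values.size(0))  # promises the slice below stays in bounds
    return values[:n]


print(take_prefix(torch.tensor(3), torch.arange(10)))  # tensor([0, 1, 2])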
self.assertTrue(id(unep.m.bias) == id(unep.bias)) + + # handle aliasing where one alias is unused + class Foo(torch.nn.Module): + def __init__(self): + super().__init__() + self.bias = torch.nn.Parameter(torch.randn(4)) + self.m = torch.nn.Linear(4, 4) + self.m.bias = ( + self.bias + ) # self.bias is unused, aliasing should be handled + + def forward(self, x): + return self.m(x) + + m = Foo() + inps = (torch.randn(4, 4),) + ep = export(m, inps) + unep = unflatten(ep) + self.assertTrue(torch.allclose(unep(*inps), m(*inps))) + if __name__ == "__main__": run_tests() diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py index 5c17b7f84d0d4..ffa71a7e905b5 100644 --- a/test/functorch/test_aotdispatch.py +++ b/test/functorch/test_aotdispatch.py @@ -103,8 +103,13 @@ class AOTTestCase(TestCase): def setUp(self): + self.prev_grad_state = torch.is_grad_enabled() super().setUp() + def tearDown(self): + torch.set_grad_enabled(self.prev_grad_state) + super().tearDown() + class TestPythonKey(AOTTestCase): def test_make_fx(self, device): diff --git a/test/functorch/test_control_flow.py b/test/functorch/test_control_flow.py index a1aeb8c1de7d3..92a988d83db39 100644 --- a/test/functorch/test_control_flow.py +++ b/test/functorch/test_control_flow.py @@ -890,6 +890,42 @@ def f(x): )(inp) self.assertEqual(gm_functional(torch.zeros(1, 2)), f(torch.zeros(1, 2))) + def test_cond_subgraph_same_shape_env_as_parent(self): + def true_fn(x): + return x.sin() + 10 + + def false_fn(x): + return x.cos() - 20 + + def f(x, pred): + y = cond(pred, true_fn, false_fn, [x]) + z = torch.add(y, y) + return z + + symbolic_traced_graph = self._check_tracing(f, (torch.ones(4), True))[ + "symbolic" + ] + graph_shape_env = symbolic_traced_graph.shape_env + + def _node_shape_env_iter(gm): + for node in symbolic_traced_graph.graph.nodes: + if node.op == "call_function": + val = node.meta.get("val") + if isinstance(val, tuple): + for v in val: + yield v.fake_mode.shape_env + else: + yield val.fake_mode.shape_env + + for shape_env in _node_shape_env_iter(symbolic_traced_graph): + self.assertTrue(shape_env is graph_shape_env) + + for shape_env in _node_shape_env_iter(symbolic_traced_graph.true_graph_0): + self.assertTrue(shape_env is graph_shape_env) + + for shape_env in _node_shape_env_iter(symbolic_traced_graph.false_graph_0): + self.assertTrue(shape_env is graph_shape_env) + def test_cond_functionalized_nested(self): def true_true_fn(x): y = x.cos() diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py index e913d90dde48c..068123a78e8c5 100644 --- a/test/functorch/test_ops.py +++ b/test/functorch/test_ops.py @@ -349,8 +349,6 @@ def is_inplace(op, variant): vjp_fail = { xfail("tensor_split"), # data_ptr composite compliance - decorate("nn.functional.batch_norm", decorator=skipIfRocm), - decorate("nn.functional.instance_norm", decorator=skipIfRocm), # https://github.com/pytorch/pytorch/issues/96560 decorate("nn.functional.scaled_dot_product_attention", decorator=skipIfRocm), } @@ -569,11 +567,6 @@ def abs_if_complex(t): xfail( "NumpyExpMarkDirtyAutogradFunction" ), # TODO: https://github.com/pytorch/pytorch/issues/91280 - # https://github.com/pytorch/pytorch/issues/96560 - # ROCm: NotImplementedError - decorate("nn.functional.batch_norm", decorator=skipIfRocm), - # ROCm: NotImplementedError - decorate("nn.functional.instance_norm", decorator=skipIfRocm), # --- Non-Contiguous Failures! 
--- # This is expected to fail as the operator # expects last dim to have stride=1 @@ -1282,9 +1275,6 @@ def test_vmapvjp(self, device, dtype, op): xfail("_native_batch_norm_legit"), # TODO: implement batching rule xfail("_batch_norm_with_update"), - # https://github.com/pytorch/pytorch/issues/96560 - # ROCm: NotImplementedError - decorate("nn.functional.instance_norm", decorator=skipIfRocm), # ---------------------------------------------------------------------- } diff --git a/test/functorch/test_vmap.py b/test/functorch/test_vmap.py index a23b51da923f9..7735594c58230 100644 --- a/test/functorch/test_vmap.py +++ b/test/functorch/test_vmap.py @@ -59,7 +59,6 @@ markDynamoStrictTest, parametrize, run_tests, - skipIfRocm, skipIfTorchDynamo, subtest, TEST_WITH_TORCHDYNAMO, @@ -4200,8 +4199,6 @@ def test(): xfail("tril"), # Exception not raised on error input xfail("triu"), # Exception not raised on error input xfail("as_strided", "partial_views"), - # https://github.com/pytorch/pytorch/issues/96560 - decorate("nn.functional.batch_norm", decorator=skipIfRocm), # RuntimeError: output with shape [4, 4] doesn't match the broadcast shape [1, 4, 4] xfail("addcdiv"), xfail("addcmul"), @@ -4375,8 +4372,6 @@ def test_vmap_exhaustive(self, device, dtype, op): xfail("linalg.lu", ""), skip("linalg.ldl_solve", ""), skip("_softmax_backward_data"), - # https://github.com/pytorch/pytorch/issues/96560 - decorate("nn.functional.batch_norm", decorator=skipIfRocm), # One or more of the overload doesn't have a Batch rule. xfail("bincount"), # RuntimeError: Expected all tensors to be on the same device, diff --git a/test/inductor/extension_backends/cpp/extension_device.cpp b/test/inductor/extension_backends/cpp/extension_device.cpp index 71f3f5919a9b2..c801f9ea06837 100644 --- a/test/inductor/extension_backends/cpp/extension_device.cpp +++ b/test/inductor/extension_backends/cpp/extension_device.cpp @@ -44,7 +44,7 @@ at::Tensor custom_to_device( at::ScalarType dtype, bool non_blocking, bool copy, - c10::optional memory_format) { + std::optional memory_format) { TORCH_CHECK(self.is_cpu() || self.device().type() == c10::DeviceType::PrivateUse1, "Dummy test only allows copy from cpu -> dummy device."); TORCH_CHECK(device.is_cpu() || device.type() == c10::DeviceType::PrivateUse1, "Dummy test only allows copy from cpu -> dummy device."); // Some dummy asserts for the basic use case: inputs are the same size / dtype, all contiguous. 
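Returning briefly to test_cond_subgraph_same_shape_env_as_parent in test/functorch/test_control_flow.py above: the recipe it uses, reading node.meta["val"].fake_mode.shape_env off traced nodes, works for any symbolically traced graph, not just cond subgraphs. A compact sketch on a plain function, assuming make_fx in symbolic mode attaches FakeTensor vals the way that test relies on:

import torch
from torch.fx.experimental.proxy_tensor import make_fx


def f(x):
    return x.sin() + 1


gm = make_fx(f, tracing_mode="symbolic")(torch.ones(4))
shape_envs = {
    node.meta["val"].fake_mode.shape_env
    for node in gm.graph.nodes
    if node.op == "call_function" and isinstance(node.meta.get("val"), torch.Tensor)
}
assert len(shape_envs) == 1  # every traced op shares the graph's single ShapeEnv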
@@ -121,11 +121,11 @@ at::Tensor custom__copy_from(const at::Tensor& self, const at::Tensor& dst, bool } at::Tensor custom_empty_memory_format(at::IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional memory_format) { constexpr c10::DispatchKeySet private_use_ks(c10::DispatchKey::PrivateUse1); return at::detail::empty_generic(size, &global_custom_alloc, @@ -134,7 +134,7 @@ at::Tensor custom_empty_memory_format(at::IntArrayRef size, memory_format); } -at::Tensor custom_empty_strided(c10::IntArrayRef size, c10::IntArrayRef stride, c10::optional dtype_opt, c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) { +at::Tensor custom_empty_strided(c10::IntArrayRef size, c10::IntArrayRef stride, std::optional dtype_opt, c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) { op_counter += 1; constexpr c10::DispatchKeySet private_use_ks(c10::DispatchKey::PrivateUse1); diff --git a/test/inductor/test_benchmark_fusion.py b/test/inductor/test_benchmark_fusion.py index c2f435e9bf94e..3970148b2747f 100644 --- a/test/inductor/test_benchmark_fusion.py +++ b/test/inductor/test_benchmark_fusion.py @@ -203,7 +203,7 @@ def setUpClass(cls): { "benchmark_kernel": True, "benchmark_fusion": True, - "benchmark_multi_templates": True, + "benchmark_epilogue_fusion": True, } ) ) @@ -231,7 +231,7 @@ def foo(m, inp): torch._dynamo.reset() with unittest.mock.patch.object( - torch._inductor.config, "benchmark_multi_templates", False + torch._inductor.config, "benchmark_epilogue_fusion", False ): foo_c = torch.compile(mode="max-autotune-no-cudagraphs")(foo) with torch.no_grad(): diff --git a/test/inductor/test_compiled_optimizers.py b/test/inductor/test_compiled_optimizers.py index a8fabacd742ec..b2d0ed91809f9 100644 --- a/test/inductor/test_compiled_optimizers.py +++ b/test/inductor/test_compiled_optimizers.py @@ -46,7 +46,6 @@ OneCycleLR, PolynomialLR, ReduceLROnPlateau, - SequentialLR, StepLR, ) @@ -73,9 +72,11 @@ StepLR: {"step_size": 1, "gamma": 100}, MultiStepLR: {"milestones": [1, 2], "gamma": 100}, ExponentialLR: {"gamma": 100}, - SequentialLR: {"schedulers": None, "milestones": [1, 2]}, CosineAnnealingLR: {"T_max": 7}, - ChainedScheduler: {"schedulers": None}, + # These schedulers have memory leaks in eager + # https://github.com/pytorch/pytorch/issues/126131 + # SequentialLR: {"schedulers": None, "milestones": [1, 2]}, + # ChainedScheduler: {"schedulers": None}, CyclicLR: {"base_lr": 0.001, "max_lr": 0.02, "cycle_momentum": False}, CosineAnnealingWarmRestarts: {"T_0": 1}, OneCycleLR: { @@ -766,6 +767,25 @@ def test_get_value_on_static_address(self): self.assertEqual(ret_val, x) + # compile a large foreach op and verify + # that the time taken is within an expected range + @requires_cuda + def test_compile_time_smoketest(self): + import time + + xs = [torch.ones(2, 2, device="cuda") for _ in range(100)] + ys = [torch.ones(2, 2, device="cuda") for _ in range(100)] + + @torch.compile + def fn(xs, ys): + return torch._foreach_add(xs, ys) + + start = time.perf_counter() + fn(xs, ys) + end = time.perf_counter() + + self.assertLess(end - start, 90) + for optim_cls, name, kwargs, scheduler_cls in COMPILED_OPT_KWARG_DB: setattr( diff --git a/test/inductor/test_cuda_cpp_wrapper.py b/test/inductor/test_cuda_cpp_wrapper.py index 42df6813c63e5..5bbe588d3a84e 
100644 --- a/test/inductor/test_cuda_cpp_wrapper.py +++ b/test/inductor/test_cuda_cpp_wrapper.py @@ -99,7 +99,6 @@ class DynamicShapesCudaWrapperCudaTests(InductorTestCase): xfail_list = [ "test_bernoulli1_cuda", # cpp fallback op naming issue "test_profiler_mark_wrapper_call_cuda", - "test_randint_cuda", "test_scaled_dot_product_attention_cuda_dynamic_shapes", ] for test_name in xfail_list: diff --git a/test/inductor/test_cudagraph_trees.py b/test/inductor/test_cudagraph_trees.py index c8877d4a8e978..e0e1395c4908b 100644 --- a/test/inductor/test_cudagraph_trees.py +++ b/test/inductor/test_cudagraph_trees.py @@ -1629,9 +1629,13 @@ def test_incompatible_cudagraph_ops_item(self): def foo(x): return x.item() + # NB: This doesn't work with float, because float unbacked codegen + # is currently broken. But testing the float case here is also + # awkward, because we plan to Tensor-ify the float compute, and as + # a result we'd actually expect this to work with cuda graphs! with capture_stderr() as captured_output: - self.assertEqual(foo(torch.tensor(3.0, device="cuda")), 3.0) - self.assertEqual(foo(torch.tensor(6.0, device="cuda")), 6.0) + self.assertEqual(foo(torch.tensor(3, device="cuda")), 3) + self.assertEqual(foo(torch.tensor(6, device="cuda")), 6) # NOTE: this test is named after incompatible ops, but is not skipping due to incompatible ops. # This should get fixed. diff --git a/test/inductor/test_flex_attention.py b/test/inductor/test_flex_attention.py index 37461bc2c50a8..9df905d2ad547 100644 --- a/test/inductor/test_flex_attention.py +++ b/test/inductor/test_flex_attention.py @@ -126,6 +126,19 @@ def score_mod(score, b, h, m, n): class TestTemplatedSDPA(InductorTestCase): + def _check_equal(self, golden_out, ref_out, compiled_out, dtype): + compiled_error = (golden_out - compiled_out).abs().mean() + ref_error = (golden_out - ref_out).abs().mean() + # Note, it seems like we really are less accurate than the float32 + # computation, likely due to the online softmax + if dtype == torch.float32: + fudge_factor = 10.0 + else: + fudge_factor = 1.1 + if compiled_error > ref_error * fudge_factor: + msg = f"Compiled error {compiled_error} is greater than ref error {ref_error} by more than {fudge_factor}X." + self.assertTrue(False, msg) + def run_test( self, score_mod: Callable, @@ -145,18 +158,114 @@ def run_test( ) ref_out = sdpa_partial(q, k, v) compiled_out = compiled_sdpa(q, k, v) + self._check_equal(golden_out, ref_out, compiled_out, dtype) - compiled_error = (golden_out - compiled_out).abs().mean() - ref_error = (golden_out - ref_out).abs().mean() - # Note, it seems like we really are less accurate than the float32 - # computation, likely due to the online softmax - if dtype == torch.float32: - fudge_factor = 10.0 - else: - fudge_factor = 1.1 - if compiled_error > ref_error * fudge_factor: - msg = f"Compiled error {compiled_error} is greater than ref error {ref_error} by more than {fudge_factor}X." 
- self.assertTrue(False, msg) + def run_dynamic_test( + self, + score_mod: Callable, + dtype: torch.dtype = torch.float16, + B: int = B, + H: int = H, + S: int = S, + D: int = D, + ): + sdpa_partial = create_attention(score_mod) + # The first eager batch, shape (B, H, S, D) + q1 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + k1 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + v1 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + golden_out1 = sdpa_partial( + q1.to(torch.float64), k1.to(torch.float64), v1.to(torch.float64) + ) + ref_out1 = sdpa_partial(q1, k1, v1) + + # The second eager batch, shape (B * 2, H, S / 2, D) + B = int(B * 2) + S = int(S / 2) + q2 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + k2 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + v2 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + golden_out2 = sdpa_partial( + q2.to(torch.float64), k2.to(torch.float64), v2.to(torch.float64) + ) + ref_out2 = sdpa_partial(q2, k2, v2) + + # Need to clear dynamo counters, since flex attention eager mode also uses dynamo tracing. + # We check dynamo counters["frames"]["ok"] to ensure there is no re-compilation. + torch._dynamo.reset() + # Compiling with dynamic shape in the first batch. + compiled_sdpa = torch.compile(sdpa_partial, dynamic=True) + compiled_out1 = compiled_sdpa(q1, k1, v1) + self._check_equal(golden_out1, ref_out1, compiled_out1, dtype) + self.assertEqual(torch._dynamo.utils.counters["frames"]["ok"], 1) + + # No re-compilation, use the compiled dynamic shape version. + compiled_out2 = compiled_sdpa(q2, k2, v2) + self._check_equal(golden_out2, ref_out2, compiled_out2, dtype) + self.assertEqual(torch._dynamo.utils.counters["frames"]["ok"], 1) + + def run_automatic_dynamic_test( + self, + score_mod: Callable, + dtype: torch.dtype = torch.float16, + B: int = B, + H: int = H, + S: int = S, + D: int = D, + ): + sdpa_partial = create_attention(score_mod) + # The first eager batch, shape (B, H, S, D) + q1 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + k1 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + v1 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + golden_out1 = sdpa_partial( + q1.to(torch.float64), k1.to(torch.float64), v1.to(torch.float64) + ) + ref_out1 = sdpa_partial(q1, k1, v1) + + # The second eager batch, shape (B * 2, H, S / 2, D) + B = int(B * 2) + S = int(S / 2) + q2 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + k2 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + v2 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + golden_out2 = sdpa_partial( + q2.to(torch.float64), k2.to(torch.float64), v2.to(torch.float64) + ) + ref_out2 = sdpa_partial(q2, k2, v2) + + # The third eager batch, shape (B * 4, H, S / 4, D) + B = int(B * 2) + S = int(S / 2) + q3 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + k3 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + v3 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + golden_out3 = sdpa_partial( + q3.to(torch.float64), k3.to(torch.float64), v3.to(torch.float64) + ) + ref_out3 = sdpa_partial(q3, k3, v3) + + # Need to clear dynamo counters, since flex attention eager mode also uses dynamo tracing. + # We check dynamo counters["frames"]["ok"] to ensure: + # 1, the first batch is compiled with static shape + # 2, the second batch is compiled with dynamic shape + # 3, no re-compilation in the third batch + torch._dynamo.reset() + # The first batch. 
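The counter checks in run_dynamic_test and run_automatic_dynamic_test above generalize beyond flex attention: torch._dynamo.utils.counters["frames"]["ok"] increments once per successful compile, so it distinguishes "reused the dynamic graph" from "recompiled". A self-contained sketch of the same bookkeeping, assuming the default automatic-dynamic-shapes behavior (the first-batch compile of the test resumes just below):

import torch
import torch._dynamo
from torch._dynamo.utils import counters

torch._dynamo.reset()  # clear compile caches and counters


@torch.compile(backend="eager")
def g(x):
    return x.sin() + 1


g(torch.randn(4, 8))
assert counters["frames"]["ok"] == 1   # first call: static compile
g(torch.randn(8, 4))
assert counters["frames"]["ok"] == 2   # new sizes: one automatic-dynamic recompile
g(torch.randn(16, 2))
assert counters["frames"]["ok"] == 2   # further sizes reuse the dynamic graph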
+ compiled_sdpa = torch.compile(sdpa_partial) + compiled_out1 = compiled_sdpa(q1, k1, v1) + self._check_equal(golden_out1, ref_out1, compiled_out1, dtype) + self.assertEqual(torch._dynamo.utils.counters["frames"]["ok"], 1) + + # The second batch (automatic dynamic). + compiled_out2 = compiled_sdpa(q2, k2, v2) + self._check_equal(golden_out2, ref_out2, compiled_out2, dtype) + self.assertEqual(torch._dynamo.utils.counters["frames"]["ok"], 2) + + # The third batch (no re-compilation). + compiled_out3 = compiled_sdpa(q3, k3, v3) + self._check_equal(golden_out3, ref_out3, compiled_out3, dtype) + self.assertEqual(torch._dynamo.utils.counters["frames"]["ok"], 2) @supported_platform @common_utils.parametrize("dtype", test_dtypes) @@ -164,6 +273,20 @@ def run_test( def test_builtin_score_mods(self, dtype: torch.dtype, score_mod: Callable): self.run_test(score_mod, dtype) + @supported_platform + @common_utils.parametrize("dtype", test_dtypes) + @common_utils.parametrize("score_mod", test_score_mods) + def test_builtin_score_mods_dynamic(self, dtype: torch.dtype, score_mod: Callable): + self.run_dynamic_test(score_mod, dtype) + + @supported_platform + @common_utils.parametrize("dtype", test_dtypes) + @common_utils.parametrize("score_mod", test_score_mods) + def test_builtin_score_mods_automatic_dynamic( + self, dtype: torch.dtype, score_mod: Callable + ): + self.run_automatic_dynamic_test(score_mod, dtype) + @supported_platform @common_utils.parametrize("dtype", test_dtypes) def test_skip_odd_keys(self, dtype: torch.dtype): @@ -289,7 +412,51 @@ def natten_mask(score, b, h, q, kv): self.run_test(natten_mask, dtype) @supported_platform - @expectedFailure + @common_utils.parametrize("dtype", test_dtypes_fast) + def test_subgraph_respect_decompostion(self, dtype): + from torch._decomp import core_aten_decompositions + from torch.fx.experimental.proxy_tensor import make_fx + + def score_mod_func(score, b, h, q, kv): + return score - q // (1 + kv) + + make_tensor = functools.partial( + torch.randn, + (2, 2, 8, 4), + device="cuda", + dtype=torch.float64, + requires_grad=True, + ) + query, key, value = make_tensor(), make_tensor(), make_tensor() + # floor_div is not decomposed in decompostion_table is empty + gm = make_fx(_flex_attention, decomposition_table={})( + query, key, value, score_mod_func + ) + self.assertExpectedInline( + gm.sdpa_score0.code.strip(), + """\ +def forward(self, arg0_1, arg1_1, arg2_1, arg3_1, arg4_1): + add = torch.ops.aten.add.Tensor(arg4_1, 1); arg4_1 = None + floor_divide = torch.ops.aten.floor_divide.default(arg3_1, add); arg3_1 = add = None + sub = torch.ops.aten.sub.Tensor(arg0_1, floor_divide); arg0_1 = floor_divide = None + return sub""", + ) + + # floor_div is decomposed for core_aten_decompositions + gm = make_fx(_flex_attention, decomposition_table=core_aten_decompositions())( + query, key, value, score_mod_func + ) + self.assertExpectedInline( + gm.sdpa_score0.code.strip(), + """\ +def forward(self, arg0_1, arg1_1, arg2_1, arg3_1, arg4_1): + add = torch.ops.aten.add.Tensor(arg4_1, 1); arg4_1 = None + div = torch.ops.aten.div.Tensor_mode(arg3_1, add, rounding_mode = 'floor'); arg3_1 = add = None + sub = torch.ops.aten.sub.Tensor(arg0_1, div); arg0_1 = div = None + return sub""", + ) + + @supported_platform @common_utils.parametrize("dtype", test_dtypes_fast) def test_silu_on_score(self, dtype): def silu_score(score, b, h, q, kv): diff --git a/test/inductor/test_max_autotune.py b/test/inductor/test_max_autotune.py index c8622de6faf8f..c5f0afa118f87 100644 --- 
a/test/inductor/test_max_autotune.py +++ b/test/inductor/test_max_autotune.py @@ -622,6 +622,82 @@ def f(x, weight): def test_empty_conv_input_with_1x1_kernel(self): self.test_empty_conv_input(kernel_size=1) + def test_non_contiguous_input_mm(self): + """ + Make sure the triton template can work with non-contiguous inputs without crash. + Check https://github.com/pytorch/pytorch/issues/125437 for more details. + """ + x = torch.empty_strided( + (50257, 32768), (1, 50304), dtype=torch.bfloat16, device="cuda" + ) + y = torch.empty_strided( + (32768, 768), (768, 1), dtype=torch.bfloat16, device="cuda" + ) + + @torch.compile(mode="max-autotune") + def f(x, y): + return x @ y + + ref = x @ y + act = f(x, y) + self.assertTrue(torch.allclose(ref, act, atol=4 * 1e-3, rtol=4 * 1e-3)) + + def test_non_contiguous_input_addmm(self): + b = torch.empty((768), dtype=torch.bfloat16, device="cuda") + x = torch.empty_strided( + (50257, 32768), (1, 50304), dtype=torch.bfloat16, device="cuda" + ) + y = torch.empty_strided( + (32768, 768), (768, 1), dtype=torch.bfloat16, device="cuda" + ) + + @torch.compile(mode="max-autotune") + def f(x, y): + return torch.addmm(b, x, y) + + ref = torch.addmm(b, x, y) + act = f(x, y) + self.assertTrue(torch.allclose(ref, act, atol=4 * 1e-3, rtol=4 * 1e-3)) + + def test_non_contiguous_input_bmm(self): + x = torch.empty_strided( + (1, 50257, 32768), (0, 1, 50304), dtype=torch.bfloat16, device="cuda" + ) + y = torch.empty_strided( + (1, 32768, 768), (0, 768, 1), dtype=torch.bfloat16, device="cuda" + ) + + @torch.compile(mode="max-autotune") + def f(x, y): + return torch.bmm(x, y) + + ref = torch.bmm(x, y) + act = f(x, y) + self.assertTrue(torch.allclose(ref, act, atol=4 * 1e-3, rtol=4 * 1e-3)) + + def test_non_contiguous_input_mm_plus_mm(self): + x1 = torch.empty_strided( + (50257, 32768), (1, 50304), dtype=torch.bfloat16, device="cuda" + ) + y1 = torch.empty_strided( + (32768, 768), (768, 1), dtype=torch.bfloat16, device="cuda" + ) + + x2 = torch.empty_strided( + (50257, 32768), (1, 50304), dtype=torch.bfloat16, device="cuda" + ) + y2 = torch.empty_strided( + (32768, 768), (768, 1), dtype=torch.bfloat16, device="cuda" + ) + + @torch.compile(mode="max-autotune") + def f(x1, y1, x2, y2): + return x1 @ y1 + x2 @ y2 + + ref = x1 @ y1 + x2 @ y2 + act = f(x1, y1, x2, y2) + self.assertTrue(torch.allclose(ref, act, atol=4 * 1e-3, rtol=4 * 1e-3)) + class TestBenchmarkRequest(BenchmarkRequest): def __init__( diff --git a/test/inductor/test_pad_mm.py b/test/inductor/test_pad_mm.py index 5aeb4d01edbd0..b16e5e5d62edf 100644 --- a/test/inductor/test_pad_mm.py +++ b/test/inductor/test_pad_mm.py @@ -5,10 +5,15 @@ import torch._inductor.config as inductor_config from torch._dynamo.testing import rand_strided -from torch._inductor.fx_passes.pad_mm import get_alignment_size, get_padded_length +from torch._inductor.fx_passes.pad_mm import ( + get_alignment_size, + get_pad_cache, + get_padded_length, + should_pad_common, +) from torch._inductor.test_case import run_tests, TestCase -from torch._inductor.utils import run_and_get_code +from torch._inductor.utils import fresh_inductor_cache, run_and_get_code from torch.testing import FileCheck from torch.testing._internal.inductor_utils import HAS_CUDA @@ -125,7 +130,7 @@ def forward(self, a, b): b = rand_strided((K, N), (1, K), device="cuda", dtype=torch.float32) # TODO: Getting the alignment right requires pattern matcher to # run on newly added nodes - aligned_m = get_padded_length(M, get_alignment_size(a)) + M - 3 + aligned_m = get_padded_length(M, 
get_alignment_size(a)) + M torch._dynamo.mark_dynamic(a, 1) torch._dynamo.mark_dynamic(b, 0) with unittest.mock.patch( @@ -312,6 +317,103 @@ def forward(self, a, b, c): FileCheck().check(f"K = {K}").run(code) self.assertEqual(res1, res2) + @inductor_config.patch(force_shape_pad=True) + def test_pad_single_cat(self): + @torch.compile() + def foo(x, y): + return x @ y + + inps = [torch.rand([5, 5], device="cuda") for _ in range(2)] + out = foo(*inps) + self.assertEqual(out, inps[0] @ inps[1]) + + @inductor_config.patch(force_shape_pad=True) + @fresh_inductor_cache() + def test_pad_addmm_2d_bias(self): + @torch.compile() + def foo(input, x, y): + return torch.ops.aten.addmm(input, x, y) + + for a in [1, 4]: + for b in [1, 6]: + inps = ( + torch.rand([a, b], device="cuda"), + torch.rand([4, 5], device="cuda"), + torch.rand([5, 6], device="cuda"), + ) + out = foo(*inps) + out_eager = torch.ops.aten.addmm(*inps) + self.assertEqual(out, out_eager) + + for a in [1, 6]: + inps = ( + torch.rand([a], device="cuda"), + torch.rand([4, 5], device="cuda"), + torch.rand([5, 6], device="cuda"), + ) + out = foo(*inps) + out_eager = torch.ops.aten.addmm(*inps) + self.assertEqual(out, out_eager) + + @inductor_config.patch(force_shape_pad=True) + def test_pad_batch(self): + m = 6 + n = 9 + k = 11 + batch_size = 3 + mat1 = torch.ones((batch_size, m, k), device="cuda", dtype=torch.float16) + mat2 = torch.ones((batch_size, k, n), device="cuda", dtype=torch.float16) + expected_alignment = get_alignment_size(mat1) + + assert expected_alignment == 8, "Alignment for float16 should be 8" + assert should_pad_common( + mat1, mat2 + ), "This should pass the common padding criteria" + + @torch.compile() + def bmm(mat1, mat2): + return torch.bmm(mat1, mat2) + + res2, (code,) = run_and_get_code(bmm, mat1, mat2) + bmm_expected_result = torch.bmm(mat1, mat2) + # in call code, expect to see a single pad per input, and then we should see padded allocation for output + FileCheck().check("del async_compile").check_count( + ".run(", 2, exactly=True + ).check("empty_strided_cuda((3, 8, 16)").run(code) + + assert torch.allclose( + res2, bmm_expected_result + ), "BMM results are not identical" + + @fresh_inductor_cache() + def test_exclude_padding(self): + @torch.compile() + def mm(a, b): + return a @ b + + mm(torch.rand([25, 25], device="cuda"), torch.rand([25, 25], device="cuda")) + local_cache = get_pad_cache().get_local_cache() + self.assertTrue(len(local_cache) == 2) + FileCheck().check_count("exclude_pad:False", 2, exactly=True).run( + repr(local_cache) + ) + + @torch.compile() + def mm(a, b): + return (a + 1) @ b + + mm(torch.rand([25, 25], device="cuda"), torch.rand([25, 25], device="cuda")) + local_cache = get_pad_cache().get_local_cache() + # reuse original base timing + self.assertTrue(len(local_cache) == 3) + + FileCheck().check_count("exclude_pad:False", 3, exactly=True).run( + repr(local_cache) + ) + FileCheck().check_count("exclude_pad:True", 1, exactly=True).run( + repr(local_cache) + ) + if __name__ == "__main__": if HAS_CUDA: diff --git a/test/inductor/test_padding.py b/test/inductor/test_padding.py index 7aef585842e61..e08ac285801d7 100644 --- a/test/inductor/test_padding.py +++ b/test/inductor/test_padding.py @@ -7,9 +7,9 @@ import torch from torch import nn, Tensor +from torch._dynamo.convert_frame import maybe_cprofile from torch._dynamo.test_case import run_tests, TestCase from torch._dynamo.testing import rand_strided, reduce_to_scalar_loss -from torch._dynamo.utils import maybe_cprofile from torch._inductor 
import config, ir, metrics from torch._inductor.fx_passes import pad_mm as pad_mm_pass from torch._inductor.runtime.runtime_utils import do_bench diff --git a/test/inductor/test_pattern_matcher.py b/test/inductor/test_pattern_matcher.py index fc1a9a5ec507d..cc7c3f7084c88 100644 --- a/test/inductor/test_pattern_matcher.py +++ b/test/inductor/test_pattern_matcher.py @@ -6,6 +6,7 @@ import torch import torch._dynamo.config as dynamo_config import torch._inductor.config as inductor_config +import torch.nn.functional as F from torch._dynamo.utils import count_calls, counters from torch._higher_order_ops.out_dtype import out_dtype from torch._inductor.fx_passes import joint_graph @@ -28,6 +29,7 @@ from torch.testing._internal.common_cuda import SM80OrLater from torch.testing._internal.common_utils import IS_LINUX, skipIfRocm from torch.testing._internal.inductor_utils import HAS_CUDA +from torch.utils import _pytree as pytree class TestPatternMatcher(TestCase): @@ -38,15 +40,22 @@ def common( expected_matches, expected_nodes, additional_check=lambda code: None, + reference_in_float=False, ): counters.clear() torch.manual_seed(42) - expected = fn(*args) + if reference_in_float: + ref_inputs = pytree.tree_map_only( + torch.Tensor, lambda x: x.to(torch.float32), args + ) + else: + ref_inputs = args + expected = fn(*ref_inputs) torch.manual_seed(42) actual, codes = run_and_get_code(torch.compile(fn), *args) if len(codes) == 1: codes = codes[0] - torch.testing.assert_close(actual, expected) + torch.testing.assert_close(actual, expected, check_dtype=not reference_in_float) self.assertEqual( counters["inductor"]["pattern_matcher_count"], expected_matches @@ -1170,6 +1179,46 @@ def fn1(a, b): stable_topological_sort(graph) self.assertEqual(list(graph.nodes), [b, a, c]) + def test_scaled_softmax(self): + def mul_softmax(a, b): + return F.softmax(a * b, dim=0) + + def div_softmax(x, inv_scale): + return F.softmax(x / inv_scale, dim=0) + + x = torch.randn(10, 10) + scale = 1e6 + inv_scale = 1 / scale + self.common(mul_softmax, (x, scale), 1, 3) + self.common(mul_softmax, (scale, x), 1, 3) + self.common(div_softmax, (x, inv_scale), 1, 3) + + scale = torch.randn(10) * 1e6 + inv_scale = 1 / scale + self.common(mul_softmax, (x, scale), 1, 3) + self.common(mul_softmax, (scale, x), 1, 3) + self.common(div_softmax, (x, inv_scale), 1, 3) + + scale = torch.randn(1, 10) * 1e6 + inv_scale = 1 / scale + self.common(mul_softmax, (x, scale), 1, 3) + self.common(mul_softmax, (scale, x), 1, 3) + self.common(div_softmax, (x, inv_scale), 1, 3) + + # Test matching with type promotion + x = torch.randn(10, 10, dtype=torch.bfloat16) + scale = torch.randn(10, dtype=torch.bfloat16) * 1e6 + inv_scale = 1 / scale + self.common(mul_softmax, (x, scale), 1, 4, reference_in_float=True) + self.common(mul_softmax, (scale, x), 1, 4, reference_in_float=True) + self.common(div_softmax, (x, inv_scale), 1, 4, reference_in_float=True) + + # No match if scale changes in softmax dim + scale = torch.randn(10, 10) + self.common(mul_softmax, (x, scale), 0, 0) + self.common(mul_softmax, (scale, x), 0, 0) + self.common(div_softmax, (x, scale), 0, 0) + if __name__ == "__main__": if IS_LINUX and HAS_CUDA: diff --git a/test/inductor/test_perf.py b/test/inductor/test_perf.py index c4394b3964865..09e913350e143 100644 --- a/test/inductor/test_perf.py +++ b/test/inductor/test_perf.py @@ -240,9 +240,8 @@ def f(a, b): def f(a, b): return torch.cat([torch.softmax(a, dim=-1), torch.softmax(b, dim=-1)]).cos() - # potentially beneficial to fuse but we exclude 
reductions from pointwise cat inp = (T(10, 10), T(10, 10)) - self.assertExpectedInline(count_numel(f, *inp), """800""") + self.assertExpectedInline(count_numel(f, *inp), """680""") # Should turn into pointwise even if only some of inputs are pointwise. def f(a, b): @@ -267,6 +266,13 @@ def f(a, b): inp = (T(10, 10), T(10, 10)) self.assertExpectedInline(count_numel(f, *inp), """400""") + def f(a, b): + b = b.cos() + return torch.cat([a, b]) + + inp = (T(10, 10), T(10, 10)) + self.assertExpectedInline(count_numel(f, *inp), """400""") + @patch.object(config, "split_cat_fx_passes", False) @patch.object( config, diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py index c74f776c1527a..3a7b66d660658 100644 --- a/test/inductor/test_torchinductor.py +++ b/test/inductor/test_torchinductor.py @@ -17,7 +17,9 @@ import time import typing import unittest +import unittest.mock import weakref +from pathlib import Path from typing import Tuple from unittest.mock import patch @@ -40,6 +42,9 @@ from torch._inductor.test_case import TestCase as InductorTestCase from torch._inductor.utils import ( add_scheduler_init_hook, + aoti_compile_with_persistent_cache, + aoti_eager_cache_dir, + load_aoti_eager_cache, run_and_get_code, run_and_get_triton_code, ) @@ -81,7 +86,6 @@ from torch.utils import _pytree as pytree from torch.utils._python_dispatch import TorchDispatchMode from torch.utils._pytree import tree_flatten, tree_unflatten -from torch.utils._triton import has_triton from torch.utils.weak import WeakTensorKeyDictionary DO_PERF_TEST = os.environ.get("DO_PERF_TEST") == "1" @@ -761,6 +765,102 @@ def fn(a, b): ), ) + @skipCUDAIf(not SM80OrLater, "Requires sm80") + def test_eager_aoti_cache_hit(self): + ns = "aten" + op_name = "abs" + dispatch_key = "CPU" + device = "cpu" + if self.device.lower() == "cuda": + dispatch_key = "CUDA" + device = "cuda" + + input_tensor = torch.randn(128, dtype=torch.float, device=device) + kernel_lib_path = aoti_compile_with_persistent_cache( + ns, + op_name, + device, + False, + getattr(torch.ops.aten, op_name), + (input_tensor,), + {}, + ) + self.assertTrue(Path(kernel_lib_path).exists()) + + from unittest import mock + + # Patch the aoti_compile_with_persistent_cache as None to ensure no new kernel is generated + with mock.patch( + "torch._inductor.utils.aoti_compile_with_persistent_cache", None + ): + qualified_op_name = f"{ns}::{op_name}" + _, overload_names = torch._C._jit_get_operation(qualified_op_name) + + with _scoped_library("aten", "IMPL") as torch_compile_op_lib_impl: + # Get ref result from eager + ref_value = getattr(torch.ops.aten, op_name)(input_tensor) + + for overload_name in overload_names: + try: + reg_op_name = qualified_op_name + schema = torch._C._get_schema(qualified_op_name, overload_name) + if schema.overload_name: + reg_op_name = f"{qualified_op_name}.{schema.overload_name}" + torch_compile_op_lib_impl._impl_with_aoti_compile( # noqa: F821 + reg_op_name, dispatch_key + ) + except Exception as e: + continue + + # Invoke the pre-compiled kernel and get result. 
+ res_value = getattr(torch.ops.aten, op_name)(input_tensor) + + self.assertEqual(ref_value, res_value) + + @skipCUDAIf(not SM80OrLater, "Requires sm80") + def test_aoti_compile_with_persistent_cache(self): + def fn(a): + return torch.abs(a) + + ns = "aten" + op_name = "abs" + + device = "cpu" + if self.device.lower() == "cuda": + device = "cuda" + + input_tensor = torch.randn(128, dtype=torch.float, device=device) + kernel_lib_path = aoti_compile_with_persistent_cache( + ns, + op_name, + input_tensor.device.type, + False, + fn, + args=(input_tensor,), + kwargs={}, + ) + self.assertTrue(len(kernel_lib_path) > 0) + + device_kernel_cache = aoti_eager_cache_dir(ns, device) + kernel_conf = device_kernel_cache / f"{op_name}.json" + self.assertTrue(kernel_conf.exists()) + + json_data = load_aoti_eager_cache("aten", "abs", input_tensor.device.type) + self.assertTrue(json_data is not None) + self.assertTrue(isinstance(json_data, list)) + self.assertTrue(len(json_data) > 0) + + op_info = json_data[0] + self.assertTrue(isinstance(op_info, dict)) + self.assertTrue("meta_info" in op_info) + self.assertTrue("kernel_path" in op_info) + kernel_libs_abs_path = [] + for item in json_data: + kernel_path = device_kernel_cache / item["kernel_path"] + kernel_libs_abs_path.append(kernel_path.as_posix()) + + self.assertTrue(kernel_lib_path in kernel_libs_abs_path) + @skipCUDAIf(not SM80OrLater, "Requires sm80") def test_torch_compile_override_registration(self): dynamic = False @@ -1105,6 +1205,22 @@ def repeat(x, n): self.assertEqual(expect, actual) self.assertEqual(actual, repeat(x, 3)) + def test_index_propagation_abs(self): + def reflection_pad_left(x, n): + # e.g. x=[1, 2, 3], n=2 => returns [3, 2, 1, 2, 3] + i = torch.arange(x.shape[0] + n, device=x.device) + return x[(i - n).abs()] + + x = torch.randn(8, device=self.device) + opt_fn = torch._dynamo.optimize("inductor")(reflection_pad_left) + + # this should be collapsed to direct indexing + actual = _run_and_assert_no_indirect_indexing( + self, opt_fn, x, 3, has_wrapping=False + ) + expect = reflection_pad_left(x, 3) + self.assertEqual(expect, actual) + @skipIfRocm @config.patch(debug_index_asserts=False) def test_neg_index(self): @@ -2222,14 +2338,74 @@ def fn_int_input(a, i): def test_div_precision(self): # Reproducer for https://github.com/pytorch/pytorch/issues/101039 - def forward(y): - z = y.div(1e-06) + def forward(x, y): + z = x.div(y) return F.softmax(z, dim=-1) query = torch.randn(1, 10, 40) key = torch.randn(1, 2, 40) - y = torch.matmul(query, key.transpose(-2, -1)) - self.common(forward, (y,)) + x = torch.matmul(query, key.transpose(-2, -1)) + self.common(forward, (x, 1e-6)) + + x = torch.tensor( + [ + [ + [ + [-16.1649, 5.6846, -5.1022, -9.1134], + [-11.5552, -2.2615, -12.8913, 10.6538], + [-7.1666, -5.3333, 2.0776, -9.7984], + [7.4469, -2.3948, 2.7371, 0.9201], + ], + [ + [-8.0361, -16.3771, 22.7741, 4.4685], + [20.8047, -0.7771, -2.4355, -2.2299], + [3.8343, -2.0914, -2.4077, 2.2740], + [-15.8663, -2.7015, -12.5241, -3.0040], + ], + [ + [-2.5139, 14.4393, -3.7186, 1.2255], + [5.6742, 14.1842, -8.5976, 16.8366], + [-9.7358, -3.0279, 11.8164, -4.0787], + [-9.0621, 8.2580, 29.9486, -2.4107], + ], + [ + [7.3622, 12.5640, -20.5592, 13.6237], + [-11.5640, 0.8832, 16.7275, -2.5009], + [-2.0953, -12.2276, -26.2633, 4.5268], + [15.3329, -11.7492, 6.5650, -9.2483], + ], + ], + [ + [ + [7.9980, -4.9369, 3.1508, 5.2994], + [3.8052, 3.9514, 8.4987, -10.5045], + [-2.6827, -4.0010, -4.0611, 6.4091], + [-19.0318, 6.4073, 2.8923, 8.0250], + ], + [ + 
[7.1650, -3.4585, 5.7720, -5.0305], + [-0.9765, -3.0086, 11.7114, 8.0555], + [-3.1027, -3.5514, 9.6182, -8.8526], + [-9.2348, -6.0239, 6.2528, -6.7221], + ], + [ + [11.5936, 22.4139, -0.4089, -4.9889], + [14.8217, -2.3426, -17.6189, 3.7427], + [1.9546, -13.0902, 8.6293, -7.2457], + [-7.6900, -4.5796, 9.6332, -10.2631], + ], + [ + [0.8027, -1.0955, 14.8404, -0.2673], + [3.2143, -1.8640, -2.9678, 6.5165], + [-3.9865, 6.5230, 6.3019, -0.4247], + [8.3185, -13.5076, 27.0986, -1.6792], + ], + ], + ] + ) + x = torch.matmul(x, x) + y = torch.tensor([[[0.6331]], [[1.6358]], [[-0.3459]], [[1.0196]]]) + self.common(forward, (x, y)) def test_div_by_zero(self): def fn(x, runtime_zero, runtime_neg_zero): @@ -5949,6 +6125,7 @@ def fn(a, b): (a, b), ) + @skipIfXpu def test_nll_loss_backward(self): def fn(a, b, c): return aten.nll_loss_backward( @@ -9786,6 +9963,7 @@ def fn(n): res = torch.compile(fn)(20) self.assertTrue(torch.all((0 <= res) & (res < 10)).item()) + @torch._inductor.config.patch(force_shape_pad=True) def test_should_pad_bench_for_bmm(self): B = 2 M = 1024 @@ -9795,25 +9973,9 @@ def test_should_pad_bench_for_bmm(self): mat1 = torch.rand(B, M, K, device=self.device) mat2 = torch.rand(B, K, N, device=self.device) - def return_true(*args, **kwargs): - return True - - # return value of is_mm_compute_bound depends on flops and membw of - # the GPU. Mock it so the test does not becomes flaky when running - # on different GPUs. - patch1 = patch.object(pad_mm, "is_mm_compute_bound", return_true) - # mock get_cached_should_pad so the test does not rely on benchmarking - # result. - patch2 = patch.object(pad_mm, "get_cached_should_pad", return_true) + should_pad = pad_mm.should_pad_bench(None, mat1, mat2, torch.ops.aten.bmm) - with patch1, patch2: - should_pad = pad_mm.should_pad_bench(mat1, mat2, torch.ops.aten.bmm) - - if has_triton(): - self.assertTrue(should_pad) - else: - # should_pad_bench always returns False if has_triton returns False - self.assertFalse(should_pad) + self.assertTrue(should_pad) @parametrize( "name, op", diff --git a/test/inductor/test_torchinductor_dynamic_shapes.py b/test/inductor/test_torchinductor_dynamic_shapes.py index c5200a6014241..9ee63752f8e0d 100644 --- a/test/inductor/test_torchinductor_dynamic_shapes.py +++ b/test/inductor/test_torchinductor_dynamic_shapes.py @@ -315,6 +315,7 @@ def f(x, r): f(torch.tensor([3], device=device), torch.randn(10, device=device)) + @unittest.expectedFailure @torch._dynamo.config.patch( capture_scalar_outputs=True, capture_dynamic_output_shape_ops=True ) diff --git a/test/profiler/test_profiler.py b/test/profiler/test_profiler.py index 87df89f04d574..c9591a747d77e 100644 --- a/test/profiler/test_profiler.py +++ b/test/profiler/test_profiler.py @@ -16,9 +16,12 @@ import collections import gc import json +import mmap import os import pickle +import random import re +import struct import subprocess import sys import threading @@ -64,7 +67,9 @@ from torch.testing._internal.common_device_type import skipCUDAVersionIn from torch.testing._internal.common_utils import ( instantiate_parametrized_tests, + IS_ARM64, IS_JETSON, + IS_LINUX, IS_WINDOWS, parametrize, run_tests, @@ -1216,6 +1221,26 @@ def test_profiler_op_event_args(self): f"Failed finding record funciont for op = {e}", ) + def test_profiler_strides(self): + torch._C._profiler._set_record_concrete_inputs_enabled_val(True) + base_tensor = torch.randn(1024, dtype=torch.float32) + a = base_tensor.as_strided((16, 16), (17, 1), 0) + b = base_tensor.as_strided((16, 16), (25, 2), 272) + with 
_profile(record_shapes=True) as prof: + c = torch.add(a, b) + + with TemporaryFileName(mode="w+") as fname: + prof.export_chrome_trace(fname) + with open(fname) as f: + j = json.load(f) + op_events = [ + e for e in j["traceEvents"] if e.get("cat", "") == "cpu_op" + ] + for e in op_events: + args = e["args"] + if e["name"] == "aten::add": + self.assertEqual(args["Input Strides"], [[17, 1], [25, 2], []]) + def test_profiler_fwd_bwd_link(self): with _profile(use_kineto=True) as prof: t1, t2 = torch.ones(1, requires_grad=True), torch.ones( @@ -2416,6 +2441,70 @@ def test_profiler_pattern_matcher_json_report(self): finally: os.remove("torchtidy_report.json") + @unittest.skipIf(IS_ARM64 or not IS_LINUX, "x86 linux only cpp unwinding") + def test_fuzz_symbolize(self): + # generate some random addresses in the text section and make sure the + # symbolizers do not throw exceptions/crash + def get_text_sections(): + text_sections = [] + seen = set() + for filename in os.listdir("/proc/self/map_files"): + library = os.readlink("/proc/self/map_files/" + filename) + if ".so" not in library or library in seen: + continue + seen.add(library) + with open(os.path.join("/proc/self/map_files", library), "rb") as f: + mm = mmap.mmap(f.fileno(), 0, prot=mmap.PROT_READ) + + def unpack(fmt, offset): + return struct.unpack( + fmt, mm[offset : offset + struct.calcsize(fmt)] + ) + + if mm[:4] != b"\x7fELF": + continue + (section_headers_start,) = unpack("Q", 40) + (section_header_size,) = unpack("H", 58) + (num_section_headers,) = unpack("H", 60) + (shstrndx,) = unpack("H", 62) + (shstrtab_offset,) = unpack( + "Q", section_headers_start + shstrndx * section_header_size + 24 + ) + for i in range(num_section_headers): + (section_name_offset,) = unpack( + "I", section_headers_start + i * section_header_size + ) + name_start = shstrtab_offset + section_name_offset + section_name = mm[name_start : name_start + 6] + if section_name != b".text\0": + continue + (section_offset,) = unpack( + "Q", section_headers_start + i * section_header_size + 24 + ) + (section_size,) = unpack( + "Q", section_headers_start + i * section_header_size + 32 + ) + start = int(filename.split("-")[0], 16) + section_offset + text_sections.append((start, section_size)) + break + mm.close() + return text_sections + + r = random.Random() + r.seed(1) + text_sections = get_text_sections() + addrs = [] + for i in range(200): + s = r.randrange(0, len(text_sections)) + start, size = text_sections[s] + addr = r.randrange(start, start + size) + addrs.append(addr) + fast = torch._C._profiler.symbolize_addresses(addrs, "fast") + dladdr = torch._C._profiler.symbolize_addresses(addrs, "dladdr") + addr2line = torch._C._profiler.symbolize_addresses(addrs, "addr2line") + self.assertEqual(len(fast), len(addrs)) + self.assertEqual(len(addr2line), len(fast)) + if __name__ == "__main__": run_tests() diff --git a/test/quantization/core/test_quantized_tensor.py b/test/quantization/core/test_quantized_tensor.py index 00462023b6b5b..64c939c7e6846 100644 --- a/test/quantization/core/test_quantized_tensor.py +++ b/test/quantization/core/test_quantized_tensor.py @@ -1615,6 +1615,58 @@ def test_decomposed_choose_qparams_per_token_asymmetric_backward(self): out = x.div(s).add(zp).round() out.sum().backward() + def test_decomposed_quantize_per_channel_group(self): + # register the ops + import torch.ao.quantization.fx._decomposed + qmin, qmax = (-8, 7) + group_size = 128 + x = torch.randn(100, 256) + s = torch.randn(100, 2) + zp = torch.randint(qmax, size=(100, 2), 
dtype=torch.int32) + + # simulate fake quantize per channel group with qdq + q = torch.ops.quantized_decomposed.quantize_per_channel_group( + x, s, zp, qmin, qmax, torch.int8, group_size, + ) + dq = torch.ops.quantized_decomposed.dequantize_per_channel_group( + q, s, zp, qmin, qmax, torch.int8, group_size, torch.float32 + ) + + # express per group fake quant using `torch.fake_quantize_per_channel_affine` + x_grouped = x.reshape(-1, group_size) + s_flattened = s.flatten() + zp_flattened = zp.flatten() + fq = torch.fake_quantize_per_channel_affine( + x_grouped, s_flattened, zp_flattened, 0, qmin, qmax, + ) + fq = fq.reshape_as(x) + torch.testing.assert_close(dq, fq, rtol=0, atol=0) + + def test_decomposed_quantize_per_token(self): + # register the ops + import torch.ao.quantization.fx._decomposed + qmin, qmax = (-8, 7) + x = torch.randn(100, 256) + s = torch.randn(100, 1) + zp = torch.randint(qmax, size=(100, 1), dtype=torch.int32) + + # simulate fake quantize per token with qdq + q = torch.ops.quantized_decomposed.quantize_per_token( + x, s, zp, qmin, qmax, torch.int8, + ) + dq = torch.ops.quantized_decomposed.dequantize_per_token( + q, s, zp, qmin, qmax, torch.int8, torch.float32 + ) + + # express per token fake quant using `torch.fake_quantize_per_channel_affine` + s_flattened = s.flatten() + zp_flattened = zp.flatten() + fq = torch.fake_quantize_per_channel_affine( + x, s_flattened, zp_flattened, 0, qmin, qmax, + ) + torch.testing.assert_close(dq, fq, rtol=0, atol=0) + + if __name__ == '__main__': raise RuntimeError("This test file is not meant to be run directly, use:\n\n" "\tpython test/test_quantization.py TESTNAME\n\n" diff --git a/test/run_test.py b/test/run_test.py index af3b4d6866730..5b24a00789964 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -37,7 +37,6 @@ TEST_WITH_ASAN, TEST_WITH_CROSSREF, TEST_WITH_ROCM, - TEST_WITH_SLOW, TEST_WITH_SLOW_GRADCHECK, ) @@ -76,9 +75,11 @@ sys.path.remove(str(REPO_ROOT)) TEST_CONFIG = os.getenv("TEST_CONFIG", "") +BUILD_ENVIRONMENT = os.getenv("BUILD_ENVIRONMENT", "") RERUN_DISABLED_TESTS = os.getenv("PYTORCH_TEST_RERUN_DISABLED_TESTS", "0") == "1" DISTRIBUTED_TEST_PREFIX = "distributed" INDUCTOR_TEST_PREFIX = "inductor" +IS_SLOW = "slow" in TEST_CONFIG or "slow" in BUILD_ENVIRONMENT # Note [ROCm parallel CI testing] @@ -494,7 +495,7 @@ def run_test( None if not options.enable_timeout else THRESHOLD * 6 - if TEST_WITH_SLOW + if IS_SLOW else THRESHOLD * 3 if should_retry and isinstance(test_module, ShardedTest) @@ -1180,18 +1181,25 @@ def parse_args(): and ( TEST_WITH_CROSSREF or TEST_WITH_ASAN - or ( - strtobool(os.environ.get("TD_DISTRIBUTED", "False")) - and TEST_CONFIG == "distributed" - and TEST_CUDA - ) + or (TEST_CONFIG == "distributed" and TEST_CUDA) or (IS_WINDOWS and not TEST_CUDA) or TEST_CONFIG == "nogpu_AVX512" or TEST_CONFIG == "nogpu_NO_AVX2" + or ( + "sm86" not in BUILD_ENVIRONMENT + and TEST_CONFIG == "default" + and TEST_CUDA + ) + or (not TEST_CUDA and TEST_CONFIG == "default") ) and get_pr_number() is not None and not strtobool(os.environ.get("NO_TD", "False")) - and not TEST_WITH_SLOW, + and not IS_SLOW + and not TEST_WITH_ROCM + and not IS_MACOS + and "onnx" not in BUILD_ENVIRONMENT + and "debug" not in BUILD_ENVIRONMENT + and "parallelnative" not in BUILD_ENVIRONMENT, ) parser.add_argument( "additional_unittest_args", diff --git a/test/test_cuda.py b/test/test_cuda.py index 3f5cb476cb292..1872faee6a281 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -4703,7 +4703,7 @@ class TestCudaOptims(TestCase): [ 
optim for optim in optim_db - if "foreach" in optim.supported_impls and "cuda" in optim.supports_fused_on + if "foreach" in optim.supported_impls and "fused" in optim.supported_impls ], dtypes=[torch.float32], ) diff --git a/test/test_fake_tensor.py b/test/test_fake_tensor.py index 40075eb24e04c..d4953101d26b9 100644 --- a/test/test_fake_tensor.py +++ b/test/test_fake_tensor.py @@ -1,76 +1,93 @@ # Owner(s): ["module: meta tensors"] -from torch.testing._internal.common_utils import ( - TestCase, TEST_WITH_TORCHDYNAMO, run_tests, skipIfCrossRef, skipIfRocm, skipIfTorchDynamo, parametrize, - instantiate_parametrized_tests, TemporaryFileName) -import torch -import torch._dynamo -from torch._dynamo.testing import make_test_cls_with_patches +import contextlib +import copy +import dataclasses +import inspect import itertools +import pickle +import unittest +import weakref +from unittest.mock import patch + import numpy as np -from torch.testing._internal.jit_utils import RUN_CUDA +import torch +import torch._dynamo +import torch._functorch.config +import torch._prims as prims +import torch.testing._internal.optests as optests +import torch.utils._pytree as pytree + +from torch import distributed as dist +from torch._C._functorch import _add_batch_dim, get_unwrapped, is_batchedtensor +from torch._dynamo.testing import make_test_cls_with_patches, rand_strided from torch._guards import tracing, TracingContext from torch._subclasses.fake_tensor import ( + DynamicOutputShapeException, extract_tensor_metadata, FakeTensor, - FakeTensorMode, FakeTensorConverter, - DynamicOutputShapeException, - UnsupportedOperatorException, + FakeTensorMode, unset_fake_temporarily, + UnsupportedOperatorException, ) +from torch.fx.experimental.proxy_tensor import make_fx from torch.fx.experimental.symbolic_shapes import ( - ShapeEnv, DimDynamic, free_symbols, StatelessSymbolicContext, ShapeEnvSettings, statically_known_true + DimDynamic, + free_symbols, + ShapeEnv, + ShapeEnvSettings, + StatelessSymbolicContext, + statically_known_true, ) -from torch.testing._internal.custom_op_db import custom_op_db -from torch.testing._internal.common_device_type import ops -from torch.testing._internal.common_device_type import instantiate_device_type_tests, OpDTypes -from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FLASH_ATTENTION from torch.fx.passes.fake_tensor_prop import FakeTensorProp -from torch._dynamo.testing import rand_strided -from torch._C._functorch import is_batchedtensor, _add_batch_dim, get_unwrapped from torch.testing import FileCheck -import dataclasses -import inspect -import unittest -import torch._prims as prims -import contextlib -import weakref -import copy -import pickle -import torch._functorch.config -import torch.testing._internal.optests as optests -from unittest.mock import patch - -from torch import distributed as dist +from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FLASH_ATTENTION +from torch.testing._internal.common_device_type import ( + instantiate_device_type_tests, + OpDTypes, + ops, +) +from torch.testing._internal.common_utils import ( + instantiate_parametrized_tests, + parametrize, + run_tests, + skipIfCrossRef, + skipIfRocm, + skipIfTorchDynamo, + TemporaryFileName, + TEST_WITH_TORCHDYNAMO, + TestCase, +) +from torch.testing._internal.custom_op_db import custom_op_db +from torch.testing._internal.jit_utils import RUN_CUDA from torch.utils._mode_utils import no_dispatch from torch.utils._python_dispatch import TorchDispatchMode -import torch.utils._pytree as 
pytree -from torch.fx.experimental.proxy_tensor import make_fx aten = torch.ops.aten torch._dynamo.config.fake_tensor_cache_enabled = True torch._dynamo.config.fake_tensor_cache_crosscheck_enabled = True + def expectedFailurePropagateRealTensors(fn): fn._expected_failure_propagate_real_tensors = True return fn + class FakeTensorTest(TestCase): def checkType(self, t, device_str, size): self.assertTrue(isinstance(t, FakeTensor)) self.assertEqual(t.device.type, device_str) self.assertEqual(list(t.size()), size) - @unittest.skipIf(not RUN_CUDA, "requires cuda") def test_cuda_initialized(self): # doesnt error with FakeTensorMode(): - p = torch.randn(4, 2, requires_grad=True, device='cuda') - x = torch.randn(8, 4, device='cuda') + p = torch.randn(4, 2, requires_grad=True, device="cuda") + x = torch.randn(8, 4, device="cuda") y = torch.mm(x, p).square().sum() y.backward() @@ -86,18 +103,20 @@ def test_basic(self): self.assertTrue(isinstance(z, FakeTensor)) def test_custom_op_fallback(self): - from torch.library import Library, impl + from torch.library import impl, Library try: test_lib = Library("my_test_op", "DEF") # noqa: TOR901 - test_lib.define('foo(Tensor self) -> Tensor') + test_lib.define("foo(Tensor self) -> Tensor") - @impl(test_lib, 'foo', 'CPU') + @impl(test_lib, "foo", "CPU") def foo_impl(self): return self.cos() x = torch.empty(2, 2, device="cpu") - with self.assertRaisesRegex(UnsupportedOperatorException, "my_test_op.foo.default"): + with self.assertRaisesRegex( + UnsupportedOperatorException, "my_test_op.foo.default" + ): with FakeTensorMode(allow_fallback_kernels=True) as mode: x = mode.from_tensor(x) torch.ops.my_test_op.foo(x) @@ -114,6 +133,7 @@ def test_parameter_instantiation(self): @unittest.skipIf(not dist.is_available(), "requires distributed") def test_fsdp_flat_param(self): from torch.distributed.fsdp._flat_param import FlatParameter + with FakeTensorMode() as m: data = torch.randn(2, 2) param = FlatParameter(data, requires_grad=True) @@ -127,11 +147,13 @@ def test_non_parameter_grad(self): fake_t = mode.from_tensor(t) self.assertEqual(fake_t.requires_grad, t.requires_grad) - @unittest.skipIf(TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile") + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile" + ) @unittest.skipIf(not RUN_CUDA, "requires cuda") def test_index_cuda_with_cpu(self): with FakeTensorMode(): - x = torch.rand([2048], device='cuda') + x = torch.rand([2048], device="cuda") out = x[torch.zeros([36], dtype=torch.int64)] self.checkType(out, "cuda", [36]) @@ -148,14 +170,14 @@ def test_shape_take_not_device(self): def test_repr(self): with FakeTensorMode(): x = torch.empty(2, 2, device="cpu") - self.assertEqual(repr(x), 'FakeTensor(..., size=(2, 2))') + self.assertEqual(repr(x), "FakeTensor(..., size=(2, 2))") x = torch.empty(2, 2, device="meta") self.assertEqual(repr(x), "FakeTensor(..., device='meta', size=(2, 2))") @unittest.skipIf(not RUN_CUDA, "requires cuda") def test_zero_dim(self): with FakeTensorMode() as mode: - x = torch.tensor(0.) + x = torch.tensor(0.0) y = torch.rand([4, 4], device="cuda") out = x + y self.assertEqual(out.shape, (4, 4)) @@ -173,7 +195,7 @@ def test_nan_to_num(self): @unittest.skipIf(not RUN_CUDA, "requires cuda") def test_throw(self): - x = torch.tensor(0.) 
# TODO: tensor() errors + x = torch.tensor(0.0) # TODO: tensor() errors with FakeTensorMode() as mode: x_conv = mode.from_tensor(x) y = torch.rand([4, 4], device="cuda") @@ -207,17 +229,25 @@ def test_device_inplace_copy(self): def test_fake_dispatch_keys(self): with FakeTensorMode(): x = torch.rand([4]) - f = FileCheck().check("CPU").check("ADInplaceOrView").check("AutogradCPU").check("AutocastCPU") + f = ( + FileCheck() + .check("CPU") + .check("ADInplaceOrView") + .check("AutogradCPU") + .check("AutocastCPU") + ) f.run(torch._C._dispatch_key_set(x)) with torch.inference_mode(): x = torch.rand([4]) y = x + x - FileCheck().check("CPU").check("AutocastCPU").run(torch._C._dispatch_key_set(y)) - FileCheck().check_not("ADInplaceOrView").check_not("Autograd").run(torch._C._dispatch_key_set(y)) + FileCheck().check("CPU").check("AutocastCPU").run( + torch._C._dispatch_key_set(y) + ) + FileCheck().check_not("ADInplaceOrView").check_not("Autograd").run( + torch._C._dispatch_key_set(y) + ) - # TODO: functorch support for propagate real tensors - @expectedFailurePropagateRealTensors def test_batch_tensor(self): x = torch.rand((3, 4, 5)) b = _add_batch_dim(x, 0, 0) @@ -289,7 +319,9 @@ def test_fake_mode_error(self): with FakeTensorMode(): y = x[0] - @unittest.skipIf(TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile") + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile" + ) def test_fake_grad_copy(self): x = torch.rand([4, 4], requires_grad=True) x.grad = torch.rand([4, 4]) @@ -306,7 +338,7 @@ def test_index_put_error(self): for context in [contextlib.nullcontext, lambda: mode]: with context(): y = torch.randn(2, 2, 3) - x = torch.randn(2, 2, 3).to('cuda') + x = torch.randn(2, 2, 3).to("cuda") with self.assertRaises(RuntimeError): x[[1, 1]] = y @@ -314,10 +346,12 @@ def test_index_put_error(self): torch.ops.aten.index_put(x, torch.tensor([1, 1], device="cuda"), y) # no error - torch.ops.aten.index_put(x, torch.tensor([1, 1], device="cuda"), torch.tensor(5.)) - torch.ops.aten.index_put_(x, torch.tensor([1, 1], device="cuda"), torch.tensor(5.)) - - + torch.ops.aten.index_put( + x, torch.tensor([1, 1], device="cuda"), torch.tensor(5.0) + ) + torch.ops.aten.index_put_( + x, torch.tensor([1, 1], device="cuda"), torch.tensor(5.0) + ) @unittest.skipIf(not RUN_CUDA, "requires cuda") def test_like_constructor(self): @@ -338,7 +372,9 @@ def test_binary_op_type_promotion(self): self.assertEqual(out.dtype, torch.float) self.assertEqual(out.device.type, "cpu") - @unittest.skipIf(TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile") + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile" + ) def test_from_numpy(self): with FakeTensorMode(): x = torch.tensor(np.zeros([4, 4])) @@ -366,9 +402,15 @@ def test_upsample_bilinear_small_channels(self): mode = FakeTensorMode() for i, context in enumerate([contextlib.nullcontext, lambda: mode]): with context(): - arg0_1 = torch.empty_strided((3, 427, 640), (1, 1920, 3), dtype=torch.float32, device='cuda') + arg0_1 = torch.empty_strided( + (3, 427, 640), (1, 1920, 3), dtype=torch.float32, device="cuda" + ) unsqueeze = torch.ops.aten.unsqueeze.default(arg0_1, 0) - out.append(torch.ops.aten.upsample_bilinear2d.default(unsqueeze, [800, 1199], False)) + out.append( + torch.ops.aten.upsample_bilinear2d.default( + unsqueeze, [800, 1199], False + ) + ) self.assertTrue(out[1].is_contiguous()) self.checkMetaProps(out[0], out[1]) @@ 
-409,8 +451,9 @@ def test_out_multi_device(self): with self.assertRaisesRegex(Exception, "found.+two.+devices"): x.add_(y) - - @unittest.skipIf(TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile") + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile" + ) @unittest.skipIf(not RUN_CUDA, "requires cuda") def test_normalize_device(self): with FakeTensorMode(): @@ -427,10 +470,15 @@ def test_recursive_invocation(self): y = x + x self.assertTrue(mode.in_kernel_invocation) - @unittest.skipIf(TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile") + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile" + ) @skipIfRocm - @parametrize("allow_fallback_kernels", [False, True], - lambda a: 'with_fallback' if a else 'without_fallback') + @parametrize( + "allow_fallback_kernels", + [False, True], + lambda a: "with_fallback" if a else "without_fallback", + ) @unittest.skipIf(not RUN_CUDA, "requires cuda") def test_cudnn_rnn(self, allow_fallback_kernels): def fn( @@ -526,7 +574,7 @@ def fn( for ten in out: if i == 1: self.assertTrue(isinstance(ten, FakeTensor)) - self.assertEqual(ten.device.type, 'cuda') + self.assertEqual(ten.device.type, "cuda") @unittest.skipIf(not RUN_CUDA, "requires cuda") def test_cuda_lstm(self): @@ -544,13 +592,20 @@ def test_cuda_lstm(self): D = 2 if bidir else 1 H_out = proj_size if proj_size > 0 else hidden_size - lstm = torch.nn.LSTM(input_size=H_in, hidden_size=hidden_size, - num_layers=num_layers, proj_size=proj_size, batch_first=False, - bias=True, bidirectional=bidir, device='cuda') + lstm = torch.nn.LSTM( + input_size=H_in, + hidden_size=hidden_size, + num_layers=num_layers, + proj_size=proj_size, + batch_first=False, + bias=True, + bidirectional=bidir, + device="cuda", + ) - h_0 = torch.randn((num_layers * D, N, H_out), device='cuda') - c_0 = torch.randn((num_layers * D, N, hidden_size), device='cuda') - inp = torch.randn((L, N, H_in), device='cuda') + h_0 = torch.randn((num_layers * D, N, H_out), device="cuda") + c_0 = torch.randn((num_layers * D, N, hidden_size), device="cuda") + inp = torch.randn((L, N, H_in), device="cuda") (output, (h_n, c_n)) = lstm(inp, (h_0, c_0)) output.sum().backward() @@ -578,9 +633,8 @@ def test_same_shape_env_preserved(self): t1 = mode1.from_tensor( torch.randn(10), symbolic_context=StatelessSymbolicContext( - dynamic_sizes=[DimDynamic.DYNAMIC], - constraint_sizes=[None] - ) + dynamic_sizes=[DimDynamic.DYNAMIC], constraint_sizes=[None] + ), ) mode2 = FakeTensorMode(shape_env=shape_env) t2 = mode2.from_tensor(t1) @@ -630,11 +684,16 @@ def test_deepcopy(self): mod_copied = copy.deepcopy(mod) def check_copy(mod, mod_copied): - for name, param in itertools.chain(mod.named_parameters(), mod.named_buffers()): + for name, param in itertools.chain( + mod.named_parameters(), mod.named_buffers() + ): param_copied = getattr(mod_copied, name) self.checkMetaProps(param, param_copied) self.assertTrue(isinstance(param_copied, FakeTensor)) - self.assertEqual(isinstance(param, torch.nn.Parameter), isinstance(param_copied, torch.nn.Parameter)) + self.assertEqual( + isinstance(param, torch.nn.Parameter), + isinstance(param_copied, torch.nn.Parameter), + ) self.assertEqual(param.requires_grad, param_copied.requires_grad) check_copy(mod, mod_copied) @@ -653,18 +712,22 @@ def __init__(self): self.assertIs(mod_copied.a, mod_copied.b) self.assertEqual(mod_copied.b.storage()._cdata, mod_copied.a.storage()._cdata) - 
@unittest.skipIf(TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile") + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile" + ) @unittest.skipIf(not RUN_CUDA, "requires cuda") def test_new(self): with FakeTensorMode(): a = torch.rand([16, 1]) self.checkType(a.new(10, 10), "cpu", [10, 10]) self.checkType(a.new([1, 2, 3, 4]), "cpu", [4]) - b = torch.rand([4, 4], device='cuda') - self.checkType(b.new(device='cuda'), "cuda", [0]) + b = torch.rand([4, 4], device="cuda") + self.checkType(b.new(device="cuda"), "cuda", [0]) self.checkType(a.new(torch.rand([1])), "cpu", [1]) - @unittest.skipIf(TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile") + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile" + ) def test_scalar_inputs(self): with FakeTensorMode(): self.checkType(torch.div(3, 2), "cpu", []) @@ -672,7 +735,9 @@ def test_scalar_inputs(self): self.assertEqual(ten.dtype, torch.float) self.checkType(ten, "cpu", [2]) - @unittest.skipIf(TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile") + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile" + ) def test_allow_meta(self): def run_meta(): with FakeTensorMode(): @@ -688,7 +753,7 @@ def test_embedding_bag_meta(self): def f(): # This behavior was originally unintentional but we see people # relying on it - embedding = torch.nn.EmbeddingBag(10, 3, mode='sum', device='meta') + embedding = torch.nn.EmbeddingBag(10, 3, mode="sum", device="meta") input = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9], dtype=torch.long) offsets = torch.tensor([0, 4], dtype=torch.long) return embedding(input, offsets) @@ -701,7 +766,9 @@ def f(): self.assertEqual(r.size(), f.size()) self.assertEqual(r.device, f.device) - @unittest.skipIf(TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile") + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile" + ) def test_mixed_real_and_fake_inputs(self): class _TestPattern(torch.nn.Module): def __init__(self): @@ -730,7 +797,9 @@ def forward(self, input): out = mod(torch.randn(1, 1, 3, 3)) self.checkType(out, "cpu", (1, 1, 3, 3)) - @unittest.skipIf(TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile") + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile" + ) @unittest.skipIf(not RUN_CUDA, "requires cuda") def test_aten_copy_multi_device(self): with FakeTensorMode(): @@ -744,7 +813,9 @@ def test_aten_copy_multi_device(self): self.checkType(copy2, "cuda", (4,)) self.checkType(out, "cpu", (4,)) - @unittest.skipIf(TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile") + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile" + ) @unittest.skipIf(not RUN_CUDA, "requires cuda") def test_aten_index_multi_device(self): with FakeTensorMode(): @@ -768,7 +839,9 @@ def test_aten_index_multi_device(self): self.checkType(r3, "cpu", (4, 4)) self.checkType(r4, "cuda", (4, 4)) - @unittest.skipIf(TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile") + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile" + ) @unittest.skipIf(not RUN_CUDA, "requires cuda") def test_aten_slice_scatter_multi_device(self): with FakeTensorMode(): @@ -790,7 +863,10 
@@ def test__adaptive_avg_pool2d_backward(self): grad_out = torch.rand(2, 3, 4, 4) inp = torch.rand(2, 3, 4, 4).to(memory_format=torch.channels_last) grad_in = torch.ops.aten._adaptive_avg_pool2d_backward(grad_out, inp) - self.assertTrue(torch._prims_common.suggest_memory_format(grad_in) == torch.channels_last) + self.assertTrue( + torch._prims_common.suggest_memory_format(grad_in) + == torch.channels_last + ) # Propagate real tensors doesn't work when original input arguments are # fake @@ -805,6 +881,23 @@ def forward(self, input): ep = torch.export.export(MyNumpyModel(), args=(torch.randn(1000),)) self.assertTrue(isinstance(ep, torch.export.ExportedProgram)) + def test_unsqueeze_copy(self): + shape_env = ShapeEnv() + t1 = torch.ones(2, 2, 768) + with FakeTensorMode(shape_env=shape_env) as fake_mode: + t = fake_mode.from_tensor( + t1, + symbolic_context=StatelessSymbolicContext( + dynamic_sizes=[ + DimDynamic.DYNAMIC, + DimDynamic.STATIC, + DimDynamic.STATIC, + ], + ), + ) + + self.assertEqual(t.shape[0], torch.ops.aten.unsqueeze_copy(t, 1).shape[0]) + def test_alias_call(self): fwAD = torch.autograd.forward_ad @@ -853,20 +946,20 @@ def assertNotConst(self, *args): def test_simple(self): with FakeTensorMode(): - x = torch.tensor(4.) - self.assertEqual(x.item(), 4.) + x = torch.tensor(4.0) + self.assertEqual(x.item(), 4.0) def test_inplace_add(self): with FakeTensorMode(): - x = torch.tensor(4.) + x = torch.tensor(4.0) y = x.add_(1) - self.assertEqual(x.item(), 5.) - self.assertEqual(y.item(), 5.) + self.assertEqual(x.item(), 5.0) + self.assertEqual(y.item(), 5.0) self.assertConst(x, y) def test_shared_storages(self): with FakeTensorMode(): - x = torch.tensor([4.]) + x = torch.tensor([4.0]) y = x[:] self.assertEqual(x.storage()._cdata, y.storage()._cdata) @@ -874,7 +967,7 @@ def test_shared_storages(self): def test_constant_invalidation(self): with FakeTensorMode(): - x = torch.tensor([1.]) + x = torch.tensor([1.0]) self.assertConst(x) y = torch.rand([1]) x.add_(y) @@ -889,13 +982,14 @@ def test_inplace_view_invalidation(self): self.assertNotConst(x) def test_fake_tensor_in_intlist_repro(self): - def fn(tensors): max_size = torch.tensor([800, 1216], dtype=torch.int64) batch_shape = [len(tensors)] + list(tensors[0].shape[:-2]) + list(max_size) return tensors[0].new_full(batch_shape, 0.0) - with self.assertRaises(torch._subclasses.fake_tensor.DataDependentOutputException): + with self.assertRaises( + torch._subclasses.fake_tensor.DataDependentOutputException + ): with torch._subclasses.fake_tensor.FakeTensorMode(): a = torch.randn(3, 800, 1199) b = torch.randn(3, 800, 800) @@ -913,7 +1007,7 @@ def test_fake_tensor_batch_norm_cpu(self): def test_shared_storage_invalidation(self): with FakeTensorMode(): - x = torch.tensor([1.]) + x = torch.tensor([1.0]) y = x[:] self.assertConst(x, y) y.add_(torch.rand([1])) @@ -929,7 +1023,7 @@ def test_aliased_const_write(self): def test_constant_propagate_through_functions(self): with FakeTensorMode(): - y = torch.div(4, 4, rounding_mode='trunc') + y = torch.div(4, 4, rounding_mode="trunc") self.assertConst(y) @@ -954,7 +1048,9 @@ def test_fake(self, device, dtype, op): make_propagate_real_tensors_cls(FakeTensorOpInfoTest) instantiate_device_type_tests(FakeTensorOpInfoTest, globals(), only_for=("cpu", "cuda")) -instantiate_device_type_tests(PropagateRealTensorsFakeTensorOpInfoTest, globals(), only_for=("cpu",)) # noqa: F821 +instantiate_device_type_tests( + PropagateRealTensorsFakeTensorOpInfoTest, globals(), only_for=("cpu",) # noqa: F821 +) class 
FakeTensorConverterTest(TestCase): @@ -967,7 +1063,10 @@ def test_memoized_conversion_from_meta(self): x = torch.rand(2, 2).to(device="meta") mode = FakeTensorMode() converter = mode.fake_tensor_converter - self.assertTrue(converter.from_meta_and_device(mode, x, "cpu") is converter.from_meta_and_device(mode, x, "cpu")) + self.assertTrue( + converter.from_meta_and_device(mode, x, "cpu") + is converter.from_meta_and_device(mode, x, "cpu") + ) def test_separate_tensor_storages_view(self): x = torch.rand(2, 2, 2) @@ -998,7 +1097,6 @@ def test_separate_tensor_storages_non_view(self): self.assertEqual(len(converter.tensor_memo), 0) self.assertEqual(len(converter.meta_converter.storage_memo), 0) - @skipIfTorchDynamo("https://github.com/pytorch/torchdynamo/issues/1991") def test_dead_weak_ref(self): x = torch.rand(2, 2, 2) @@ -1101,7 +1199,8 @@ def test_non_kwarg_only_device(self): ) if has_non_kwarg_device: self.assertTrue( - self.get_aten_op(schema) in torch._subclasses.fake_tensor._device_not_kwarg_ops + self.get_aten_op(schema) + in torch._subclasses.fake_tensor._device_not_kwarg_ops ) def test_tensor_constructors_all_have_kwarg_device(self): @@ -1140,24 +1239,35 @@ def test_like_ops(self): for schema in self.get_all_aten_schemas(): if "_like" == schema.name[-5:]: op = self.get_aten_op(schema) - self.assertIn(op, torch._subclasses.fake_tensor._like_tensor_constructors) + self.assertIn( + op, torch._subclasses.fake_tensor._like_tensor_constructors + ) def test_str_storage(self): x = torch.zeros(3) with FakeTensorMode() as m: y = m.from_tensor(x) - self.assertExpectedInline(str(x.storage()), '''\ + self.assertExpectedInline( + str(x.storage()), + """\ 0.0 0.0 0.0 -[torch.storage.TypedStorage(dtype=torch.float32, device=cpu) of size 3]''') - self.assertExpectedInline(str(y.storage()), '''\ +[torch.storage.TypedStorage(dtype=torch.float32, device=cpu) of size 3]""", + ) + self.assertExpectedInline( + str(y.storage()), + """\ ... -[torch.storage.TypedStorage(dtype=torch.float32, device=meta) of size 3]''') +[torch.storage.TypedStorage(dtype=torch.float32, device=meta) of size 3]""", + ) - self.assertExpectedInline(str(y.storage()), '''\ + self.assertExpectedInline( + str(y.storage()), + """\ ... 
-[torch.storage.TypedStorage(dtype=torch.float32, device=meta) of size 3]''') +[torch.storage.TypedStorage(dtype=torch.float32, device=meta) of size 3]""", + ) # at::_embedding_bag has no op info, # and returns extra tensors that at::embedding bag throws away @@ -1172,7 +1282,9 @@ def test_embedding_bag_private(self): ref_out = torch.ops.aten._embedding_bag(*args) with FakeTensorMode() as m: - meta_args = [m.from_tensor(a) if isinstance(a, torch.Tensor) else a for a in args] + meta_args = [ + m.from_tensor(a) if isinstance(a, torch.Tensor) else a for a in args + ] meta_out = torch.ops.aten._embedding_bag(*meta_args) self.assertEqual(len(ref_out), len(meta_out)) @@ -1188,20 +1300,29 @@ def test_cross_entropy_loss(self): args = (inp, target, w) ref = fn(*args) with FakeTensorMode() as m: - meta_args = [m.from_tensor(a) if isinstance(a, torch.Tensor) else a for a in args] - meta_out = torch.nn.functional.cross_entropy(*meta_args, label_smoothing=0.5) + meta_args = [ + m.from_tensor(a) if isinstance(a, torch.Tensor) else a for a in args + ] + meta_out = torch.nn.functional.cross_entropy( + *meta_args, label_smoothing=0.5 + ) self.assertEqual(ref.size(), meta_out.size()) @skipIfRocm - @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Does not support SDPA or pre-SM80 hardware") + @unittest.skipIf( + not PLATFORM_SUPPORTS_FLASH_ATTENTION, + "Does not support SDPA or pre-SM80 hardware", + ) def test_flash_attention(self): class Repro(torch.nn.Module): def __init__(self): super().__init__() def forward(self, arg1, arg2, arg3): - torch.ops.aten._scaled_dot_product_flash_attention(arg1, arg2, arg3, scale=0.17677669529663687) + torch.ops.aten._scaled_dot_product_flash_attention( + arg1, arg2, arg3, scale=0.17677669529663687 + ) args_new = [ [ @@ -1213,11 +1334,13 @@ def forward(self, arg1, arg2, arg3): ((4, 2, 16, 32), (1024, 512, 32, 1), torch.float16, "cuda"), ((4, 2, 16, 32), (1024, 512, 32, 1), torch.float16, "cuda"), ((4, 2, 16, 32), (1024, 512, 32, 1), torch.float16, "cuda"), - ] + ], ] for args_list in args_new: - args = [rand_strided(bsz, num_heads, seq_len, head_dim) for - (bsz, num_heads, seq_len, head_dim) in args_list] + args = [ + rand_strided(bsz, num_heads, seq_len, head_dim) + for (bsz, num_heads, seq_len, head_dim) in args_list + ] try: with torch._subclasses.CrossRefFakeMode(): Repro()(*args) @@ -1225,7 +1348,10 @@ def forward(self, arg1, arg2, arg3): # We expect the cross ref to succed for the first output to fail # for the rng state, see Note [Seed and Offset] self.assertTrue("output[0]" not in str(e)) - self.assertTrue("found mismatched tensor metadata for output[6]: Devices cpu and cuda:0 are not equal!" in str(e)) + self.assertTrue( + "found mismatched tensor metadata for output[6]: Devices cpu and cuda:0 are not equal!" + in str(e) + ) # IMPORTANT!!! 
Always run even if CUDA is not available def test_fake_cuda_no_init(self): @@ -1234,12 +1360,12 @@ def test_fake_cuda_no_init(self): if torch._functorch.config.fake_tensor_propagate_real_tensors: return with FakeTensorMode(): - torch.empty(10, device='cuda') - torch.ones(10, device='cuda') - torch.zeros(10, device='cuda') - torch.rand(10, device='cuda') - torch.tensor(3.14, device='cuda') - torch.tensor([[3.14, 2], [1, 2]], device='cuda') + torch.empty(10, device="cuda") + torch.ones(10, device="cuda") + torch.zeros(10, device="cuda") + torch.rand(10, device="cuda") + torch.tensor(3.14, device="cuda") + torch.tensor([[3.14, 2], [1, 2]], device="cuda") @skipIfRocm @unittest.skipIf(not RUN_CUDA, "requires cuda") @@ -1351,7 +1477,6 @@ def to_fake_tensor(x): failed = True self.assertTrue(failed) - @expectedFailurePropagateRealTensors # Propagate real tensors doesn't work with fake-on-fake def test_fake_tensor_prop_on_nn_module_with_optional_args(self): class OptionalArgumentInBetween(torch.nn.Module): @@ -1371,14 +1496,19 @@ def forward(self, value, another_value=None, another_optional_value=None): value = value + another_value + another_optional_value return value * value - fake_mode = FakeTensorMode(allow_non_fake_inputs=True, allow_fallback_kernels=False) + fake_mode = FakeTensorMode( + allow_non_fake_inputs=True, allow_fallback_kernels=False + ) with fake_mode: model = OptionalArgumentInBetween() value = torch.randn(5, 4) another_optional_value = torch.randn(5, 4) - graph_model = torch.fx.symbolic_trace(model, (value, None, another_optional_value)) - FakeTensorProp(graph_model, fake_mode).propagate(value, None, another_optional_value) - + graph_model = torch.fx.symbolic_trace( + model, (value, None, another_optional_value) + ) + FakeTensorProp(graph_model, fake_mode).propagate( + value, None, another_optional_value + ) @expectedFailurePropagateRealTensors # TODO: not sure about this one, kinda strange def test_unbacked_shape_realloc(self): @@ -1390,12 +1520,14 @@ def f(x): with fake_mode: value = torch.randn(5) gm = make_fx(f)(value) - nonzero_nodes = [n for n in gm.graph.nodes if n.target is torch.ops.aten.nonzero.default] + nonzero_nodes = [ + n for n in gm.graph.nodes if n.target is torch.ops.aten.nonzero.default + ] self.assertEqual(len(nonzero_nodes), 1) - self.assertIsInstance(nonzero_nodes[0].meta['val'].shape[0], torch.SymInt) - u0 = nonzero_nodes[0].meta['val'].shape[0] + self.assertIsInstance(nonzero_nodes[0].meta["val"].shape[0], torch.SymInt) + u0 = nonzero_nodes[0].meta["val"].shape[0] FakeTensorProp(gm, fake_mode).propagate(value) - u1 = nonzero_nodes[0].meta['val'].shape[0] + u1 = nonzero_nodes[0].meta["val"].shape[0] # Test that this test is actually doing something in that the # FakeTensorProp actually triggered a reallocation. 
If this assert is # failing, it could be because we started memoizing the nnz count for @@ -1407,9 +1539,7 @@ def f(x): self.assertIsNot(u0, u1) self.assertTrue(statically_known_true(u0 == u1)) - def test_torch_load_with_fake_mode(self): - class TheModelClass(torch.nn.Module): def __init__(self): super().__init__() @@ -1462,7 +1592,8 @@ def test_shape_env_settings(self): """ init_sig = inspect.signature(ShapeEnv._init) args = [ - name for name, param in init_sig.parameters.items() + name + for name, param in init_sig.parameters.items() if type(param.default) is bool ] @@ -1770,5 +1901,6 @@ def test_inference_mode(self): extract_tensor_metadata(res4), ) + if __name__ == "__main__": run_tests() diff --git a/test/test_flop_counter.py b/test/test_flop_counter.py index 43f5cb9dadf4f..4f9c7020c0e60 100644 --- a/test/test_flop_counter.py +++ b/test/test_flop_counter.py @@ -1,15 +1,24 @@ # Owner(s): ["module: unknown"] +import functools +import unittest + import torch -from torch.testing._internal.common_utils import TestCase, run_tests, TEST_WITH_TORCHDYNAMO -from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FLASH_ATTENTION, PLATFORM_SUPPORTS_MEM_EFF_ATTENTION -import torch.utils.flop_counter import torch.nn.functional as F -import unittest -import functools +import torch.utils.flop_counter +from torch.testing._internal.common_cuda import ( + PLATFORM_SUPPORTS_FLASH_ATTENTION, + PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, +) +from torch.testing._internal.common_utils import ( + run_tests, + TEST_WITH_TORCHDYNAMO, + TestCase, +) try: from torchvision import models as torchvision_models + HAS_TORCHVISION = True except ImportError: HAS_TORCHVISION = False @@ -17,16 +26,22 @@ HAS_CUDA = torch.cuda.is_available() + def FlopCounterMode(*args, **kwargs): return torch.utils.flop_counter.FlopCounterMode(*args, **kwargs, display=False) + def get_total_flops(mode): return str(sum(v for _, v in mode.flop_counts["Global"].items())) + def T(*shape, requires_grad=False): return torch.randn(*shape, requires_grad=requires_grad) -@unittest.skipIf(TEST_WITH_TORCHDYNAMO, "torchdynamo doesn't work with __torch_dispatch__ right now") + +@unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "torchdynamo doesn't work with __torch_dispatch__ right now" +) class TestFlopCounter(TestCase): def test_flop_counter_variety(self): mod = torch.nn.Linear(9, 10) @@ -109,6 +124,7 @@ def test_backward_reset(self): def test_torchscript(self): def foo(x): return torch.mm(x, x) + with FlopCounterMode() as mode: foo(T(5, 5)) unscripted_flops = get_total_flops(mode) @@ -125,7 +141,9 @@ def forward(ctx, input: torch.Tensor) -> torch.Tensor: @staticmethod def backward(ctx, grad_output: torch.Tensor) -> torch.Tensor: - return torch.mm(grad_output, grad_output) + torch.mm(grad_output, grad_output) + return torch.mm(grad_output, grad_output) + torch.mm( + grad_output, grad_output + ) a = T(5, 5, requires_grad=True) with FlopCounterMode() as mode: @@ -160,11 +178,13 @@ def backward(ctx, grad_out): return grad_inp, grad_weight, None else: grad_inp = F.conv1d(grad_out, weight) - grad_weight = F.conv1d(grad_out.transpose(1, 0), inp.transpose(1, 0)) + grad_weight = F.conv1d( + grad_out.transpose(1, 0), inp.transpose(1, 0) + ) return grad_inp, grad_weight.transpose(1, 0), None - from torch.func import grad + x = torch.randn(2, 3, 16, dtype=torch.float64) weight = torch.randn(3, 4, 4, dtype=torch.float64) @@ -182,13 +202,16 @@ def only_convs(x, weight, transposed): self.assertEqual(boring_grads, fun_grads) - def test_convs(self): def 
assert_equivalence(f, expected_forward=None): with FlopCounterMode() as mode: f() - conv_forward_flops = mode.get_flop_counts()['Global'][torch.ops.aten.convolution] - conv_backward_flops = mode.get_flop_counts()['Global'][torch.ops.aten.convolution_backward] + conv_forward_flops = mode.get_flop_counts()["Global"][ + torch.ops.aten.convolution + ] + conv_backward_flops = mode.get_flop_counts()["Global"][ + torch.ops.aten.convolution_backward + ] self.assertEqual(conv_forward_flops * 2, conv_backward_flops) if expected_forward is not None: @@ -213,8 +236,12 @@ def assert_equivalence(f, expected_forward=None): x = torch.rand(1, in_channels, 4, 4, requires_grad=True) weight = torch.randn(out_channels, in_channels, 2, 2, requires_grad=True) assert_equivalence(lambda: F.conv2d(x, weight).sum().backward()) - transposed_weight = torch.randn(in_channels, out_channels, 2, 2, requires_grad=True) - assert_equivalence(lambda: F.conv_transpose2d(x, transposed_weight).sum().backward()) + transposed_weight = torch.randn( + in_channels, out_channels, 2, 2, requires_grad=True + ) + assert_equivalence( + lambda: F.conv_transpose2d(x, transposed_weight).sum().backward() + ) @skipIfNoTorchVision def test_module(self): @@ -224,12 +251,15 @@ def test_module(self): resnet18(a).sum().backward() self.assertExpectedInline(get_total_flops(mode), """10884440064""") - layer1_conv_flops = mode.flop_counts['ResNet.layer1'][torch.ops.aten.convolution] - layer1_conv_back_flops = mode.flop_counts['ResNet.layer1'][torch.ops.aten.convolution_backward] + layer1_conv_flops = mode.flop_counts["ResNet.layer1"][ + torch.ops.aten.convolution + ] + layer1_conv_back_flops = mode.flop_counts["ResNet.layer1"][ + torch.ops.aten.convolution_backward + ] self.assertExpectedInline(str(layer1_conv_flops), """924844032""") self.assertExpectedInline(str(layer1_conv_back_flops), """1849688064""") - def test_conv_transpose_loop(self): x = torch.rand(1, 4, 30, 2) model = torch.nn.ConvTranspose2d(4, 8, (2, 2), stride=2) @@ -241,7 +271,9 @@ def test_conv_transpose_loop(self): self.assertExpectedInline(str(mode.get_total_flops()), """1536000""") def test_custom(self): - mode = FlopCounterMode(custom_mapping={torch.ops.aten.add: lambda *args, out_shape: 5}) + mode = FlopCounterMode( + custom_mapping={torch.ops.aten.add: lambda *args, out_shape: 5} + ) with mode: a = T(4, 5) a + a @@ -250,6 +282,7 @@ def test_custom(self): def count(*args, out_val): return out_val.numel() + count._get_raw = True mode = FlopCounterMode(custom_mapping={torch.ops.aten.add: count}) @@ -264,8 +297,11 @@ def test_noop(self): T(4, 5).cos() @unittest.skipIf(not HAS_CUDA, "CUDA not available") - @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION or not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, - "Does not support all SDPA backends (pre-SM80 hardware on CUDA)") + @unittest.skipIf( + not PLATFORM_SUPPORTS_FLASH_ATTENTION + or not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, + "Does not support all SDPA backends (pre-SM80 hardware on CUDA)", + ) def test_sdpa(self): batch_size = 4 n_heads = 8 @@ -277,73 +313,154 @@ def test_sdpa(self): torch.manual_seed(0) - def get_flops(batch_size, n_heads, seq_len_q, seq_len_k, head_dim, head_dim_v, dtype, backend, with_backward=False): - query = torch.randn(batch_size, n_heads, seq_len_q, head_dim, device='cuda', dtype=dtype, requires_grad=True) - key = torch.randn(batch_size, n_heads, seq_len_k, head_dim, device='cuda', dtype=dtype, requires_grad=True) - value = torch.randn(batch_size, n_heads, seq_len_k, head_dim_v, device='cuda', dtype=dtype, 
requires_grad=True) + def get_flops( + batch_size, + n_heads, + seq_len_q, + seq_len_k, + head_dim, + head_dim_v, + dtype, + backend, + with_backward=False, + ): + query = torch.randn( + batch_size, + n_heads, + seq_len_q, + head_dim, + device="cuda", + dtype=dtype, + requires_grad=True, + ) + key = torch.randn( + batch_size, + n_heads, + seq_len_k, + head_dim, + device="cuda", + dtype=dtype, + requires_grad=True, + ) + value = torch.randn( + batch_size, + n_heads, + seq_len_k, + head_dim_v, + device="cuda", + dtype=dtype, + requires_grad=True, + ) if backend == "math": - backend = torch.backends.cuda.sdp_kernel(enable_flash=False, enable_math=True, enable_mem_efficient=False) + backend = torch.backends.cuda.sdp_kernel( + enable_flash=False, enable_math=True, enable_mem_efficient=False + ) elif backend == "flash": - backend = torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False) + backend = torch.backends.cuda.sdp_kernel( + enable_flash=True, enable_math=False, enable_mem_efficient=False + ) elif backend == "mem_efficient": - backend = torch.backends.cuda.sdp_kernel(enable_flash=False, enable_math=False, enable_mem_efficient=True) + backend = torch.backends.cuda.sdp_kernel( + enable_flash=False, enable_math=False, enable_mem_efficient=True + ) mode = FlopCounterMode() with backend, mode: - out = F.scaled_dot_product_attention(query, key, value, dropout_p=0, is_causal=True) + out = F.scaled_dot_product_attention( + query, key, value, dropout_p=0, is_causal=True + ) if with_backward: out.sum().backward() return int(get_total_flops(mode)) # Sets seq_len_q == seq_len_k and dim_q == dim_v - run_uniform_flops = functools.partial(get_flops, batch_size, n_heads, seq_len_q, seq_len_q, head_dim, head_dim, dtype) + run_uniform_flops = functools.partial( + get_flops, + batch_size, + n_heads, + seq_len_q, + seq_len_q, + head_dim, + head_dim, + dtype, + ) - flops = [run_uniform_flops(backend, with_backward=False) for backend in ["math", "flash", "mem_efficient"]] + flops = [ + run_uniform_flops(backend, with_backward=False) + for backend in ["math", "flash", "mem_efficient"] + ] flops_fw_math, flops_fw_flash, flops_fw_efficient = flops self.assertEqual(flops_fw_math, flops_fw_flash) self.assertEqual(flops_fw_math, flops_fw_efficient) self.assertExpectedInline(str(flops_fw_math), """134217728""") - flops = [run_uniform_flops(backend, with_backward=True) for backend in ["math", "flash", "mem_efficient"]] + flops = [ + run_uniform_flops(backend, with_backward=True) + for backend in ["math", "flash", "mem_efficient"] + ] flops_fw_bw_math, flops_fw_bw_flash, flops_fw_bw_efficient = flops self.assertEqual(flops_fw_math * 3, flops_fw_bw_math) self.assertEqual(flops_fw_math * 7 // 2, flops_fw_bw_flash) self.assertEqual(flops_fw_bw_flash, flops_fw_bw_efficient) - - run_nonuniform_flops = functools.partial(get_flops, batch_size, n_heads, seq_len_q, seq_len_k, head_dim, head_dim_v, dtype) + run_nonuniform_flops = functools.partial( + get_flops, + batch_size, + n_heads, + seq_len_q, + seq_len_k, + head_dim, + head_dim_v, + dtype, + ) # Flash does not support non-uniform attention, i.e. 
seq_len_q != seq_len_k or dim_q != dim_v" non_uniform_backends = ["math", "mem_efficient"] - flops = [run_nonuniform_flops(backend, with_backward=False) for backend in non_uniform_backends] + flops = [ + run_nonuniform_flops(backend, with_backward=False) + for backend in non_uniform_backends + ] flops_fw_math, flops_fw_efficient = flops self.assertEqual(flops_fw_math, flops_fw_efficient) self.assertExpectedInline(str(flops_fw_math), """268435456""") - flops = [run_nonuniform_flops(backend, with_backward=True) for backend in non_uniform_backends] + flops = [ + run_nonuniform_flops(backend, with_backward=True) + for backend in non_uniform_backends + ] flops_fw_bw_math, flops_fw_bw_efficient = flops self.assertExpectedInline(str(flops_fw_bw_math), """805306368""") self.assertExpectedInline(str(flops_fw_bw_efficient), """939524096""") @unittest.skipIf(not HAS_CUDA, "CUDA not available") - @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION or not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, - "Does not support all SDPA backends (pre-SM80 hardware on CUDA)") + @unittest.skipIf( + not PLATFORM_SUPPORTS_FLASH_ATTENTION + or not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, + "Does not support all SDPA backends (pre-SM80 hardware on CUDA)", + ) def test_sdpa_nested_tensor(self): - def get_flops(q, k, v, backend, with_backward=False): mode = FlopCounterMode() if backend == "math": - backend = torch.backends.cuda.sdp_kernel(enable_flash=False, enable_math=True, enable_mem_efficient=False) + backend = torch.backends.cuda.sdp_kernel( + enable_flash=False, enable_math=True, enable_mem_efficient=False + ) elif backend == "flash": - backend = torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False) + backend = torch.backends.cuda.sdp_kernel( + enable_flash=True, enable_math=False, enable_mem_efficient=False + ) elif backend == "mem_efficient": - backend = torch.backends.cuda.sdp_kernel(enable_flash=False, enable_math=False, enable_mem_efficient=True) + backend = torch.backends.cuda.sdp_kernel( + enable_flash=False, enable_math=False, enable_mem_efficient=True + ) with backend, mode: - out = F.scaled_dot_product_attention(q, k, v, dropout_p=0, is_causal=True) + out = F.scaled_dot_product_attention( + q, k, v, dropout_p=0, is_causal=True + ) if with_backward: if out.is_nested: out.values().sum().backward() @@ -361,25 +478,47 @@ def get_nested_inputs( head_dim_v, dtype, ): - q_lengths = torch.tensor([ - max_seq_len_q // 4, - max_seq_len_q // 4 * 2, - max_seq_len_q // 4 * 3, - max_seq_len_q // 4 * 4 - ]) - k_lengths = torch.tensor([ - max_seq_len_k // 4, - max_seq_len_k // 4 * 2, - max_seq_len_k // 4 * 3, - max_seq_len_k // 4 * 4 - ]) + q_lengths = torch.tensor( + [ + max_seq_len_q // 4, + max_seq_len_q // 4 * 2, + max_seq_len_q // 4 * 3, + max_seq_len_q // 4 * 4, + ] + ) + k_lengths = torch.tensor( + [ + max_seq_len_k // 4, + max_seq_len_k // 4 * 2, + max_seq_len_k // 4 * 3, + max_seq_len_k // 4 * 4, + ] + ) q_offsets, k_offsets = ( torch.cat((torch.tensor([0]), torch.cumsum(lengths, dim=0))).cuda() for lengths in (q_lengths, k_lengths) ) - q_values = torch.randn(q_offsets[-1], head_dim * n_heads, dtype=dtype, requires_grad=True, device="cuda") - k_values = torch.randn(k_offsets[-1], head_dim * n_heads, dtype=dtype, requires_grad=True, device="cuda") - v_values = torch.randn(k_offsets[-1], head_dim_v * n_heads, dtype=dtype, requires_grad=True, device="cuda") + q_values = torch.randn( + q_offsets[-1], + head_dim * n_heads, + dtype=dtype, + requires_grad=True, + device="cuda", + ) + 
k_values = torch.randn( + k_offsets[-1], + head_dim * n_heads, + dtype=dtype, + requires_grad=True, + device="cuda", + ) + v_values = torch.randn( + k_offsets[-1], + head_dim_v * n_heads, + dtype=dtype, + requires_grad=True, + device="cuda", + ) q = torch.nested.nested_tensor_from_jagged(q_values, q_offsets) k = torch.nested.nested_tensor_from_jagged(k_values, k_offsets) @@ -397,13 +536,16 @@ def split_tensor(x): y.unsqueeze(0).transpose(1, 2).detach().requires_grad_(True) for y in x.transpose(1, 2).unbind(0) ) + q_tensors = split_tensor(q) k_tensors = split_tensor(k) v_tensors = split_tensor(v) flops = 0 for q_i, k_i, v_i in zip(q_tensors, k_tensors, v_tensors): - flops += get_flops(q_i, k_i, v_i, backend=backend, with_backward=with_backward) + flops += get_flops( + q_i, k_i, v_i, backend=backend, with_backward=with_backward + ) return flops @@ -429,29 +571,77 @@ def split_tensor(x): } self.assertEqual( - get_dense_flops(*get_nested_inputs(**uniform_config), backend="flash", with_backward=False), - get_flops(*get_nested_inputs(**uniform_config), backend="flash", with_backward=False), + get_dense_flops( + *get_nested_inputs(**uniform_config), + backend="flash", + with_backward=False, + ), + get_flops( + *get_nested_inputs(**uniform_config), + backend="flash", + with_backward=False, + ), ) self.assertEqual( - get_dense_flops(*get_nested_inputs(**uniform_config), backend="mem_efficient", with_backward=False), - get_flops(*get_nested_inputs(**uniform_config), backend="mem_efficient", with_backward=False), + get_dense_flops( + *get_nested_inputs(**uniform_config), + backend="mem_efficient", + with_backward=False, + ), + get_flops( + *get_nested_inputs(**uniform_config), + backend="mem_efficient", + with_backward=False, + ), ) self.assertEqual( - get_dense_flops(*get_nested_inputs(**differing_config), backend="mem_efficient", with_backward=False), - get_flops(*get_nested_inputs(**differing_config), backend="mem_efficient", with_backward=False), + get_dense_flops( + *get_nested_inputs(**differing_config), + backend="mem_efficient", + with_backward=False, + ), + get_flops( + *get_nested_inputs(**differing_config), + backend="mem_efficient", + with_backward=False, + ), ) self.assertEqual( - get_dense_flops(*get_nested_inputs(**uniform_config), backend="flash", with_backward=True), - get_flops(*get_nested_inputs(**uniform_config), backend="flash", with_backward=True), + get_dense_flops( + *get_nested_inputs(**uniform_config), + backend="flash", + with_backward=True, + ), + get_flops( + *get_nested_inputs(**uniform_config), + backend="flash", + with_backward=True, + ), ) self.assertEqual( - get_dense_flops(*get_nested_inputs(**uniform_config), backend="mem_efficient", with_backward=True), - get_flops(*get_nested_inputs(**uniform_config), backend="mem_efficient", with_backward=True), + get_dense_flops( + *get_nested_inputs(**uniform_config), + backend="mem_efficient", + with_backward=True, + ), + get_flops( + *get_nested_inputs(**uniform_config), + backend="mem_efficient", + with_backward=True, + ), ) self.assertEqual( - get_dense_flops(*get_nested_inputs(**differing_config), backend="mem_efficient", with_backward=True), - get_flops(*get_nested_inputs(**differing_config), backend="mem_efficient", with_backward=True), + get_dense_flops( + *get_nested_inputs(**differing_config), + backend="mem_efficient", + with_backward=True, + ), + get_flops( + *get_nested_inputs(**differing_config), + backend="mem_efficient", + with_backward=True, + ), ) def test_addmm_out(self): @@ -479,8 +669,8 @@ def 
test_hook_registration(self): def test_pytrees(self): class Foo(torch.nn.Module): def forward(self, x): - x = x['a'].relu_() - return {'a': torch.mm(x, x)} + x = x["a"].relu_() + return {"a": torch.mm(x, x)} class Mod(torch.nn.Module): def __init__(self): @@ -493,8 +683,12 @@ def forward(self, x): mod = Mod() with FlopCounterMode() as mode: - mod({'a': torch.randn(10, 10, requires_grad=True).clone()})['a'].sum().backward() - self.assertExpectedInline((mode.flop_counts['Mod'][torch.ops.aten.mm]), """12000""") + mod({"a": torch.randn(10, 10, requires_grad=True).clone()})[ + "a" + ].sum().backward() + self.assertExpectedInline( + (mode.flop_counts["Mod"][torch.ops.aten.mm]), """12000""" + ) class Mod2(torch.nn.Module): def forward(self, x): @@ -503,7 +697,9 @@ def forward(self, x): mod = Mod2() with FlopCounterMode() as mode: mod(torch.randn(10, 10, requires_grad=True))[0].sum().backward() - self.assertExpectedInline((mode.flop_counts['Mod2'][torch.ops.aten.mm]), """6000""") + self.assertExpectedInline( + (mode.flop_counts["Mod2"][torch.ops.aten.mm]), """6000""" + ) def test_warning(self): mod = torch.nn.Linear(2, 2) @@ -511,5 +707,5 @@ def test_warning(self): FlopCounterMode(mod) -if __name__ == '__main__': +if __name__ == "__main__": run_tests() diff --git a/test/test_function_schema.py b/test/test_function_schema.py index 47586147dbbcb..439a3c66d3f02 100644 --- a/test/test_function_schema.py +++ b/test/test_function_schema.py @@ -1,8 +1,8 @@ # Owner(s): ["module: unknown"] import torch -from torch.testing._internal.common_utils import TestCase, run_tests from torch._C import parse_schema +from torch.testing._internal.common_utils import run_tests, TestCase class TestFunctionSchema(TestCase): @@ -16,216 +16,306 @@ def test_serialize_and_deserialize(self): self.assertTrue(parsed_schema.is_backward_compatible_with(schema)) def test_out_schema(self): - schema_with_out = parse_schema('any.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)') + schema_with_out = parse_schema( + "any.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)" + ) self.assertTrue(schema_with_out.arguments[-1].is_out) - schema_without_out = parse_schema('any.not_out(Tensor self, Tensor b) -> Tensor') + schema_without_out = parse_schema( + "any.not_out(Tensor self, Tensor b) -> Tensor" + ) self.assertFalse(schema_without_out.arguments[-1].is_out) def test_hash_schema(self): - schema1 = parse_schema('any.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)') - schema2 = parse_schema('any.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)') + schema1 = parse_schema("any.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)") + schema2 = parse_schema("any.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)") self.assertEqual(hash(schema1), hash(schema2)) - schema3 = parse_schema('any.not_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)') + schema3 = parse_schema( + "any.not_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)" + ) self.assertNotEqual(hash(schema2), hash(schema3)) - schema4 = parse_schema('foo(Tensor self, *, int a, Tensor(a!) out) -> Tensor(a!)') + schema4 = parse_schema( + "foo(Tensor self, *, int a, Tensor(a!) 
out) -> Tensor(a!)" + ) self.assertNotEqual(hash(schema2), hash(schema4)) # schemas with different default value, or different kw-only arg, should have different hash - default_val_schema0 = parse_schema('foo(Tensor self, int a = 2) -> Tensor(a!)') - default_val_schema1 = parse_schema('foo(Tensor self, int a = 3) -> Tensor(a!)') - default_val_schema2 = parse_schema('foo(Tensor self, *, int a = 2) -> Tensor(a!)') + default_val_schema0 = parse_schema("foo(Tensor self, int a = 2) -> Tensor(a!)") + default_val_schema1 = parse_schema("foo(Tensor self, int a = 3) -> Tensor(a!)") + default_val_schema2 = parse_schema( + "foo(Tensor self, *, int a = 2) -> Tensor(a!)" + ) self.assertNotEqual(hash(default_val_schema0), hash(default_val_schema1)) self.assertNotEqual(hash(default_val_schema0), hash(default_val_schema2)) # schema with different alias annotation should have different hash - alias_schema = parse_schema('foo(Tensor(a!) self, int a = 2) -> Tensor(a!)') + alias_schema = parse_schema("foo(Tensor(a!) self, int a = 2) -> Tensor(a!)") self.assertNotEqual(hash(default_val_schema0), hash(alias_schema)) - alias_schema2 = parse_schema('foo(Tensor(b!) self, int a = 2) -> Tensor(a!)') + alias_schema2 = parse_schema("foo(Tensor(b!) self, int a = 2) -> Tensor(a!)") self.assertNotEqual(hash(alias_schema), hash(alias_schema2)) # schema with different alias infos - alias_schema3 = parse_schema('foo(Tensor self, *, int a, int b=1, Tensor(a!) out, Tensor(b!) b) -> Tensor(a!)') - alias_schema4 = parse_schema('foo(Tensor self, *, int a, int b=1, Tensor(a!) out, Tensor(b!) b) -> Tensor(b!)') - alias_schema5 = parse_schema('foo(Tensor self, *, int a, int b=1, Tensor(b!) out, Tensor(a!) b) -> Tensor(a!)') + alias_schema3 = parse_schema( + "foo(Tensor self, *, int a, int b=1, Tensor(a!) out, Tensor(b!) b) -> Tensor(a!)" + ) + alias_schema4 = parse_schema( + "foo(Tensor self, *, int a, int b=1, Tensor(a!) out, Tensor(b!) b) -> Tensor(b!)" + ) + alias_schema5 = parse_schema( + "foo(Tensor self, *, int a, int b=1, Tensor(b!) out, Tensor(a!) b) -> Tensor(a!)" + ) self.assertNotEqual(hash(alias_schema3), hash(alias_schema4)) self.assertNotEqual(hash(alias_schema3), hash(alias_schema5)) def test_backward_compatible_structure(self): - old_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> Tensor') + old_schema = parse_schema("any.over(Tensor self, *, Tensor b) -> Tensor") # BC: A new schema without changes. - new_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> Tensor') + new_schema = parse_schema("any.over(Tensor self, *, Tensor b) -> Tensor") self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) # No-BC: A new schema with different name. - new_schema = parse_schema('any_.over(Tensor self, *, Tensor b) -> Tensor') + new_schema = parse_schema("any_.over(Tensor self, *, Tensor b) -> Tensor") self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) # No-BC: A new schema with different overload name. - new_schema = parse_schema('any.other(Tensor self, *, Tensor b) -> Tensor') + new_schema = parse_schema("any.other(Tensor self, *, Tensor b) -> Tensor") self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) # No-BC: A new schema that adds vararg. - new_schema = parse_schema('any.over(Tensor self, *, Tensor b, ...) -> Tensor') + new_schema = parse_schema("any.over(Tensor self, *, Tensor b, ...) -> Tensor") self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) # No-BC: A new schema with different number of outputs. 
- new_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> (Tensor, Tensor)') + new_schema = parse_schema( + "any.over(Tensor self, *, Tensor b) -> (Tensor, Tensor)" + ) self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) def test_backward_compatible_outputs(self): - old_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> Tensor') + old_schema = parse_schema("any.over(Tensor self, *, Tensor b) -> Tensor") # No-BC: A new schema with output becoming of optional type. - new_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> Tensor?') + new_schema = parse_schema("any.over(Tensor self, *, Tensor b) -> Tensor?") self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) # BC: (the opposite case) An schema where the output is not of optional type anymore. self.assertTrue(old_schema.is_backward_compatible_with(new_schema)) # No-BC: A new schema with a different output type. - new_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> int') + new_schema = parse_schema("any.over(Tensor self, *, Tensor b) -> int") self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) # No-BC: A new schema with a different output type. - new_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> Tensor out') + new_schema = parse_schema("any.over(Tensor self, *, Tensor b) -> Tensor out") self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) def test_backward_compatible_arguments(self): - old_schema = parse_schema('any(Tensor self, *, Tensor b, int c) -> Tensor') + old_schema = parse_schema("any(Tensor self, *, Tensor b, int c) -> Tensor") # No-BC: A new schema with less arguments. - new_schema = parse_schema('any(Tensor self, *, Tensor b) -> Tensor') + new_schema = parse_schema("any(Tensor self, *, Tensor b) -> Tensor") self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) # No-BC: A new schema with more arguments, appended, but no default value. - new_schema = parse_schema('any(Tensor self, *, Tensor b, int c, int d) -> Tensor') + new_schema = parse_schema( + "any(Tensor self, *, Tensor b, int c, int d) -> Tensor" + ) self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) # BC: A new schema with more arguments, appended, that have a default value. - new_schema = parse_schema('any(Tensor self, *, Tensor b, int c, int d=1) -> Tensor') + new_schema = parse_schema( + "any(Tensor self, *, Tensor b, int c, int d=1) -> Tensor" + ) self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) # No-BC: A new schema with more arguments, not-appended, that have a default value. - new_schema = parse_schema('any(Tensor self, int d=1, *, Tensor b, int c) -> Tensor') + new_schema = parse_schema( + "any(Tensor self, int d=1, *, Tensor b, int c) -> Tensor" + ) self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) # BC: A new schema where old kwargs becomes positional. - new_schema = parse_schema('any(Tensor self, Tensor b, *, int c) -> Tensor') + new_schema = parse_schema("any(Tensor self, Tensor b, *, int c) -> Tensor") self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) # BC: (the opposite case) A new schema where an old positional argument becomes kwarg. self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) # BC: A new schema where all old kwargs become positional. 
- new_schema = parse_schema('any(Tensor self, Tensor b, int c) -> Tensor') + new_schema = parse_schema("any(Tensor self, Tensor b, int c) -> Tensor") self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) # BC: (the opposite case) A new schema where all old positional arguments become kwarg. self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) # No-BC: A new schema where old kwargs appear in different order. - new_schema = parse_schema('any(Tensor self, *, int c, Tensor b) -> Tensor') + new_schema = parse_schema("any(Tensor self, *, int c, Tensor b) -> Tensor") self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) # BC: A new schema where argument becomes of type optional. - new_schema = parse_schema('any(Tensor self, *, Tensor b, int? c) -> Tensor') + new_schema = parse_schema("any(Tensor self, *, Tensor b, int? c) -> Tensor") self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) # BC: A new schema where argument gains a default value. - new_schema = parse_schema('any(Tensor self, *, Tensor b, int c=1) -> Tensor') + new_schema = parse_schema("any(Tensor self, *, Tensor b, int c=1) -> Tensor") self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) # No-BC: A new schema where argument is "renamed". - new_schema = parse_schema('any(Tensor self, *, Tensor b, int renamed) -> Tensor') + new_schema = parse_schema( + "any(Tensor self, *, Tensor b, int renamed) -> Tensor" + ) self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) # No-BC: A new schema where argument type changes to an incompatible type. - new_schema = parse_schema('any(Tensor self, *, Tensor b, int[] c) -> Tensor') + new_schema = parse_schema("any(Tensor self, *, Tensor b, int[] c) -> Tensor") self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) def test_backward_compatible_with_smart_serialization(self): # cases where out arg is provided - old_schema = parse_schema('foo(Tensor self, *, int a, Tensor(a!) out) -> Tensor(a!)') - new_schema_same_out = parse_schema('foo(Tensor self, *, int a, int b=1, Tensor(a!) out) -> Tensor(a!)') - new_schema_wrong_default = parse_schema('foo(Tensor self, *, int b=1, int a, Tensor(a!) out) -> Tensor(a!)') - new_schema_more_out = parse_schema('foo(Tensor self, *, int a, int b=1, Tensor(a!) out, Tensor(b!) b) -> Tensor(a!)') - new_schema_wrong_pos = parse_schema('foo(Tensor self, *, int a, int b=1, Tensor(b!) b, Tensor(a!) out) -> Tensor(a!)') + old_schema = parse_schema( + "foo(Tensor self, *, int a, Tensor(a!) out) -> Tensor(a!)" + ) + new_schema_same_out = parse_schema( + "foo(Tensor self, *, int a, int b=1, Tensor(a!) out) -> Tensor(a!)" + ) + new_schema_wrong_default = parse_schema( + "foo(Tensor self, *, int b=1, int a, Tensor(a!) out) -> Tensor(a!)" + ) + new_schema_more_out = parse_schema( + "foo(Tensor self, *, int a, int b=1, Tensor(a!) out, Tensor(b!) b) -> Tensor(a!)" + ) + new_schema_wrong_pos = parse_schema( + "foo(Tensor self, *, int a, int b=1, Tensor(b!) b, Tensor(a!) 
out) -> Tensor(a!)" + ) self.assertTrue(new_schema_same_out.is_backward_compatible_with(old_schema)) self.assertTrue(new_schema_more_out.is_backward_compatible_with(old_schema)) - self.assertFalse(new_schema_wrong_default.is_backward_compatible_with(old_schema)) + self.assertFalse( + new_schema_wrong_default.is_backward_compatible_with(old_schema) + ) self.assertFalse(new_schema_wrong_pos.is_backward_compatible_with(old_schema)) # cases where out arg is not provided - old_schema_without_arg = parse_schema('foo(Tensor self, int a, int b=1) -> int') - new_schema_without_arg = parse_schema('foo(Tensor self, int a, int b=1, int c=2) -> int') - new_schema_without_arg_multiple_default = parse_schema('foo(Tensor self, int a, int b=1, int c=2, int d=3) -> int') - new_schema_without_arg_wrong_pos = parse_schema('foo(Tensor self, int a, int c=2, int b=1) -> int') - self.assertTrue(new_schema_without_arg.is_backward_compatible_with(old_schema_without_arg)) - self.assertTrue(new_schema_without_arg_multiple_default.is_backward_compatible_with(old_schema_without_arg)) - self.assertFalse(new_schema_without_arg_wrong_pos.is_backward_compatible_with(old_schema_without_arg)) + old_schema_without_arg = parse_schema("foo(Tensor self, int a, int b=1) -> int") + new_schema_without_arg = parse_schema( + "foo(Tensor self, int a, int b=1, int c=2) -> int" + ) + new_schema_without_arg_multiple_default = parse_schema( + "foo(Tensor self, int a, int b=1, int c=2, int d=3) -> int" + ) + new_schema_without_arg_wrong_pos = parse_schema( + "foo(Tensor self, int a, int c=2, int b=1) -> int" + ) + self.assertTrue( + new_schema_without_arg.is_backward_compatible_with(old_schema_without_arg) + ) + self.assertTrue( + new_schema_without_arg_multiple_default.is_backward_compatible_with( + old_schema_without_arg + ) + ) + self.assertFalse( + new_schema_without_arg_wrong_pos.is_backward_compatible_with( + old_schema_without_arg + ) + ) def test_string_optional_parameter_default_value(self): - schema_a = parse_schema("example::op(str? order=\"NCHW\") -> (Tensor)") + schema_a = parse_schema('example::op(str? 
order="NCHW") -> (Tensor)') schema_b = parse_schema(str(schema_a)) self.assertEqual(schema_a, schema_b) def test_forward_compatible_arguments_without_out(self): - old_schema = parse_schema('any(Tensor self, int a, int b=1) -> Tensor') + old_schema = parse_schema("any(Tensor self, int a, int b=1) -> Tensor") # deleting default arg is FC compatible - new_schema = parse_schema('any(Tensor self, int a) -> Tensor') + new_schema = parse_schema("any(Tensor self, int a) -> Tensor") is_fc, _ = new_schema.check_forward_compatible_with(old_schema) self.assertTrue(is_fc) # adding default arg is FC compatible - new_schema = parse_schema('any(Tensor self, int a, int b=1, int c=1) -> Tensor') + new_schema = parse_schema("any(Tensor self, int a, int b=1, int c=1) -> Tensor") is_fc, _ = new_schema.check_forward_compatible_with(old_schema) self.assertTrue(is_fc) # adding default arg with container type is NOT FC compatible - new_schema = parse_schema('any(Tensor self, int a, int b=1, int[2] c=1) -> Tensor') + new_schema = parse_schema( + "any(Tensor self, int a, int b=1, int[2] c=1) -> Tensor" + ) is_fc, reason = new_schema.check_forward_compatible_with(old_schema) self.assertFalse(is_fc) - self.assertEqual(reason, "Function schema is not forward compatible since the new argument" - " \'c\' of type int[] has a container type as its default value.") + self.assertEqual( + reason, + "Function schema is not forward compatible since the new argument" + " 'c' of type int[] has a container type as its default value.", + ) # updating the default value of a default arg is NOT FC compatible - new_schema = parse_schema('any(Tensor self, int a, int b=4) -> Tensor') + new_schema = parse_schema("any(Tensor self, int a, int b=4) -> Tensor") is_fc, reason = new_schema.check_forward_compatible_with(old_schema) self.assertFalse(is_fc) - self.assertEqual(reason, "\'b\' is not forward compatible with the older version of the schema") + self.assertEqual( + reason, "'b' is not forward compatible with the older version of the schema" + ) # updating the arg name of a default arg is NOT FC compatible - new_schema = parse_schema('any(Tensor self, int a, int c=1) -> Tensor') + new_schema = parse_schema("any(Tensor self, int a, int c=1) -> Tensor") is_fc, reason = new_schema.check_forward_compatible_with(old_schema) self.assertFalse(is_fc) - self.assertEqual(reason, "\'c\' is not forward compatible with the older version of the schema") + self.assertEqual( + reason, "'c' is not forward compatible with the older version of the schema" + ) # not adding default arg in the end is NOT FC compatible - new_schema = parse_schema('any(Tensor self, int a, int c=1, int b=1) -> Tensor') + new_schema = parse_schema("any(Tensor self, int a, int c=1, int b=1) -> Tensor") is_fc, reason = new_schema.check_forward_compatible_with(old_schema) self.assertFalse(is_fc) - self.assertEqual(reason, "\'c\' is not forward compatible with the older version of the schema") + self.assertEqual( + reason, "'c' is not forward compatible with the older version of the schema" + ) # making default arg into positional arg is NOT FC compatible - new_schema = parse_schema('any(Tensor self, int a, int b) -> Tensor') + new_schema = parse_schema("any(Tensor self, int a, int b) -> Tensor") is_fc, reason = new_schema.check_forward_compatible_with(old_schema) self.assertFalse(is_fc) - self.assertEqual(reason, "\'b\' is not forward compatible with the older version of the schema") + self.assertEqual( + reason, "'b' is not forward compatible with the older version of the schema" + 
) # making positional arg into default arg is NOT FC compatible - new_schema = parse_schema('any(Tensor self, int a=1, int b=1) -> Tensor') + new_schema = parse_schema("any(Tensor self, int a=1, int b=1) -> Tensor") is_fc, reason = new_schema.check_forward_compatible_with(old_schema) self.assertFalse(is_fc) - self.assertEqual(reason, "\'a\' is not forward compatible with the older version of the schema") + self.assertEqual( + reason, "'a' is not forward compatible with the older version of the schema" + ) def test_forward_compatible_arguments_real_use_case(self): # this change introduced forward incompatibility in the past - old_slice_schema = parse_schema('slice(Tensor(a) self, int dim=0, int start=0, int end=0, int step=1) -> Tensor(a)') - new_slice_schema = parse_schema('slice(Tensor(a) self, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor(a)') + old_slice_schema = parse_schema( + "slice(Tensor(a) self, int dim=0, int start=0, int end=0, int step=1) -> Tensor(a)" + ) + new_slice_schema = parse_schema( + "slice(Tensor(a) self, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor(a)" + ) is_fc, reason = new_slice_schema.check_forward_compatible_with(old_slice_schema) self.assertFalse(is_fc) - self.assertEqual(reason, "\'start\' is not forward compatible with the older version of the schema") + self.assertEqual( + reason, + "'start' is not forward compatible with the older version of the schema", + ) def test_forward_compatible_arguments_with_out(self): - old_schema = parse_schema('any(Tensor self, *, int a, int b=1, Tensor(a!) out) -> Tensor(a!)') - new_schema = parse_schema('any(Tensor self, *, int a, Tensor(a!) out) -> Tensor(a!)') + old_schema = parse_schema( + "any(Tensor self, *, int a, int b=1, Tensor(a!) out) -> Tensor(a!)" + ) + new_schema = parse_schema( + "any(Tensor self, *, int a, Tensor(a!) out) -> Tensor(a!)" + ) is_fc, _ = new_schema.check_forward_compatible_with(old_schema) self.assertTrue(is_fc) - new_schema = parse_schema('any(Tensor self, *, int a, int b=1, int c=1, Tensor(a!) out) -> Tensor(a!)') + new_schema = parse_schema( + "any(Tensor self, *, int a, int b=1, int c=1, Tensor(a!) out) -> Tensor(a!)" + ) is_fc, _ = new_schema.check_forward_compatible_with(old_schema) self.assertTrue(is_fc) - new_schema = parse_schema('any(Tensor self, *, int a, Tensor(d!) d, int b=1, Tensor(a!) out) -> Tensor(a!)') + new_schema = parse_schema( + "any(Tensor self, *, int a, Tensor(d!) d, int b=1, Tensor(a!) out) -> Tensor(a!)" + ) is_fc, reason = new_schema.check_forward_compatible_with(old_schema) self.assertFalse(is_fc) - self.assertEqual(reason, "Function schema should have the same number of out arguments") + self.assertEqual( + reason, "Function schema should have the same number of out arguments" + ) def test_schema_error(self): - with self.assertRaisesRegex(RuntimeError, r"schemas with vararg \(...\) can't have default value args"): + with self.assertRaisesRegex( + RuntimeError, r"schemas with vararg \(...\) can't have default value args" + ): schema = parse_schema("any.foo(int arg1, int arg2=0, ...)") def test_tensor_list_alias_annotation_properly_parsed(self): - schema_str = 'foo(Tensor self, *, Tensor(a!)[] out) -> ()' + schema_str = "foo(Tensor self, *, Tensor(a!)[] out) -> ()" schema = parse_schema(schema_str) self.assertTrue(schema.arguments[-1].alias_info.is_write) self.assertEqual(str(schema), schema_str) def test_tensor_option_arguments_properly_parsed(self): - schema_str = '_to_copy(Tensor self, *, ScalarType? dtype=None, Layout? 
layout=None, Device? device=None, ' \ - 'bool? pin_memory=None, bool non_blocking=False, MemoryFormat? memory_format=None) -> Tensor' + schema_str = ( + "_to_copy(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, " + "bool? pin_memory=None, bool non_blocking=False, MemoryFormat? memory_format=None) -> Tensor" + ) schema = parse_schema(schema_str) # fake type of MemoryFormat? is int? self.assertEqual(schema.arguments[-1].type.str(), "int?") @@ -237,7 +327,7 @@ def test_tensor_option_arguments_properly_parsed(self): self.assertEqual(str(schema), schema_str) def test_sym_int_argument_properly_parsed(self): - schema_str = 'sym_size.int(Tensor self, int dim) -> SymInt' + schema_str = "sym_size.int(Tensor self, int dim) -> SymInt" schema = parse_schema(schema_str) # fake type of SymInt is int self.assertEqual(schema.returns[-1].type.str(), "int") @@ -247,5 +337,5 @@ def test_sym_int_argument_properly_parsed(self): self.assertEqual(str(schema), schema_str) -if __name__ == '__main__': +if __name__ == "__main__": run_tests() diff --git a/test/test_functional_autograd_benchmark.py b/test/test_functional_autograd_benchmark.py index 57a67ccead89b..b0141479dd38a 100644 --- a/test/test_functional_autograd_benchmark.py +++ b/test/test_functional_autograd_benchmark.py @@ -1,14 +1,21 @@ # Owner(s): ["module: autograd"] -from torch.testing._internal.common_utils import TestCase, run_tests, slowTest, IS_WINDOWS +import os import subprocess import tempfile -import os import unittest +from torch.testing._internal.common_utils import ( + IS_WINDOWS, + run_tests, + slowTest, + TestCase, +) + PYTORCH_COLLECT_COVERAGE = bool(os.environ.get("PYTORCH_COLLECT_COVERAGE")) + # This is a very simple smoke test for the functional autograd benchmarking script. class TestFunctionalAutogradBenchmark(TestCase): def _test_runner(self, model, disable_gpu=False): @@ -17,18 +24,20 @@ def _test_runner(self, model, disable_gpu=False): # is not allowed to open it again. As this is a simple smoke test, we choose for now # not to run this on windows and keep the code here simple. 
with tempfile.NamedTemporaryFile() as out_file: - cmd = ['python3', - '../benchmarks/functional_autograd_benchmark/functional_autograd_benchmark.py'] + cmd = [ + "python3", + "../benchmarks/functional_autograd_benchmark/functional_autograd_benchmark.py", + ] # Only run the warmup - cmd += ['--num-iters', '0'] + cmd += ["--num-iters", "0"] # Only run the vjp task (fastest one) - cmd += ['--task-filter', 'vjp'] + cmd += ["--task-filter", "vjp"] # Only run the specified model - cmd += ['--model-filter', model] + cmd += ["--model-filter", model] # Output file - cmd += ['--output', out_file.name] + cmd += ["--output", out_file.name] if disable_gpu: - cmd += ['--gpu', '-1'] + cmd += ["--gpu", "-1"] res = subprocess.run(cmd) @@ -37,20 +46,34 @@ def _test_runner(self, model, disable_gpu=False): out_file.seek(0, os.SEEK_END) self.assertTrue(out_file.tell() > 0) - - @unittest.skipIf(IS_WINDOWS, "NamedTemporaryFile on windows does not have all the features we need.") - @unittest.skipIf(PYTORCH_COLLECT_COVERAGE, "Can deadlocks with gcov, see https://github.com/pytorch/pytorch/issues/49656") + @unittest.skipIf( + IS_WINDOWS, + "NamedTemporaryFile on windows does not have all the features we need.", + ) + @unittest.skipIf( + PYTORCH_COLLECT_COVERAGE, + "Can deadlocks with gcov, see https://github.com/pytorch/pytorch/issues/49656", + ) def test_fast_tasks(self): - fast_tasks = ['resnet18', 'ppl_simple_reg', 'ppl_robust_reg', 'wav2letter', - 'transformer', 'multiheadattn'] + fast_tasks = [ + "resnet18", + "ppl_simple_reg", + "ppl_robust_reg", + "wav2letter", + "transformer", + "multiheadattn", + ] for task in fast_tasks: self._test_runner(task) @slowTest - @unittest.skipIf(IS_WINDOWS, "NamedTemporaryFile on windows does not have all the features we need.") + @unittest.skipIf( + IS_WINDOWS, + "NamedTemporaryFile on windows does not have all the features we need.", + ) def test_slow_tasks(self): - slow_tasks = ['fcn_resnet', 'detr'] + slow_tasks = ["fcn_resnet", "detr"] # deepspeech is voluntarily excluded as it takes too long to run without # proper tuning of the number of threads it should use. 
@@ -59,5 +82,5 @@ def test_slow_tasks(self): self._test_runner(task, disable_gpu=True) -if __name__ == '__main__': +if __name__ == "__main__": run_tests() diff --git a/test/test_functional_optim.py b/test/test_functional_optim.py index da3d40d305e34..5e2a1e67e0159 100644 --- a/test/test_functional_optim.py +++ b/test/test_functional_optim.py @@ -1,15 +1,16 @@ # Owner(s): ["oncall: distributed"] -from typing import List, Optional, Tuple import unittest +from typing import List, Optional, Tuple import torch import torch.distributed import torch.nn as nn import torch.nn.functional as F from torch import Tensor -from torch.optim import SGD, Adam, AdamW -from torch.testing._internal.common_utils import TestCase, run_tests +from torch.optim import Adam, AdamW, SGD +from torch.testing._internal.common_utils import run_tests, TestCase + class MyModule(torch.nn.Module): def __init__(self): @@ -21,6 +22,7 @@ def __init__(self): def forward(self, t1): return self.lin2(F.relu(self.lin1(t1))) + # dummy class to showcase custom optimizer registration with functional wrapper class MyDummyFnOptimizer: def __init__( @@ -32,7 +34,6 @@ def __init__( weight_decay: float = 0.0, _allow_empty_param_list: bool = False, ): - if not 0.0 <= lr: raise ValueError(f"Invalid learning rate: {lr}") if not 0.0 <= eps: @@ -58,17 +59,26 @@ def __init__( def step_param(self, param: Tensor, grad: Optional[Tensor]): # call the custom optimizer step_param implementation with torch.no_grad(): - raise RuntimeError("MyDummyFnOptimizer does not support step_param() as of now") + raise RuntimeError( + "MyDummyFnOptimizer does not support step_param() as of now" + ) def step(self, gradients: List[Optional[Tensor]]): # call the custom optimizer step implementation with torch.no_grad(): raise RuntimeError("MyDummyFnOptimizer does not support step() as of now") + if torch.distributed.is_available(): - from torch.distributed.optim.utils import functional_optim_map, register_functional_optim + from torch.distributed.optim.utils import ( + functional_optim_map, + register_functional_optim, + ) + -@unittest.skipIf(not torch.distributed.is_available(), "These are testing distributed functions") +@unittest.skipIf( + not torch.distributed.is_available(), "These are testing distributed functions" +) class TestFunctionalOptimParity(TestCase): def _validate_parameters(self, params_1, params_2): for p1, p2 in zip(params_1, params_2): diff --git a/test/test_functionalization_of_rng_ops.py b/test/test_functionalization_of_rng_ops.py index b2ac62e4f2786..bba22ff34a0b0 100644 --- a/test/test_functionalization_of_rng_ops.py +++ b/test/test_functionalization_of_rng_ops.py @@ -1,36 +1,34 @@ # Owner(s): ["oncall: pt2"] +import functools import sys import unittest -import torch -from torch.testing._internal.common_utils import ( - TestCase, - run_tests, -) - -from torch.testing._internal.common_device_type import instantiate_device_type_tests, dtypes -from functorch.compile import aot_function, nop, min_cut_rematerialization_partition from unittest.mock import patch -import functools -import torch.utils.checkpoint +import torch +import torch.utils.checkpoint +from functorch.compile import aot_function, min_cut_rematerialization_partition, nop -from torch.testing._internal.common_utils import ( - IS_CI, - IS_WINDOWS, +from torch.testing._internal.common_device_type import ( + dtypes, + instantiate_device_type_tests, ) +from torch.testing._internal.common_utils import IS_CI, IS_WINDOWS, run_tests, TestCase + if IS_WINDOWS and IS_CI: - sys.stderr.write( - 
"torch.compile not supported on windows" - ) + sys.stderr.write("torch.compile not supported on windows") if __name__ == "__main__": sys.exit(0) raise unittest.SkipTest("torch.compile not supported on windows") + def count_philox_rand(gm, args, freq): - assert [node.target for node in gm.graph.nodes].count(torch.ops.rngprims.philox_rand.default) == freq + assert [node.target for node in gm.graph.nodes].count( + torch.ops.rngprims.philox_rand.default + ) == freq return gm + class TestFunctionalizationRngOps(TestCase): @dtypes(torch.float32) @patch.object(torch._functorch.config, "functionalize_rng_ops", True) @@ -72,8 +70,6 @@ def fn(x): self.assertEqual(ref, res) - - @dtypes(torch.float32) @patch.object(torch._functorch.config, "functionalize_rng_ops", True) def test_rand_like_dynamic_bwd(self, dtype, device): @@ -96,7 +92,6 @@ def fn(x): self.assertEqual(ref, res) - @dtypes(torch.float32) @patch.object(torch._functorch.config, "functionalize_rng_ops", True) def test_rand(self, dtype, device): @@ -134,7 +129,7 @@ def forward(ctx, x): @staticmethod def backward(ctx, grad_out): - x, = ctx.saved_tensors + (x,) = ctx.saved_tensors return grad_out * torch.rand_like(grad_out) * torch.cos(x) custom = Custom.apply @@ -174,7 +169,7 @@ def forward(ctx, x): @staticmethod def backward(ctx, grad_out): - x, = ctx.saved_tensors + (x,) = ctx.saved_tensors return grad_out * torch.rand_like(grad_out) * torch.cos(x) class CustomOp2(torch.autograd.Function): @@ -186,10 +181,9 @@ def forward(ctx, x): @staticmethod def backward(ctx, grad_out): - x, = ctx.saved_tensors + (x,) = ctx.saved_tensors return grad_out * torch.rand_like(grad_out) * torch.rand_like(x) - custom_op1 = CustomOp1.apply custom_op2 = CustomOp2.apply @@ -210,7 +204,6 @@ def aot_fn(x): b = a.sin() return aot_custom_op2(b) - for seed in range(10): torch.cuda.manual_seed(seed) x = torch.rand(*shape, device=device, dtype=dtype, requires_grad=True) @@ -265,7 +258,6 @@ def fn(x): a = torch.sin(a) return a - x = torch.rand(*shape, device=device, dtype=dtype, requires_grad=True) x_clone = x.clone().detach().requires_grad_(True) @@ -277,7 +269,12 @@ def fn(x): torch.cuda.manual_seed(123) fwd_compiler = functools.partial(count_philox_rand, freq=2) bwd_compiler = functools.partial(count_philox_rand, freq=0) - aot_custom = aot_function(fn, fwd_compiler, bwd_compiler, partition_fn=min_cut_rematerialization_partition) + aot_custom = aot_function( + fn, + fwd_compiler, + bwd_compiler, + partition_fn=min_cut_rematerialization_partition, + ) # aot_custom = aot_function(fn, fwd_compiler, bwd_compiler) res = aot_custom(x_clone) res.sum().backward() diff --git a/test/test_optim.py b/test/test_optim.py index 13484b1d7876c..f875b4ed669ee 100644 --- a/test/test_optim.py +++ b/test/test_optim.py @@ -911,8 +911,6 @@ def test_fused_large_tensor(self, device, dtype, optim_info): @onlyCUDA @optims([optim for optim in optim_db if "fused" in optim.supported_impls], dtypes=[torch.float32]) def test_fused_does_not_step_if_foundinf(self, device, dtype, optim_info): - if device not in optim_info.supports_fused_on: - self.skipTest(f"{device} is not supported for fused on {optim_info.optim_cls.__name__}") optim_cls = optim_info.optim_cls optim_inputs = optim_info.optim_inputs_func(device=device) num_params = 5 @@ -942,12 +940,9 @@ def test_cpu_load_state_dict(self, device, dtype, impl, optim_info): # Since this is a unit test, it is more expedient to simulate what the state_dict # would look like, which is basically CPU tensors with fused/capturable flag = True. 
optim_cls = optim_info.optim_cls - opt_name = optim_cls.__name__ - if opt_name in ("SGD", "Adagrad", ) and impl == "capturable": - # Capturable SGD/Adagrad does not exist + if optim_cls.__name__ == "SGD" and impl == "capturable": + # Capturable SGD does not exist self.skipTest("SGD does not currently support capturable") - if impl == "fused" and device not in optim_info.supports_fused_on: - self.skipTest(f"{device} is not supported for fused on {opt_name}") cpu_optim_inputs = optim_info.optim_inputs_func(device="cpu") for optim_input in cpu_optim_inputs: @@ -1323,8 +1318,6 @@ def closure(): return closure_loss if optim_info.step_requires_closure else None for optim_input in cpu_optim_inputs: - if "fused" in optim_input.kwargs and "cuda" not in optim_info.supports_fused_on: - self.skipTest(f"cuda is not supported for fused on {optim_cls.__name__}") params = [Parameter(torch.randn(2, 3, device="cpu", dtype=dtype)) for _ in range(2)] for p in params: p.grad = torch.randn_like(p) diff --git a/test/test_serialization.py b/test/test_serialization.py index 2f7e6babdecfb..49f8880885ec4 100644 --- a/test/test_serialization.py +++ b/test/test_serialization.py @@ -26,7 +26,7 @@ from torch.testing._internal.common_utils import ( IS_FILESYSTEM_UTF8_ENCODING, TemporaryDirectoryName, - TestCase, IS_WINDOWS, TEST_DILL, run_tests, download_file, BytesIOContext, TemporaryFileName, + TestCase, IS_FBCODE, IS_WINDOWS, TEST_DILL, run_tests, download_file, BytesIOContext, TemporaryFileName, parametrize, instantiate_parametrized_tests, AlwaysWarnTypedStorageRemoval, serialTest) from torch.testing._internal.common_device_type import instantiate_device_type_tests from torch.testing._internal.common_dtype import all_types_and_complex_and @@ -4000,6 +4000,51 @@ def test_serialization_dtype(self, dtype, weights_only): y['even'][0] = torch.tensor(-0.25, dtype=dtype) self.assertEqual(y['x'][:2].to(dtype=torch.float32), torch.tensor([-0.25, 0.25])) + @parametrize('filename', (True, False)) + @unittest.skipIf(IS_WINDOWS, "NamedTemporaryFile on windows") + @unittest.skipIf(IS_FBCODE, "miniz version differs between fbcode and oss") + def test_filewriter_metadata_writing(self, filename): + sd = torch.nn.Linear(3, 5).state_dict() + weight_nbytes = sd['weight'].untyped_storage().nbytes() + bias_nbytes = sd['bias'].untyped_storage().nbytes() + # TemporaryFileName will give a string + # NamedTemporaryFile will be treated as a buffer + file_creation_func = TemporaryFileName if filename else tempfile.NamedTemporaryFile + + with file_creation_func() as f, file_creation_func() as g: + # save state_dict in f + torch.save(sd, f) + if not filename: + f.seek(0) + # extract 'data.pkl' for use in our fake checkpoint + with torch.serialization._open_file_like(f, 'rb') as opened_file: + with torch.serialization._open_zipfile_reader(opened_file) as zip_file: + data_file = io.BytesIO(zip_file.get_record('data.pkl')) + data_0_offset = zip_file.get_record_offset('data/0') + data_1_offset = zip_file.get_record_offset('data/1') + + # write nulls for 'data/0' and 'data/1' + with open(f if filename else f.name, 'rb+') as opened_f: + opened_f.seek(data_0_offset) + opened_f.write(b'0' * weight_nbytes) + opened_f.seek(data_1_offset) + opened_f.write(b'0' * bias_nbytes) + + with torch.serialization._open_zipfile_writer(g) as zip_file: + data_value = data_file.getvalue() + zip_file.write_record('data.pkl', data_value, len(data_value)) + zip_file.write_record('byteorder', sys.byteorder, len(sys.byteorder)) + # Only write metadata for storages + 
zip_file.write_record_metadata('data/0', weight_nbytes) + zip_file.write_record_metadata('data/1', bias_nbytes) + + if not filename: + f.seek(0) + g.seek(0) + sd_loaded = torch.load(g) + sd_loaded_ref = torch.load(f) + self.assertEqual(sd_loaded, sd_loaded_ref) + def run(self, *args, **kwargs): with serialization_method(use_zip=True): return super().run(*args, **kwargs) diff --git a/test/test_utils.py b/test/test_utils.py index b151b5141a280..66d66b8874f17 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1,37 +1,52 @@ # Owner(s): ["module: unknown"] -import sys import os +import random import re import shutil -import random import subprocess +import sys import tempfile -import traceback import textwrap +import traceback import unittest import warnings -from typing import Any, List, Dict +from typing import Any, Dict, List + import torch +import torch.cuda import torch.nn as nn +import torch.utils.cpp_extension import torch.utils.data -from torch.utils.data import DataLoader +from torch.autograd._functions.utils import check_onnx_broadcast +from torch.onnx.symbolic_opset9 import _prepare_onnx_paddings from torch.testing._internal.common_cuda import TEST_MULTIGPU from torch.testing._internal.common_device_type import ( - ops, - onlyCPU, instantiate_device_type_tests, + onlyCPU, + ops, ) from torch.testing._internal.common_methods_invocations import op_db -import torch.cuda -from torch.utils._pytree import tree_any, tree_all_only -from torch.utils.checkpoint import checkpoint, checkpoint_sequential, get_device_states, _infer_device_type +from torch.testing._internal.common_utils import ( # type: ignore[attr-defined] + IS_FBCODE, + IS_SANDCASTLE, + IS_WINDOWS, + load_tests, +) from torch.utils._device import set_device -from torch.utils._traceback import report_compile_source_on_error, format_traceback_short, CapturedTraceback -import torch.utils.cpp_extension -from torch.autograd._functions.utils import check_onnx_broadcast -from torch.onnx.symbolic_opset9 import _prepare_onnx_paddings -from torch.testing._internal.common_utils import load_tests, IS_FBCODE, IS_SANDCASTLE, IS_WINDOWS # type: ignore[attr-defined] +from torch.utils._pytree import tree_all_only, tree_any +from torch.utils._traceback import ( + CapturedTraceback, + format_traceback_short, + report_compile_source_on_error, +) +from torch.utils.checkpoint import ( + _infer_device_type, + checkpoint, + checkpoint_sequential, + get_device_states, +) +from torch.utils.data import DataLoader # load_tests from torch.testing._internal.common_utils is used to automatically filter tests for # sharding on sandcastle. This line silences flake warnings @@ -40,11 +55,10 @@ HAS_CUDA = torch.cuda.is_available() -from torch.testing._internal.common_utils import TestCase, run_tests +from torch.testing._internal.common_utils import run_tests, TestCase class RandomDatasetMock(torch.utils.data.Dataset): - def __getitem__(self, index): return torch.tensor([torch.rand(1).item(), random.uniform(0, 1)]) @@ -53,7 +67,6 @@ def __len__(self): class TestCheckpoint(TestCase): - # This runs checkpoint_sequential on each of the nets in # module_lists_to_compare, and compares them against the uncheckpointed model. # To compare, it checks outputs as well as input gradients and parameter gradients @@ -101,9 +114,7 @@ def _check_checkpoint_sequential( # Test whether checkpoint is being triggered or not. 
For this, we check # the number of times forward pass happens def test_checkpoint_trigger(self): - class Net(nn.Module): - def __init__(self): super().__init__() self.counter = 0 @@ -112,7 +123,7 @@ def forward(self, input_var): self.counter += 1 # For reentrant, need to have autograd actually # pack a tensor to trigger recomp - ret = input_var * torch.tensor(2.) + ret = input_var * torch.tensor(2.0) return ret # checkpointed @@ -122,13 +133,15 @@ def forward(self, input_var): for m in modules: self.assertEqual(m.counter, 0) input_var = torch.randn(3, 4, requires_grad=True) - out = checkpoint_sequential(modules, 2, input_var, use_reentrant=use_reentrant) + out = checkpoint_sequential( + modules, 2, input_var, use_reentrant=use_reentrant + ) for m in modules: self.assertEqual(m.counter, 1) out.sum().backward() - for m in modules[:(len(modules) // 2)]: + for m in modules[: (len(modules) // 2)]: self.assertEqual(m.counter, 2) - for m in modules[(len(modules) // 2):]: + for m in modules[(len(modules) // 2) :]: self.assertEqual(m.counter, 1) def test_checkpoint_valid(self): @@ -138,7 +151,7 @@ def test_checkpoint_valid(self): nn.Linear(50, 20), nn.ReLU(), nn.Linear(20, 5), - nn.ReLU() + nn.ReLU(), ) input_var = torch.randn(1, 100, requires_grad=True) @@ -147,20 +160,33 @@ def test_checkpoint_valid(self): chunks = 2 modules = list(model.children()) out = checkpoint_sequential(modules, chunks, input_var, use_reentrant=True) - with self.assertRaisesRegex(RuntimeError, "torch.utils.checkpoint is incompatible"): + with self.assertRaisesRegex( + RuntimeError, "torch.utils.checkpoint is incompatible" + ): torch.autograd.grad( - outputs=[out], grad_outputs=[torch.ones(1, 5)], inputs=[input_var], create_graph=True + outputs=[out], + grad_outputs=[torch.ones(1, 5)], + inputs=[input_var], + create_graph=True, ) # works with use_reentrant=False, and grads are the same out = model(input_var) grads_no_checkpoint = torch.autograd.grad( - outputs=[out], grad_outputs=[torch.ones(1, 5)], inputs=[input_var], create_graph=True, + outputs=[out], + grad_outputs=[torch.ones(1, 5)], + inputs=[input_var], + create_graph=True, + ) + out_checkpoint = checkpoint_sequential( + modules, chunks, input_var, use_reentrant=False ) - out_checkpoint = checkpoint_sequential(modules, chunks, input_var, use_reentrant=False) # check outputs are the same self.assertEqual(out_checkpoint, out) grads_checkpoint = torch.autograd.grad( - outputs=[out_checkpoint], grad_outputs=[torch.ones(1, 5)], inputs=[input_var], create_graph=True, + outputs=[out_checkpoint], + grad_outputs=[torch.ones(1, 5)], + inputs=[input_var], + create_graph=True, ) self.assertEqual(grads_no_checkpoint, grads_checkpoint) @@ -173,7 +199,7 @@ def test_checkpoint(self): nn.Linear(50, 20), nn.ReLU(), nn.Linear(20, 5), - nn.ReLU() + nn.ReLU(), ) # Compare uncheckpointed model with its checkpointed counterparts @@ -247,7 +273,7 @@ def forward(self): def test_checkpoint_rng_cpu(self): for _ in range(5): - inp = torch.randn(20000, device='cpu').requires_grad_() + inp = torch.randn(20000, device="cpu").requires_grad_() phase1 = torch.nn.Dropout() phase2 = torch.nn.Dropout() @@ -272,10 +298,10 @@ def run_fn(input): self.assertEqual(grad_with_checkpointing, grad_no_checkpointing) - @unittest.skipIf(not HAS_CUDA, 'No CUDA') + @unittest.skipIf(not HAS_CUDA, "No CUDA") def test_checkpoint_rng_cuda(self): for _ in range(5): - inp = torch.randn(20000, device='cuda').requires_grad_() + inp = torch.randn(20000, device="cuda").requires_grad_() phase1 = torch.nn.Dropout() phase2 = 
torch.nn.Dropout() @@ -300,9 +326,9 @@ def run_fn(input): self.assertEqual(grad_with_checkpointing, grad_no_checkpointing) - @unittest.skipIf(not HAS_CUDA, 'No CUDA') + @unittest.skipIf(not HAS_CUDA, "No CUDA") def test_checkpoint_not_preserve_rng_state_and_without_reentrant(self): - inp = torch.randn(2, device='cuda').requires_grad_() + inp = torch.randn(2, device="cuda").requires_grad_() layer = torch.nn.Dropout() def run_fn(input): @@ -312,9 +338,7 @@ def run_fn(input): out.sum().backward() # This should run without error - def test_checkpoint_non_tensor(self): - def run_fn(tensor1, tensor2): if tensor2 is None: return tensor1 @@ -349,7 +373,9 @@ def foo(t1, t2, scale, t3): res[1].sum().backward(retain_graph=True) res[4].sum().backward(retain_graph=True) res[6].sum().backward() - with self.assertRaisesRegex(RuntimeError, "Trying to backward through the graph a second time"): + with self.assertRaisesRegex( + RuntimeError, "Trying to backward through the graph a second time" + ): res[6].sum().backward() t1_grad = t1.grad t2_grad = t2.grad @@ -387,6 +413,7 @@ def test_checkpoint_partial_grad(self): def run_fn(tensor1, tensor2): # tensor 2 is used for other application logic return tensor1, tensor2 + input_var = torch.randn(1, 4, requires_grad=True) input_var2 = torch.randn(1, 4, requires_grad=False) out = checkpoint(run_fn, input_var, input_var2, use_reentrant=True) @@ -394,11 +421,12 @@ def run_fn(tensor1, tensor2): def run_fn2(tensor1, tensor2): return tensor1 + input_var = torch.randn(1, 4, requires_grad=False) input_var2 = torch.randn(1, 4, requires_grad=True) with self.assertRaisesRegex( RuntimeError, - r"none of output has requires_grad=True, this checkpoint\(\) is not necessary" + r"none of output has requires_grad=True, this checkpoint\(\) is not necessary", ): out = checkpoint(run_fn2, input_var, input_var2, use_reentrant=True) out.sum().backward() @@ -430,13 +458,13 @@ def hook(_unused): def test_fn(x): # The main property of this function is that it contains multiple # operations that save gradients in a chain. 
- x = x ** 2 + x = x**2 track(x, 2) - x = x ** 2 + x = x**2 track(x, 1) - x = x ** 2 + x = x**2 track(x, 0) - x = x ** 2 + x = x**2 return x.sum() fn(test_fn) @@ -450,20 +478,32 @@ def test_fn(x): non_retain_stats = _do_test(lambda fn: fn(x).backward(), True) # In a retain_grad backward, buffers get preserved - _unused_retain_stats = _do_test(lambda fn: fn(x).backward(retain_graph=True), False) + _unused_retain_stats = _do_test( + lambda fn: fn(x).backward(retain_graph=True), False + ) # In a regular backward with checkpoint, buffers get eagerly freed - checkpoint_non_retain_stats = _do_test(lambda fn: checkpoint(fn, x, use_reentrant=False).backward(), True) + checkpoint_non_retain_stats = _do_test( + lambda fn: checkpoint(fn, x, use_reentrant=False).backward(), True + ) # In a retain_grad backward with checkpoint, buffers get eagerly freed - checkpoint_retain_stats = _do_test(lambda fn: checkpoint(fn, x, use_reentrant=False).backward(retain_graph=True), True) + checkpoint_retain_stats = _do_test( + lambda fn: checkpoint(fn, x, use_reentrant=False).backward( + retain_graph=True + ), + True, + ) self.assertEqual(non_retain_stats, checkpoint_non_retain_stats) self.assertEqual(non_retain_stats, checkpoint_retain_stats) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") def test_get_device_states_recursive(self): - inp = {'foo' : torch.rand(10, device="cuda:0"), 'bar': [torch.rand(10, device="cuda:1")]} + inp = { + "foo": torch.rand(10, device="cuda:0"), + "bar": [torch.rand(10, device="cuda:1")], + } device_ids, device_states = get_device_states(inp) self.assertEqual(2, len(device_ids)) self.assertEqual(2, len(device_states)) @@ -473,7 +513,7 @@ def test_get_device_states_recursive(self): self.assertTrue(isinstance(device_states[1], torch.Tensor)) def test_infer_device_state_recursive_meta(self): - inp = {'foo' : torch.rand(10, device="meta")} + inp = {"foo": torch.rand(10, device="meta")} device_type = _infer_device_type(inp) self.assertEqual("meta", device_type) @@ -481,19 +521,28 @@ def test_infer_device_state_recursive_meta(self): def test_infer_device_state_recursive_multi_cuda(self): # Check that no warning is issued for either cuda:0, cuda:1 or # cuda:0, cuda:0 cases since they are both the same device type - inp = {'foo' : torch.rand(10, device="cuda:0"), 'bar': [torch.rand(10, device="cuda:1")]} + inp = { + "foo": torch.rand(10, device="cuda:0"), + "bar": [torch.rand(10, device="cuda:1")], + } with warnings.catch_warnings(): warnings.simplefilter("error") device_type = _infer_device_type(inp) self.assertEqual("cuda", device_type) - inp = {'foo' : torch.rand(10, device="cuda:0"), 'bar': [torch.rand(10, device="cuda:0")]} + inp = { + "foo": torch.rand(10, device="cuda:0"), + "bar": [torch.rand(10, device="cuda:0")], + } with warnings.catch_warnings(): warnings.simplefilter("error") device_type = _infer_device_type(inp) self.assertEqual("cuda", device_type) # Check that a warning is issued for cuda:0, meta and that it includes # device type information - inp = {'foo' : torch.rand(10, device="cuda:0"), 'bar': [torch.rand(10, device="meta")]} + inp = { + "foo": torch.rand(10, device="cuda:0"), + "bar": [torch.rand(10, device="meta")], + } with warnings.catch_warnings(record=True) as w: device_type = _infer_device_type(inp) self.assertEqual("cuda", device_type) @@ -503,7 +552,7 @@ def test_infer_device_state_recursive_multi_cuda(self): "Tensor arguments, excluding CPU tensors, are detected on at least two types of devices" in warning_msg ) - self.assertTrue("Device types: 
[\'cuda\', \'meta\']" in warning_msg) + self.assertTrue("Device types: ['cuda', 'meta']" in warning_msg) self.assertTrue("first device type: cuda" in warning_msg) @@ -517,11 +566,13 @@ def setUp(self): def test_random_seed(self): def run(): - dataloader = torch.utils.data.DataLoader(RandomDatasetMock(), - batch_size=2, - num_workers=4, - shuffle=True, - timeout=self.MAX_TIMEOUT_IN_SECOND) + dataloader = torch.utils.data.DataLoader( + RandomDatasetMock(), + batch_size=2, + num_workers=4, + shuffle=True, + timeout=self.MAX_TIMEOUT_IN_SECOND, + ) return next(iter(dataloader)) torch.manual_seed(2018) @@ -534,37 +585,47 @@ def test_single_keep(self): # self.dataset is a Tensor here; technically not a valid input because # not a Dataset subclass, but needs to stay working so add ignore's # for type checking with mypy - dataloader : DataLoader = DataLoader(self.dataset, # type: ignore[arg-type] - batch_size=self.batch_size, - num_workers=0, - drop_last=False) + dataloader: DataLoader = DataLoader( + self.dataset, # type: ignore[arg-type] + batch_size=self.batch_size, + num_workers=0, + drop_last=False, + ) dataiter = iter(dataloader) self.assertEqual(len(list(dataiter)), 2) def test_single_drop(self): - dataloader : DataLoader = DataLoader(self.dataset, # type: ignore[arg-type] - batch_size=self.batch_size, - num_workers=0, - drop_last=True) + dataloader: DataLoader = DataLoader( + self.dataset, # type: ignore[arg-type] + batch_size=self.batch_size, + num_workers=0, + drop_last=True, + ) dataiter = iter(dataloader) self.assertEqual(len(list(dataiter)), 1) - @unittest.skip("FIXME: Intermittent CUDA out-of-memory error on Windows and time-out under ASAN") + @unittest.skip( + "FIXME: Intermittent CUDA out-of-memory error on Windows and time-out under ASAN" + ) def test_multi_keep(self): - dataloader : DataLoader = DataLoader(self.dataset, # type: ignore[arg-type] - batch_size=self.batch_size, - num_workers=2, - drop_last=False, - timeout=self.MAX_TIMEOUT_IN_SECOND) + dataloader: DataLoader = DataLoader( + self.dataset, # type: ignore[arg-type] + batch_size=self.batch_size, + num_workers=2, + drop_last=False, + timeout=self.MAX_TIMEOUT_IN_SECOND, + ) dataiter = iter(dataloader) self.assertEqual(len(list(dataiter)), 2) def test_multi_drop(self): - dataloader : DataLoader = DataLoader(self.dataset, # type: ignore[arg-type] - batch_size=self.batch_size, - num_workers=2, - drop_last=True, - timeout=self.MAX_TIMEOUT_IN_SECOND) + dataloader: DataLoader = DataLoader( + self.dataset, # type: ignore[arg-type] + batch_size=self.batch_size, + num_workers=2, + drop_last=True, + timeout=self.MAX_TIMEOUT_IN_SECOND, + ) dataiter = iter(dataloader) self.assertEqual(len(list(dataiter)), 1) @@ -572,14 +633,20 @@ def test_multi_drop(self): test_dir = os.path.abspath(os.path.dirname(str(__file__))) -@unittest.skipIf('SKIP_TEST_BOTTLENECK' in os.environ.keys(), 'SKIP_TEST_BOTTLENECK is set') +@unittest.skipIf( + "SKIP_TEST_BOTTLENECK" in os.environ.keys(), "SKIP_TEST_BOTTLENECK is set" +) class TestBottleneck(TestCase): def _run(self, command, timeout=30): """Returns (return-code, stdout, stderr)""" import subprocess - p = subprocess.Popen(command, stdout=subprocess.PIPE, # noqa: P204 - stderr=subprocess.PIPE, shell=True) + p = subprocess.Popen( + command, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True, + ) try: output, err = p.communicate(timeout=timeout) except subprocess.TimeoutExpired: @@ -590,67 +657,108 @@ def _run(self, command, timeout=30): err_str = err.decode("ascii") return (rc, output_str, 
err_str) - def _run_bottleneck(self, test_file, scriptargs=''): + def _run_bottleneck(self, test_file, scriptargs=""): curdir = os.path.dirname(os.path.abspath(__file__)) - filepath = f'{curdir}/{test_file}' - if scriptargs != '': - scriptargs = f' {scriptargs}' + filepath = f"{curdir}/{test_file}" + if scriptargs != "": + scriptargs = f" {scriptargs}" rc, out, err = self._run( - f'{sys.executable} -m torch.utils.bottleneck {filepath}{scriptargs}') + f"{sys.executable} -m torch.utils.bottleneck {filepath}{scriptargs}" + ) return rc, out, err def _check_run_args(self): # Check that this fails due to missing args - rc, out, err = self._run_bottleneck('bottleneck_test/test_args.py') - self.assertEqual(rc, 2, atol=0, rtol=0, msg=self._fail_msg('Missing args should error', out + err)) + rc, out, err = self._run_bottleneck("bottleneck_test/test_args.py") + self.assertEqual( + rc, + 2, + atol=0, + rtol=0, + msg=self._fail_msg("Missing args should error", out + err), + ) # This should succeed - rc, out, err = self._run_bottleneck('bottleneck_test/test_args.py', '--foo foo --bar bar') - self.assertEqual(rc, 0, atol=0, rtol=0, msg=self._fail_msg('Should pass args to script', out + err)) + rc, out, err = self._run_bottleneck( + "bottleneck_test/test_args.py", "--foo foo --bar bar" + ) + self.assertEqual( + rc, + 0, + atol=0, + rtol=0, + msg=self._fail_msg("Should pass args to script", out + err), + ) def _fail_msg(self, msg, output): - return f'{msg}, output was:\n{output}' + return f"{msg}, output was:\n{output}" def _check_environment_summary(self, output): - results = re.search('Environment Summary', output) - self.assertIsNotNone(results, self._fail_msg('Should have Environment Summary', output)) + results = re.search("Environment Summary", output) + self.assertIsNotNone( + results, self._fail_msg("Should have Environment Summary", output) + ) # Up to five lines away from the heading, there should be the version number - results = re.search(r'Environment Summary.*(\n.*){,5}\nPyTorch \d+\.\d+', output) - self.assertIsNotNone(results, self._fail_msg('Should have PyTorch version', output)) + results = re.search( + r"Environment Summary.*(\n.*){,5}\nPyTorch \d+\.\d+", output + ) + self.assertIsNotNone( + results, self._fail_msg("Should have PyTorch version", output) + ) def _check_cprof_summary(self, output): - results = re.search('cProfile output', output) - self.assertIsNotNone(results, self._fail_msg('Should have cProfile output', output)) + results = re.search("cProfile output", output) + self.assertIsNotNone( + results, self._fail_msg("Should have cProfile output", output) + ) # This assumes that after the cProfile output section we have # the autograd profiler output - results = re.search(r'cProfile output.*(\n.*){6,50}\n.*autograd profiler output', output) - self.assertIsNotNone(results, self._fail_msg( - 'Distance between cProfile and autograd prof out not in [6, 50] lines', output)) + results = re.search( + r"cProfile output.*(\n.*){6,50}\n.*autograd profiler output", output + ) + self.assertIsNotNone( + results, + self._fail_msg( + "Distance between cProfile and autograd prof out not in [6, 50] lines", + output, + ), + ) def _check_autograd_summary(self, output): - results = re.search('autograd profiler output', output) - self.assertIsNotNone(results, self._fail_msg('Should have autograd profiler output', output)) + results = re.search("autograd profiler output", output) + self.assertIsNotNone( + results, self._fail_msg("Should have autograd profiler output", output) + ) # This assumes that 
after the autograd profiler output is the end of the # output. - results = re.search(r'autograd profiler output.*(\n.*){6,100}', output) - self.assertIsNotNone(results, self._fail_msg( - 'Distance between autograd prof output and end of output not in [6, 100] lines', output)) + results = re.search(r"autograd profiler output.*(\n.*){6,100}", output) + self.assertIsNotNone( + results, + self._fail_msg( + "Distance between autograd prof output and end of output not in [6, 100] lines", + output, + ), + ) def _check_cuda(self, output): if HAS_CUDA: - results = re.search('CUDA mode', output) - self.assertIsNotNone(results, self._fail_msg('Should tell users CUDA', output)) + results = re.search("CUDA mode", output) + self.assertIsNotNone( + results, self._fail_msg("Should tell users CUDA", output) + ) else: - results = re.search('CUDA mode', output) - self.assertIsNone(results, self._fail_msg('Should not tell users about CUDA', output)) + results = re.search("CUDA mode", output) + self.assertIsNone( + results, self._fail_msg("Should not tell users about CUDA", output) + ) - @unittest.skipIf(HAS_CUDA, 'CPU-only test') + @unittest.skipIf(HAS_CUDA, "CPU-only test") def test_bottleneck_cpu_only(self): - rc, out, err = self._run_bottleneck('bottleneck_test/test.py') - self.assertEqual(rc, 0, msg=f'Run failed with\n{err}') + rc, out, err = self._run_bottleneck("bottleneck_test/test.py") + self.assertEqual(rc, 0, msg=f"Run failed with\n{err}") self._check_run_args() self._check_environment_summary(out) @@ -658,10 +766,10 @@ def test_bottleneck_cpu_only(self): self._check_cprof_summary(out) self._check_cuda(out) - @unittest.skipIf(not HAS_CUDA, 'No CUDA') + @unittest.skipIf(not HAS_CUDA, "No CUDA") def test_bottleneck_cuda(self): - rc, out, err = self._run_bottleneck('bottleneck_test/test_cuda.py') - self.assertEqual(rc, 0, msg=f'Run failed with\n{err}') + rc, out, err = self._run_bottleneck("bottleneck_test/test_cuda.py") + self.assertEqual(rc, 0, msg=f"Run failed with\n{err}") self._check_run_args() self._check_environment_summary(out) @@ -677,7 +785,7 @@ def test_bottleneck_cuda(self): class TestCollectEnv(TestCase): def test_smoke(self): info_output = get_pretty_env_info() - self.assertTrue(info_output.count('\n') >= 17) + self.assertTrue(info_output.count("\n") >= 17) class TestONNXUtils(TestCase): @@ -688,7 +796,6 @@ def test_prepare_onnx_paddings(self): self.assertEqual(paddings, [0, 3, 1, 0, 4, 2]) def test_check_onnx_broadcast(self): - def try_check_onnx_broadcast(dims1, dims2, expect_broadcast, expect_fail): broadcast = True fail = False @@ -741,7 +848,6 @@ def try_check_onnx_broadcast(dims1, dims2, expect_broadcast, expect_fail): class TestHipify(TestCase): - def test_import_hipify(self): from torch.utils.hipify import hipify_python # noqa: F401 @@ -774,15 +880,19 @@ def test_quote_escape(self): self.assertEqual(self.trie.quote(orig_chars[i]), quoted_strs[i]) def test_export_trie_to_regex(self): - words_to_add = ["__CUDACC__", "CUDA_ERROR_CONTEXT_ALREADY_CURRENT", "CUDA_ERROR_ARRAY_IS_MAPPED", - "CUDA_ERROR_NOT_MAPPED", "CUDA_ERROR_INVALID_SOURCE"] + words_to_add = [ + "__CUDACC__", + "CUDA_ERROR_CONTEXT_ALREADY_CURRENT", + "CUDA_ERROR_ARRAY_IS_MAPPED", + "CUDA_ERROR_NOT_MAPPED", + "CUDA_ERROR_INVALID_SOURCE", + ] for word in words_to_add: self.trie.add(word) regex = self.trie.export_to_regex() expected_regex = r"(?:CUDA_ERROR_(?:ARRAY_IS_MAPPED|CONTEXT_ALREADY_CURRENT|INVALID_SOURCE|NOT_MAPPED)|__CUDACC__)" self.assertEqual(regex, expected_regex) - def 
test_prefix_words_export_trie_to_regex(self): # test case where some nodes have both children and are also leaf nodes. words_to_add = ["apple", "app", "ban", "banana"] @@ -800,7 +910,6 @@ def test_single_export_trie_to_regex(self): expected_regex = "cudaErrorInvalidMemcpyDirection" self.assertEqual(regex, expected_regex) - def test_char_export_trie_to_regex(self): self.trie.add("a") self.assertEqual(self.trie.export_to_regex(), "a") @@ -811,6 +920,7 @@ def test_special_char_export_trie_to_regex(self): self.trie.add(r"c*") self.assertEqual(self.trie.export_to_regex(), r"c\*") + class TestAssert(TestCase): def test_assert_true(self): # verify assertions work as expected @@ -845,14 +955,16 @@ def test_load_standalone(self): build_dir = tempfile.mkdtemp() try: src_path = os.path.join(build_dir, "main.cpp") - src = textwrap.dedent("""\ + src = textwrap.dedent( + """\ #include #include int main() { auto x = torch::eye(3); std::cout << x << std::endl; } - """) + """ + ) with open(src_path, "w") as f: f.write(src) @@ -866,8 +978,7 @@ def test_load_standalone(self): ext = ".exe" if IS_WINDOWS else "" self.assertEqual( - exec_path, - os.path.join(build_dir, f"standalone_load_test{ext}") + exec_path, os.path.join(build_dir, f"standalone_load_test{ext}") ) for shell in [True, False]: @@ -880,12 +991,14 @@ def test_load_standalone(self): self.assertEqual( # Windows prints "\r\n" for newlines. textwrap.dedent(r.stdout.decode("utf-8")).replace("\r\n", "\n"), - textwrap.dedent("""\ + textwrap.dedent( + """\ 1 0 0 0 1 0 0 0 1 [ CPUFloatType{3,3} ] - """) + """ + ), ) finally: @@ -930,30 +1043,30 @@ def tearDown(self): def test_external_module_register(self): # Built-in module with self.assertRaisesRegex(RuntimeError, "The runtime module of"): - torch._register_device_module('cuda', torch.cuda) + torch._register_device_module("cuda", torch.cuda) # Wrong device type with self.assertRaisesRegex(RuntimeError, "Expected one of cpu"): - torch._register_device_module('dummmy', DummyPrivateUse1Module) + torch._register_device_module("dummmy", DummyPrivateUse1Module) with self.assertRaises(AttributeError): torch.privateuseone.is_available() # type: ignore[attr-defined] - torch._register_device_module('privateuseone', DummyPrivateUse1Module) + torch._register_device_module("privateuseone", DummyPrivateUse1Module) torch.privateuseone.is_available() # type: ignore[attr-defined] # No supporting for override with self.assertRaisesRegex(RuntimeError, "The runtime module of"): - torch._register_device_module('privateuseone', DummyPrivateUse1Module) + torch._register_device_module("privateuseone", DummyPrivateUse1Module) def test_external_module_register_with_renamed_backend(self): - torch.utils.rename_privateuse1_backend('foo') + torch.utils.rename_privateuse1_backend("foo") with self.assertRaisesRegex(RuntimeError, "has already been set"): - torch.utils.rename_privateuse1_backend('dummmy') + torch.utils.rename_privateuse1_backend("dummmy") custom_backend_name = torch._C._get_privateuse1_backend_name() - self.assertEqual(custom_backend_name, 'foo') + self.assertEqual(custom_backend_name, "foo") with self.assertRaises(AttributeError): torch.foo.is_available() # type: ignore[attr-defined] @@ -961,65 +1074,69 @@ def test_external_module_register_with_renamed_backend(self): with self.assertRaisesRegex(AssertionError, "Tried to use AMP with the"): with torch.autocast(device_type=custom_backend_name): pass - torch._register_device_module('foo', DummyPrivateUse1Module) + torch._register_device_module("foo", 
DummyPrivateUse1Module) torch.foo.is_available() # type: ignore[attr-defined] with torch.autocast(device_type=custom_backend_name): pass - self.assertEqual(torch._utils._get_device_index('foo:1'), 1) + self.assertEqual(torch._utils._get_device_index("foo:1"), 1) self.assertEqual(torch._utils._get_device_index(torch.device("foo:2")), 2) + class TestRenderUtils(TestCase): def test_basic(self): self.assertExpectedInline( - torch._utils.render_call(torch.sum, [torch.randn(100)], {'dim': 0}), - '''torch.sum(tensor([...], size=(100,)), dim=0)''' + torch._utils.render_call(torch.sum, [torch.randn(100)], {"dim": 0}), + """torch.sum(tensor([...], size=(100,)), dim=0)""", ) self.assertExpectedInline( - torch._utils.render_call(torch.sum, [torch.randn(100, 100)], {'dim': 0}), - '''torch.sum(tensor([...], size=(100, 100)), dim=0)''' + torch._utils.render_call(torch.sum, [torch.randn(100, 100)], {"dim": 0}), + """torch.sum(tensor([...], size=(100, 100)), dim=0)""", ) + class TestDeviceUtils(TestCase): def test_basic(self): - with torch.device('meta') as dev: + with torch.device("meta") as dev: x = torch.empty(3, 3) - self.assertEqual(x.device.type, 'meta') - self.assertEqual(dev, torch.device('meta')) + self.assertEqual(x.device.type, "meta") + self.assertEqual(dev, torch.device("meta")) def test_decorator(self): - @set_device('meta') + @set_device("meta") def f(): return torch.empty(3, 3) - self.assertEqual(f().device.type, 'meta') + + self.assertEqual(f().device.type, "meta") def test_decorator_generator(self): - @set_device('meta') + @set_device("meta") def f(): yield torch.empty(3, 3) yield torch.empty(3, 3) + r1, r2 = list(f()) - self.assertEqual(r1.device.type, 'meta') - self.assertEqual(r2.device.type, 'meta') + self.assertEqual(r1.device.type, "meta") + self.assertEqual(r2.device.type, "meta") def test_nn_module(self): - with torch.device('meta'): + with torch.device("meta"): m = nn.Linear(40, 50) - self.assertEqual(m.weight.device.type, 'meta') + self.assertEqual(m.weight.device.type, "meta") def test_set_default_device(self): try: - torch.set_default_device('meta') + torch.set_default_device("meta") r = torch.empty(2, 2) finally: torch.set_default_device(None) - self.assertEqual(r.device.type, 'meta') + self.assertEqual(r.device.type, "meta") def test_get_default_device(self): - torch.set_default_device('meta') - self.assertEqual(torch.get_default_device().type, 'meta') + torch.set_default_device("meta") + self.assertEqual(torch.get_default_device().type, "meta") torch.set_default_device(None) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") @@ -1048,7 +1165,7 @@ def test_device_mode_ops(self, device, dtype, op): # very incomplete if tree_any( lambda x: isinstance(x, torch.Tensor), - (sample.input, sample.args, sample.kwargs) + (sample.input, sample.args, sample.kwargs), ): continue # Many OpInfos will explicitly pass in a device. DeviceContext @@ -1057,11 +1174,11 @@ def test_device_mode_ops(self, device, dtype, op): # NB: Can't pass None to sample_inputs, the function can't # handle it. 
kwargs = sample.kwargs.copy() - kwargs.pop('device', None) - with torch.device('meta'): + kwargs.pop("device", None) + with torch.device("meta"): r = func(sample.input, *sample.args, **kwargs) self.assertTrue( - tree_all_only(torch.Tensor, lambda x: x.device.type == 'meta', r) + tree_all_only(torch.Tensor, lambda x: x.device.type == "meta", r) ) @@ -1070,22 +1187,22 @@ def test_device_mode_ops(self, device, dtype, op): class TestCppExtensionUtils(TestCase): def test_cpp_compiler_is_ok(self): - self.assertTrue(torch.utils.cpp_extension.check_compiler_ok_for_platform('c++')) + self.assertTrue(torch.utils.cpp_extension.check_compiler_ok_for_platform("c++")) def test_cc_compiler_is_ok(self): - self.assertTrue(torch.utils.cpp_extension.check_compiler_ok_for_platform('cc')) + self.assertTrue(torch.utils.cpp_extension.check_compiler_ok_for_platform("cc")) class TestTraceback(TestCase): def test_basic(self): - source = '''\ + source = """\ def f(x): def g(x): raise RuntimeError # HEYA x = x * 3 return g(x) + 1 -''' +""" out: Dict[str, Any] = {} scope = {"__compile_source__": source} @@ -1095,29 +1212,36 @@ def g(x): with report_compile_source_on_error(): out["f"](1) except RuntimeError as e: - self.assertIn("HEYA", ''.join(traceback.format_tb(e.__traceback__))) + self.assertIn("HEYA", "".join(traceback.format_tb(e.__traceback__))) def test_format_traceback_short(self): try: raise RuntimeError except RuntimeError as e: - self.assertRegex(format_traceback_short(e.__traceback__), r'.*test_utils.py:\d+ in test_format_traceback_short') + self.assertRegex( + format_traceback_short(e.__traceback__), + r".*test_utils.py:\d+ in test_format_traceback_short", + ) def test_captured_traceback(self): - self.assertIn('test_captured_traceback', ''.join(CapturedTraceback.extract().format())) + self.assertIn( + "test_captured_traceback", "".join(CapturedTraceback.extract().format()) + ) def test_captured_traceback_format_all(self): - rs = CapturedTraceback.format_all([CapturedTraceback.extract(), CapturedTraceback.extract()]) + rs = CapturedTraceback.format_all( + [CapturedTraceback.extract(), CapturedTraceback.extract()] + ) self.assertEqual(len(rs), 2) - self.assertIn('test_captured_traceback_format_all', ''.join(rs[0])) + self.assertIn("test_captured_traceback_format_all", "".join(rs[0])) def test_captured_traceback_format_all_cached(self): tb = CapturedTraceback.extract() tb.format() # cached rs = CapturedTraceback.format_all([tb, CapturedTraceback.extract()]) self.assertEqual(len(rs), 2) - self.assertIn('test_captured_traceback_format_all', ''.join(rs[0])) + self.assertIn("test_captured_traceback_format_all", "".join(rs[0])) -if __name__ == '__main__': +if __name__ == "__main__": run_tests() diff --git a/third_party/miniz-2.1.0/miniz.c b/third_party/miniz-2.1.0/miniz.c index 4b5d53f817216..dc790d9e36b7c 100755 --- a/third_party/miniz-2.1.0/miniz.c +++ b/third_party/miniz-2.1.0/miniz.c @@ -6250,6 +6250,7 @@ mz_bool mz_zip_writer_add_mem_ex_v2(mz_zip_archive *pZip, const char *pArchive_n mz_uint32 extra_size = 0; mz_uint8 extra_data[MZ_ZIP64_MAX_CENTRAL_EXTRA_FIELD_SIZE]; mz_uint16 bit_flags = 0; + mz_bool write_metadata_only = buf_size && !pBuf; if ((int)level_and_flags < 0) level_and_flags = MZ_DEFAULT_LEVEL; @@ -6263,7 +6264,7 @@ mz_bool mz_zip_writer_add_mem_ex_v2(mz_zip_archive *pZip, const char *pArchive_n level = level_and_flags & 0xF; store_data_uncompressed = ((!level) || (level_and_flags & MZ_ZIP_FLAG_COMPRESSED_DATA)); - if ((!pZip) || (!pZip->m_pState) || (pZip->m_zip_mode != MZ_ZIP_MODE_WRITING) || 
((buf_size) && (!pBuf)) || (!pArchive_name) || ((comment_size) && (!pComment)) || (level > MZ_UBER_COMPRESSION)) + if ((!pZip) || (!pZip->m_pState) || (pZip->m_zip_mode != MZ_ZIP_MODE_WRITING) || (!pArchive_name) || ((comment_size) && (!pComment)) || (level > MZ_UBER_COMPRESSION)) return mz_zip_set_error(pZip, MZ_ZIP_INVALID_PARAMETER); pState = pZip->m_pState; @@ -6308,7 +6309,9 @@ mz_bool mz_zip_writer_add_mem_ex_v2(mz_zip_archive *pZip, const char *pArchive_n if (!(level_and_flags & MZ_ZIP_FLAG_COMPRESSED_DATA)) { - uncomp_crc32 = (mz_uint32)mz_crc32(MZ_CRC32_INIT, (const mz_uint8 *)pBuf, buf_size); + if (!write_metadata_only) { + uncomp_crc32 = (mz_uint32)mz_crc32(MZ_CRC32_INIT, (const mz_uint8 *)pBuf, buf_size); + } uncomp_size = buf_size; if (uncomp_size <= 3) { @@ -6330,8 +6333,8 @@ mz_bool mz_zip_writer_add_mem_ex_v2(mz_zip_archive *pZip, const char *pArchive_n if (!pState->m_zip64) { /* Bail early if the archive would obviously become too large */ - if ((pZip->m_archive_size + num_alignment_padding_bytes + MZ_ZIP_LOCAL_DIR_HEADER_SIZE + archive_name_size - + MZ_ZIP_CENTRAL_DIR_HEADER_SIZE + archive_name_size + comment_size + user_extra_data_len + + if ((pZip->m_archive_size + num_alignment_padding_bytes + MZ_ZIP_LOCAL_DIR_HEADER_SIZE + archive_name_size + + MZ_ZIP_CENTRAL_DIR_HEADER_SIZE + archive_name_size + comment_size + user_extra_data_len + pState->m_central_dir.m_size + MZ_ZIP_END_OF_CENTRAL_DIR_HEADER_SIZE + user_extra_data_central_len + MZ_ZIP_DATA_DESCRIPTER_SIZE32) > 0xFFFFFFFF) { diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index efc37aee123d8..776125e84a7f1 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -2733,6 +2733,7 @@ - name: miopen_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon) -> (Tensor, Tensor, Tensor) input, weight, bias: "grad.defined() ? (training ? miopen_batch_norm_backward(input, grad.contiguous(), weight, running_mean, running_var, result1, result2, epsilon) : native_batch_norm_backward(grad, input, weight, running_mean, running_var, result1, result2, training, epsilon, grad_input_mask)) : std::tuple()" + result0: batch_norm_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, running_mean, running_var, result1, result2, training, epsilon) - name: miopen_batch_norm_backward(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? 
save_var, float epsilon) -> (Tensor, Tensor, Tensor) save_mean: not_implemented("miopen_batch_norm_backward save_mean") diff --git a/tools/autograd/templates/Functions.h b/tools/autograd/templates/Functions.h index 437a5e8e89889..1780df8edaab7 100644 --- a/tools/autograd/templates/Functions.h +++ b/tools/autograd/templates/Functions.h @@ -22,7 +22,7 @@ using at::ArrayRef; using at::Type; using at::TensorGeometry; using at::ScalarType; -using c10::optional; +using std::optional; using c10::fmap; inline std::vector unpack_list(at::ArrayRef xs, std::shared_ptr saved_for = nullptr) { @@ -34,12 +34,12 @@ inline std::vector unpack_list(at::ArrayRef xs, std::shar }); } -inline c10::List> unpack_opt_list(at::ArrayRef xs, std::shared_ptr saved_for = nullptr) { - torch::List> result; +inline c10::List> unpack_opt_list(at::ArrayRef xs, std::shared_ptr saved_for = nullptr) { + torch::List> result; result.reserve(xs.size()); for (const SavedVariable& v : xs) { auto var = v.unpack(saved_for); - result.push_back(var.defined() ? c10::optional(var) : c10::nullopt); + result.push_back(var.defined() ? std::optional(var) : c10::nullopt); } return result; } diff --git a/tools/autograd/templates/VariableType.h b/tools/autograd/templates/VariableType.h index 065812694cfe4..08da173f94bf8 100644 --- a/tools/autograd/templates/VariableType.h +++ b/tools/autograd/templates/VariableType.h @@ -42,7 +42,7 @@ using at::Quantizer; // we'll remove them when we are actually exposing Quantizer class // to frontend using ConstQuantizerPtr = const c10::intrusive_ptr&; -using c10::optional; +using std::optional; namespace VariableType { TORCH_API std::vector allCUDATypes(); diff --git a/tools/autograd/templates/ViewFuncs.h b/tools/autograd/templates/ViewFuncs.h index faf5ab6881f18..1f69c062d344e 100644 --- a/tools/autograd/templates/ViewFuncs.h +++ b/tools/autograd/templates/ViewFuncs.h @@ -20,7 +20,7 @@ using at::IntArrayRef; using at::ArrayRef; using at::Type; using at::ScalarType; -using c10::optional; +using std::optional; using c10::fmap; ${view_func_declarations} diff --git a/tools/autograd/templates/python_variable_methods.cpp b/tools/autograd/templates/python_variable_methods.cpp index 437ea23d079bf..242adcd205336 100644 --- a/tools/autograd/templates/python_variable_methods.cpp +++ b/tools/autograd/templates/python_variable_methods.cpp @@ -397,7 +397,7 @@ static PyObject * THPVariable_invert(PyObject* self, PyObject* args) { END_HANDLE_TH_ERRORS } -static Tensor dispatch_to(const Tensor & self, Device device, bool non_blocking, bool copy, c10::optional optional_memory_format) { +static Tensor dispatch_to(const Tensor & self, Device device, bool non_blocking, bool copy, std::optional optional_memory_format) { pybind11::gil_scoped_release no_gil; // NOTE: this is where we record aten::to in the graph during tracing. 
However, the behavior of aten::to // is different with respect to TensorOptions fields that are not present: aten::to inherits fields that @@ -407,18 +407,18 @@ static Tensor dispatch_to(const Tensor & self, Device device, bool non_blocking, return self.to(self.options().device(device).memory_format(optional_memory_format), non_blocking, copy); } -static Tensor dispatch_to(const Tensor & self, bool non_blocking, bool copy, c10::optional optional_memory_format) { +static Tensor dispatch_to(const Tensor & self, bool non_blocking, bool copy, std::optional optional_memory_format) { pybind11::gil_scoped_release no_gil; return self.to(self.options().memory_format(optional_memory_format), non_blocking, copy); } -static Tensor dispatch_to(const Tensor & self, ScalarType dtype, bool non_blocking, bool copy, c10::optional optional_memory_format) { +static Tensor dispatch_to(const Tensor & self, ScalarType dtype, bool non_blocking, bool copy, std::optional optional_memory_format) { pybind11::gil_scoped_release no_gil; // TODO: Make this call the TensorOptions version, maybe? return self.to(dtype, non_blocking, copy, optional_memory_format); } -static Tensor dispatch_to(const Tensor & self, Device device, ScalarType dtype, bool non_blocking, bool copy, c10::optional optional_memory_format) { +static Tensor dispatch_to(const Tensor & self, Device device, ScalarType dtype, bool non_blocking, bool copy, std::optional optional_memory_format) { pybind11::gil_scoped_release no_gil; // TODO: Make this call the TensorOptions version, maybe? return self.to(device, dtype, non_blocking, copy, optional_memory_format); @@ -546,7 +546,7 @@ static PyObject * THPVariable_ipu(PyObject* self, PyObject* args, PyObject* kwar END_HANDLE_TH_ERRORS } -static PyObject * THPVariable_to_type(PyObject* self, ScalarType scalarType, c10::optional optional_memory_format) { +static PyObject * THPVariable_to_type(PyObject* self, ScalarType scalarType, std::optional optional_memory_format) { HANDLE_TH_ERRORS auto& self_ = THPVariable_Unpack(self); return THPVariable_Wrap(dispatch_to(self_, scalarType, false, false, optional_memory_format)); diff --git a/tools/testing/test_selections.py b/tools/testing/test_selections.py index 311eac59eb283..3e43edd502475 100644 --- a/tools/testing/test_selections.py +++ b/tools/testing/test_selections.py @@ -12,7 +12,7 @@ IS_MEM_LEAK_CHECK = os.getenv("PYTORCH_TEST_CUDA_MEM_LEAK_CHECK", "0") == "1" BUILD_ENVIRONMENT = os.getenv("BUILD_ENVIRONMENT", "") -USE_3_PROCS = "sm86" in BUILD_ENVIRONMENT +USE_3_PROCS = "sm86" in BUILD_ENVIRONMENT or "cuda" not in BUILD_ENVIRONMENT # NUM_PROCS_FOR_SHARDING_CALC must remain consistent across all shards of a job # to ensure that sharding is consistent, NUM_PROCS is the actual number of procs diff --git a/torch/_decomp/__init__.py b/torch/_decomp/__init__.py index 982544dfe6079..b277bb7eceb06 100644 --- a/torch/_decomp/__init__.py +++ b/torch/_decomp/__init__.py @@ -279,6 +279,7 @@ def core_aten_decompositions() -> Dict[torch._ops.OperatorBase, Callable]: aten.linalg_cross, aten.cudnn_batch_norm, aten.cudnn_batch_norm_backward, + aten.miopen_batch_norm_backward, aten.deg2rad, aten.deg2rad_, aten.detach, diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py index 6cfccbab0d04b..040fbc825becd 100644 --- a/torch/_decomp/decompositions.py +++ b/torch/_decomp/decompositions.py @@ -2319,6 +2319,32 @@ def native_batch_norm_backward_out( return grad_input +@register_decomposition(aten.miopen_batch_norm_backward) +@out_wrapper("out0", "out1", 
"out2") +def miopen_batch_norm_backward( + input: Tensor, + grad_output: Tensor, + weight: Tensor, + running_mean: Optional[Tensor], + running_var: Optional[Tensor], + save_mean: Optional[Tensor], + save_var: Optional[Tensor], + epsilon: float, +): + return aten.native_batch_norm_backward( + grad_output, + input, + weight, + running_mean, + running_var, + save_mean, + save_var, + True, + epsilon, + [True, True, True], + ) + + @register_decomposition(aten.cudnn_batch_norm_backward) @out_wrapper("out0", "out1", "out2") def cudnn_batch_norm_backward( diff --git a/torch/_decomp/decompositions_for_jvp.py b/torch/_decomp/decompositions_for_jvp.py index 81946c314638a..d430386ff3606 100644 --- a/torch/_decomp/decompositions_for_jvp.py +++ b/torch/_decomp/decompositions_for_jvp.py @@ -329,3 +329,4 @@ def batch_norm_backward( _register_jit_decomposition_for_jvp(torch.ops.aten.native_batch_norm_backward.default) _register_jit_decomposition_for_jvp(torch.ops.aten.cudnn_batch_norm_backward.default) _register_jit_decomposition_for_jvp(torch.ops.aten.batch_norm_backward.default) +_register_jit_decomposition_for_jvp(torch.ops.aten.miopen_batch_norm_backward.default) diff --git a/torch/_dynamo/__init__.py b/torch/_dynamo/__init__.py index b9c4fbfd7b6e4..98496b5fc5de5 100644 --- a/torch/_dynamo/__init__.py +++ b/torch/_dynamo/__init__.py @@ -62,7 +62,7 @@ # Wrap manual_seed with the disable decorator. # Can't do it at its implementation due to dependency issues. - torch.manual_seed = disable(torch.manual_seed) + torch.manual_seed = torch._disable_dynamo(torch.manual_seed) # Add the new manual_seed to the builtin registry. torch.jit._builtins._register_builtin(torch.manual_seed, "aten::manual_seed") diff --git a/torch/_dynamo/codegen.py b/torch/_dynamo/codegen.py index 0f4c0dad59bde..6dbd7f36b0b5d 100644 --- a/torch/_dynamo/codegen.py +++ b/torch/_dynamo/codegen.py @@ -136,6 +136,23 @@ def __call__(self, value, allow_cache=True): ) ) output.extend(create_call_function(2, True)) + elif ( + isinstance(value, SymNodeVariable) + and value.python_type() == float + and not self.tx.export + ): + # This is a little unusual; force the output convention to be a + # Tensor here. Don't do this for export because this is + # apparently load bearing for export tests (but I am a bit + # doubtful it actually works in the real world) + # NB: It works to add_graph_output on a computed expression + # as_tensor here, because we memoize as_tensor calls on + # SymNodeVariable! + graph_outputs_key = self.add_graph_output(value.as_tensor(self.tx)) + self.load_graph_output(graph_outputs[graph_outputs_key].index) + output.extend( + [self.create_load_attr("item")] + create_call_function(0, True) + ) elif isinstance( value, ( diff --git a/torch/_dynamo/config.py b/torch/_dynamo/config.py index f5a3978eb2ae8..498478a540991 100644 --- a/torch/_dynamo/config.py +++ b/torch/_dynamo/config.py @@ -54,6 +54,11 @@ def is_fbcode(): # to be dynamic, but accesses to ints should NOT get promoted into inputs. specialize_int = False +# Whether or not to specialize on float inputs. Dynamo will always promote +# float inputs into Tensor inputs, but at the moment, backends inconsistently +# support codegen on float (this is to be fixed). +specialize_float = True + # legacy config, does nothing now! dynamic_shapes = True @@ -232,7 +237,7 @@ def is_fbcode(): # false_fn produces code with identical guards. enforce_cond_guards_match = True -# Specify how to optimize a compiiled DDP module. 
The flag accepts a bollean +# Specify how to optimize a compiled DDP module. The flag accepts a boolean # value or a string. There are 4 modes. # 1. "ddp_optimizer" (or True): with "ddp_ptimizer", Dynamo will automatically # split model graph into pieces to match DDP bucket sizes to allow DDP diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py index 77447bc17dee1..38795341be216 100644 --- a/torch/_dynamo/convert_frame.py +++ b/torch/_dynamo/convert_frame.py @@ -1,10 +1,14 @@ +import base64 import collections +import cProfile import dis import functools import itertools import logging import os +import pstats import random +import subprocess import sys import threading import time @@ -12,8 +16,11 @@ import types import typing import weakref +from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Set +from torch._utils_internal import maybe_upload_prof_stats_to_manifold + from torch.fx._lazy_graph_module import ( # type: ignore[attr-defined] _use_lazy_graph_module, ) @@ -87,7 +94,6 @@ is_namedtuple, istype, LazyString, - maybe_cprofile, orig_code_map, record_compilation_metrics, reset_graph_break_dup_checker, @@ -286,6 +292,83 @@ def exception_handler(e, code, frame=None, export=False): FRAME_COMPILE_COUNTER: typing.Counter[int] = collections.Counter() +def maybe_cprofile(func): + if config.cprofile: + return cprofile_wrapper(func) + return func + + +def cprofile_wrapper(func): + @functools.wraps(func) + def profile_wrapper(*args, **kwargs): + trace_id = CompileContext.current_trace_id() + assert trace_id, "Trace id is None" + profile_path = Path( + f"/tmp/{func.__name__}_{str(trace_id).replace('/','_')}.profile" + ) + prof = cProfile.Profile() + prof.enable() + start_ts = time.time() + retval = prof.runcall(func, *args, **kwargs) + profile_latency = time.time() - start_ts + prof.disable() + log.info( + "### Cprofile for %s trace id [%s] took %.3f seconds ###", + func.__name__, + trace_id, + profile_latency, + ) + ps = pstats.Stats(prof) + try: + prof.dump_stats(profile_path) + except PermissionError: + log.info("Cannot write to %s", str(profile_path)) + svg_path = profile_path.with_suffix(".svg") + try: + gprof2dot_process = subprocess.Popen( + [ + "gprof2dot", + "-f", + "pstats", + "--node-label=total-time-percentage", + "--node-label=self-time-percentage", + "--node-label=total-time", + str(profile_path), + ], + stdout=subprocess.PIPE, + ) + subprocess.check_call( + ["dot", "-Tsvg", "-o", str(svg_path)], + stdin=gprof2dot_process.stdout, + ) + log.info("Generated SVG from profile at %s", str(svg_path)) + except FileNotFoundError: + log.info( + "Failed to generate SVG from profile -- dumping stats instead." 
+ "Try installing gprof2dot and dot for a better visualization" + ) + ps.sort_stats(pstats.SortKey.TIME).print_stats(20) + ps.sort_stats(pstats.SortKey.CUMULATIVE).print_stats(20) + + maybe_upload_prof_stats_to_manifold(str(profile_path)) # fb-only + + torch._logging.trace_structured( + "artifact", + lambda: { + "name": "dynamo_cprofile_prof", + "type": "prof", + "encoding": "base64", + }, + payload_fn=lambda: base64.encodebytes( + open(profile_path, "rb").read() + ).decode("ascii"), + ) + + return retval + + return profile_wrapper + + def convert_frame_assert( compiler_fn: CompilerFn, one_graph: bool = True, @@ -428,7 +511,6 @@ def register_bytecode_hook(hook: BytecodeHook) -> RemovableHandle: @compile_time_strobelight_meta(phase_name="_compile") @_use_lazy_graph_module(config.use_lazy_graph_module) -@maybe_cprofile def _compile( code: types.CodeType, globals: Dict[str, object], @@ -512,6 +594,7 @@ def transform(instructions, code_options): instructions[:] = remove_pointless_jumps(remove_dead_code(instructions)) @dynamo_timed(phase_name="entire_frame_compile") + @maybe_cprofile def compile_inner( code: types.CodeType, one_graph: bool, diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py index bb90d28421457..391bdfcf02020 100644 --- a/torch/_dynamo/eval_frame.py +++ b/torch/_dynamo/eval_frame.py @@ -150,7 +150,10 @@ def __init__(self, mod: torch.nn.Module, dynamo_ctx): def _initialize(self): # Do this stuff in constructor to lower overhead slightly - if isinstance(self._orig_mod.forward, types.MethodType) and trace_rules.check( + if isinstance(self.dynamo_ctx, DisableContext): + # No need to check trace rules + self.forward = self.dynamo_ctx(self._orig_mod.__call__) + elif isinstance(self._orig_mod.forward, types.MethodType) and trace_rules.check( self._orig_mod.forward ): # This may be a torch.nn.* instance in trace_rules.py which @@ -353,14 +356,9 @@ def get_compiler_config(): # User has wrapped the class with compile/disable decorator. Apply # disable to init/call method. cls_obj = fn - if isinstance(self, DisableContext): - # Disable on init is useful for reconstruction of bytecodes where we - # want to prevent Dynamo from tracing into the init function. Check - # test_reconstruction in test_model_output.py. - cls_obj.__init__ = self(cls_obj.__init__) cls_obj.__call__ = self(cls_obj.__call__) if issubclass(cls_obj, torch.nn.Module): - # NN module variable tracker directly inlines the _call_impl. Disable it. + # NN module variable tracker directly inlines the _call_impl. cls_obj._call_impl = self(cls_obj._call_impl) return cls_obj @@ -383,12 +381,8 @@ def get_compiler_config(): callback = self.callback - if isinstance(self, DisableContext): - is_jit_tracing = always_false - is_fx_tracing = always_false - else: - is_jit_tracing = torch._C._is_tracing - is_fx_tracing = torch.fx._symbolic_trace.is_fx_tracing + is_jit_tracing = torch._C._is_tracing + is_fx_tracing = torch.fx._symbolic_trace.is_fx_tracing @functools.wraps(fn) def _fn(*args, **kwargs): @@ -424,10 +418,7 @@ def _fn(*args, **kwargs): cleanup() # hooks to properly handle inlining - if isinstance(self, DisableContext): - _fn._torchdynamo_disable = True # type: ignore[attr-defined] - else: - _fn._torchdynamo_inline = fn # type: ignore[attr-defined] + _fn._torchdynamo_inline = fn # type: ignore[attr-defined] # Save the function pointer to find the original callable while nesting # of decorators. 
@@ -519,6 +510,53 @@ class DisableContext(_TorchDynamoContext): def __init__(self): super().__init__(callback=None) + def __call__(self, fn): + # Earlier this code was in the base class _TorchDynamoContext. But we + # moved it here to have better code organization. For disable, we just + # want the callback to be None. We don't have to check trace_rules or + # create any wrapper. + fn = innermost_fn(fn) + + if isinstance(fn, torch.nn.Module): + mod = fn + new_mod = OptimizedModule(mod, self) + new_mod._torchdynamo_orig_callable = mod.forward + return new_mod + + if inspect.isclass(fn): + # User has wrapped the class with compile/disable decorator. Apply + # disable to init/call method. + cls_obj = fn + # Disable on init is useful for reconstruction of bytecodes where we + # want to prevent Dynamo from tracing into the init function. Check + # test_reconstruction in test_model_output.py. + cls_obj.__init__ = self(cls_obj.__init__) + cls_obj.__call__ = self(cls_obj.__call__) + if issubclass(cls_obj, torch.nn.Module): + # NN module variable tracker directly inlines the _call_impl. Disable it. + cls_obj._call_impl = self(cls_obj._call_impl) + return cls_obj + + assert callable(fn) + + callback = self.callback + + @functools.wraps(fn) + def _fn(*args, **kwargs): + prior = set_eval_frame(callback) + try: + return fn(*args, **kwargs) + finally: + set_eval_frame(prior) + + _fn._torchdynamo_disable = True # type: ignore[attr-defined] + + # Save the function pointer to find the original callable while nesting + # of decorators. + _fn._torchdynamo_orig_callable = fn # type: ignore[attr-defined] + + return _fn + def _optimize_catch_errors( compile_fn, diff --git a/torch/_dynamo/guards.py b/torch/_dynamo/guards.py index 42353eca8bb23..0e714cb1a5428 100644 --- a/torch/_dynamo/guards.py +++ b/torch/_dynamo/guards.py @@ -259,6 +259,7 @@ def uninteresting_files(): "utils_device": torch.utils._device, "device": torch.device, "___from_numpy": from_numpy, + "___as_tensor": torch.as_tensor, "torch": torch, "inspect": inspect, } diff --git a/torch/_dynamo/output_graph.py b/torch/_dynamo/output_graph.py index 4606795bf677d..a1b63304fa897 100644 --- a/torch/_dynamo/output_graph.py +++ b/torch/_dynamo/output_graph.py @@ -1069,6 +1069,7 @@ def append_prefix_insts(): TensorWithTFOverrideVariable, ), ) + and not (isinstance(v, SymNodeVariable) and v.python_type() is float) for v in stack_values ) and all(isinstance(x, TensorVariable) for x in stack_values) @@ -1412,13 +1413,103 @@ def example_inputs(self) -> List[torch.Tensor]: return result def remove_unused_graphargs(self) -> None: + # NB: It's always OK to drop GraphArg for symbols that ended up being + # specialized. You don't even have to make a guard for it, because + # ShapeEnv produce_guards operates on tracked_fakes, which never gets + # pruned. That being said, you'll get marginally better generated + # guard code if you promote the guard into a Dynamo guard (since that + # allows for the guard to be done using C++ guards.) If we get + # ShapeEnv guards to go into C++ guards, this will stop being a thing + # though! 
+ assert self.should_exit + # Miniature DCE pass, but only for obviously trivial operations + def is_static_true(b_node: fx.node.Argument): + if b_node is True: + return True + if not isinstance(b_node, fx.Node): + return False + b = b_node.meta.get("example_value") + if b is None: + return False + if b is True: + return True + if ( + isinstance(b, torch.SymBool) + and (r := b.node.maybe_as_bool()) is not None + ): + return r + # TODO: We can also technically remove all cases when the input + # doesn't have unbacked inputs, since it's all in the ShapeEnv + return False + + def is_symnode_arg(a: fx.node.Argument): + from torch.fx.experimental.sym_node import SymTypes + + if isinstance(a, (int, float, bool)): + return True + if isinstance(a, fx.Node): + return isinstance(a.meta.get("example_value"), SymTypes) + return False + + # NB: We assume that you cannot do mutations on int/float/bool, + # because they are immutable types, and therefore is always safe to + # DCE. + def is_symnode_compute_node(node): + from torch.fx.experimental.sym_node import SymTypes + + if node.op != "call_function": + return False + # TODO: I don't think it's possible to have a bare int/float here? + if not isinstance(node.meta.get("example_value"), SymTypes): + return False + # TODO: This will bail here if you ever end up with a more complicated + # computation function, like sum(list_of_ints), even though it + # should be DCE'able + if not all(is_symnode_arg(a) for a in node.args): + return False + if not all(is_symnode_arg(a) for a in node.kwargs.values()): + return False + return True + + # NB: You could try to expand this to cover more cases by simply + # detecting whenever you have an int output, but this is a bit + # dangerous in case someone adds a function that returns an int but is + # mutating. So manually whitelist for now. 
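The helpers above, together with the accessor whitelist defined next, feed a reverse-order dead-code sweep over the Dynamo graph. A standalone sketch of the same pattern, written against plain torch.fx rather than OutputGraph (the traced function and the pure_targets set are illustrative only):

import operator
import torch
import torch.fx

def g(x):
    unused = x.size(0) + 1  # pure size computation whose result is never used
    return x * 2

gm = torch.fx.symbolic_trace(g)
pure_targets = {operator.add, operator.getitem}
for node in reversed(list(gm.graph.nodes)):
    # Walk backwards so that erasing a dead node can expose its inputs as dead too.
    if len(node.users) == 0 and (
        node.op == "get_attr"
        or (node.op == "call_method" and node.target == "size")
        or (node.op == "call_function" and node.target in pure_targets)
    ):
        gm.graph.erase_node(node)
gm.recompile()
print(gm.code)  # the unused size()/add nodes have been removed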
+ def is_accessor_node(node): + if ( + node.op == "call_method" + and isinstance(node.args[0].meta.get("example_value"), torch.Tensor) + and node.target in ["size", "stride", "storage_offset", "item"] + ): + return True + if node.op == "call_function" and node.target in [ + torch.ops.aten.sym_size, + torch.ops.aten.sym_size.default, + torch.ops.aten.sym_size.int, + torch.ops.aten.sym_stride, + torch.ops.aten.sym_stride.default, + torch.ops.aten.sym_stride.int, + torch.ops.aten.sym_storage_offset, + torch.ops.aten.sym_storage_offset.default, + ]: + return True + return False + for node in reversed(list(self.graph.nodes)): if len(list(node.users)) == 0: - if node.op == "get_attr": - self.remove_node(node) - elif node.op == "call_function" and node.target is operator.getitem: + if ( + node.op == "get_attr" + or (node.op == "call_function" and node.target is operator.getitem) + or ( + node.op == "call_function" + and node.target is torch._check + and is_static_true(node.args[0]) + ) + or is_symnode_compute_node(node) + or is_accessor_node(node) + ): self.remove_node(node) def placeholder_binds_symbol(node): diff --git a/torch/_dynamo/polyfill.py b/torch/_dynamo/polyfill.py index 18aaa067a3d28..6104da9311098 100644 --- a/torch/_dynamo/polyfill.py +++ b/torch/_dynamo/polyfill.py @@ -56,6 +56,13 @@ def list_cmp(op: Callable[[Any, Any], bool], left: Sequence[Any], right: Sequenc return op(len(left), len(right)) +def set_isdisjoint(set1, set2): + for x in set1: + if x in set2: + return False + return True + + def dropwhile(predicate, iterable): # dropwhile(lambda x: x<5, [1,4,6,4,1]) -> 6 4 1 iterable = iter(iterable) diff --git a/torch/_dynamo/source.py b/torch/_dynamo/source.py index cb42c7eb20344..33c464da5bd3f 100644 --- a/torch/_dynamo/source.py +++ b/torch/_dynamo/source.py @@ -560,6 +560,17 @@ def reconstruct(self, codegen): codegen.extend_output(create_call_function(1, True)) +# NB: We don't expect you to actually ever generate guards against this +# source, it is ephemeral +@dataclasses.dataclass(frozen=True) +class FloatTensorSource(ChainedSource): + def name(self) -> str: + return f"___as_tensor({self.base.name()})" + + def guard_source(self): + return self.base.guard_source() + + # This is a synthetic source that is associated with the singleton # shape env guard we always register for all frames. 
We get the actual # guard contents from the ambient ShapeEnv @@ -617,3 +628,7 @@ def is_from_defaults(source: Source): if isinstance(source, ChainedSource): return is_from_defaults(source.base) return False + + +def is_cell_contents(source: Source): + return isinstance(source, AttrSource) and source.member == "cell_contents" diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py index ff9438085c529..9c050d84a5eee 100644 --- a/torch/_dynamo/utils.py +++ b/torch/_dynamo/utils.py @@ -2,7 +2,6 @@ import collections import contextlib import copy -import cProfile import dataclasses import datetime import dis @@ -16,9 +15,7 @@ import math import operator import os -import pstats import re -import subprocess import sys import textwrap import threading @@ -28,7 +25,6 @@ import weakref from contextlib import contextmanager from functools import lru_cache, wraps -from pathlib import Path from types import MethodWrapperType from typing import ( Any, @@ -50,8 +46,6 @@ ValuesView, ) -from torch._utils_internal import maybe_upload_prof_stats_to_manifold - from ..utils.hooks import RemovableHandle try: @@ -135,63 +129,6 @@ def tabulate(rows, headers): ) -def maybe_cprofile(func): - if config.cprofile: - return cprofile_wrapper(func) - return func - - -def cprofile_wrapper(func): - @wraps(func) - def profile_wrapper(*args, **kwargs): - global timer_counter - profile_cnt = next(timer_counter) - profile_path = Path("/tmp/" + func.__name__ + f"{profile_cnt}.profile") - prof = cProfile.Profile() - prof.enable() - start_ts = time.time() - retval = prof.runcall(func, *args, **kwargs) - profile_latency = time.time() - start_ts - prof.disable() - print( - f"### Cprofile for {func.__name__} iter {profile_cnt} took {profile_latency:.3f} seconds ###" - ) - ps = pstats.Stats(prof) - prof.dump_stats(profile_path) - svg_path = profile_path.with_suffix(".svg") - try: - gprof2dot_process = subprocess.Popen( - [ - "gprof2dot", - "-f", - "pstats", - "--node-label=total-time-percentage", - "--node-label=self-time-percentage", - "--node-label=total-time", - str(profile_path), - ], - stdout=subprocess.PIPE, - ) - subprocess.check_call( - ["dot", "-Tsvg", "-o", str(svg_path)], - stdin=gprof2dot_process.stdout, - ) - print(f"Generated SVG from profile at {str(svg_path)}") - except FileNotFoundError: - print( - "Failed to generate SVG from profile -- dumping stats instead." 
- "Try installing gprof2dot and dot for a better visualization" - ) - ps.sort_stats(pstats.SortKey.TIME).print_stats(20) - ps.sort_stats(pstats.SortKey.CUMULATIVE).print_stats(20) - - maybe_upload_prof_stats_to_manifold(str(profile_path)) # fb-only - - return retval - - return profile_wrapper - - curr_frame = 0 diff --git a/torch/_dynamo/variables/builder.py b/torch/_dynamo/variables/builder.py index 575ccfa53f8d2..8f9ab01088a70 100644 --- a/torch/_dynamo/variables/builder.py +++ b/torch/_dynamo/variables/builder.py @@ -9,6 +9,7 @@ import inspect import itertools import logging +import math import operator import re import sys @@ -54,8 +55,10 @@ ConstantSource, ConstDictKeySource, ConvertIntSource, + FloatTensorSource, GetItemSource, GradSource, + is_cell_contents, is_constant_source, is_from_defaults, is_from_optimizer_source, @@ -1152,8 +1155,7 @@ def wrap_module(self, value: torch.nn.Module): ) def wrap_literal(self, value): - unspec = not config.specialize_int - if unspec and type(value) is int: + if not config.specialize_int and type(value) is int: # unspecializing int by default, but still # specialize for the following conditions if not TracingContext.get().force_unspec_int_unbacked_size_like and ( @@ -1165,11 +1167,14 @@ def wrap_literal(self, value): # NN modules on the fly) or self.source.guard_source().is_nn_module() or is_from_defaults(self.source) + or is_cell_contents(self.source) ): self.install_guards(GuardBuilder.CONSTANT_MATCH) return ConstantVariable.create(value=value, source=self.source) else: return self.wrap_symint(value) + elif not config.specialize_float and type(value) is float: + return self.wrap_symfloat(value) else: self.install_guards(GuardBuilder.CONSTANT_MATCH) return ConstantVariable.create(value=value) @@ -1498,6 +1503,140 @@ def wrap_symint(self, value): return unspec_var + def wrap_symfloat(self, value): + # SymFloat wrapping is special. We first wrap it in the same way we + # do an unspecialized primitive, and then we item() it into a + # SymFloat. Removal of the item() call is left to a later FX pass, + # mostly because that pass is more easily done after we have lowered + # to ATen ops. (Dynamo doesn't do decomposition right now). + + if self.name in self.tx.output.unspec_variable_map: + return self.tx.output.unspec_variable_map[self.name] + + # NB: we specialize on nan input, because our guard modeling in + # ShapeEnv cannot deal with nan + if ( + torch._dynamo.config.specialize_float + or is_constant_source(self.get_source()) + or math.isnan(value) + ): + self.install_guards(GuardBuilder.CONSTANT_MATCH) + return ConstantVariable.create(value=value, source=self.source) + + # NB: At the point we've gotten here, we don't assume static by + # default. Since we have a guard mechanism, there isn't really any + # downside to trying to be dynamic for float all the time. Unlike + # ints, this won't make codegen perf worse. Modest cost to compile + # time. + + wrapped_value = torch.tensor(value) + # TODO: Switch RandomValueSource over to use this, this is more + # accurate + assert not isinstance(self.get_source(), RandomValueSource) + install_guard(self.get_source().make_guard(GuardBuilder.TYPE_MATCH)) + + # The FloatTensorSource here is just for pedantic correctness: if you + # guard against an UnspecializedPythonVariable, you need to guard + # against the tensor-ified version of the local, otherwise it's not a + # Tensor. 
However, we never let the UnspecializedPythonVariable escape + # here, so there should never actually be any guards against this + # source. + options = {"source": FloatTensorSource(self.get_source()), "raw_value": value} + + # TODO: Maybe the tensor-ification should be built into the source, + # rather than by special pattern match + proxy = self.tx.output.root_tracer.create_graph_input( + re.sub(r"[^a-zA-Z0-9]+", "_", self.name), + type(wrapped_value), + source=self.get_source(), + ) + + unspec_var = wrap_fx_proxy_cls( + UnspecializedPythonVariable, + tx=self.tx, + proxy=proxy, + example_value=wrapped_value, + **options, + ) + assert isinstance(unspec_var, UnspecializedPythonVariable) + self.tx.output.unspec_variable_map[self.name] = unspec_var + + if self.tx.export and not isinstance(self.get_source(), LocalSource): + raise AssertionError( + f"Dynamo attempts to add additional input during export: value={wrapped_value}, source={self.get_source()}" + ) + fake_tensor_value = None + example_value = unspec_var.proxy.node.meta["example_value"] + assert is_fake(example_value) + + fake_tensor_value = example_value + assert fake_tensor_value.fake_mode is self.tx.fake_mode, ( + f"fake mode ({fake_tensor_value.fake_mode}) from fake tensor metadata doesn't match mode" + "({self.tx.fake_mode}) from InstructionTranslator" + ) + + # There's something a bit incoherent about pass_arg_as_tensor, + # specifically regarding sources. + # + # Specifically, suppose we have "x: float" local argument. We + # eventually end up with an UnspecializedPythonVariable denoting + # torch.as_tensor(x)... but it's source is still L['x'] (which if you + # accessed it directly is a float!) So you gotta be careful when + # setting up your guards, because it's still going to be a float at + # this point, the conversion happens only precisely at the point we're + # actually calling the FX graph. This happens to be what we want for + # shape guard generation, but it's kind of unintuitive. + proxy.node.meta["grapharg"] = GraphArg( + self.get_source(), + wrapped_value, + pass_arg_as_tensor=True, + fake_tensor=fake_tensor_value, + is_tensor=False, + example_strong_ref=wrapped_value, + ) + + # OK, now the crazy sauce. We want to generate a SymNodeVariable to + # do the rest of our tracing, doing the equivalent of an item() call. + # But we don't /actually/ want to do an item() call, because that will + # give us an unbacked SymFloat, but this is really a backed SymFloat. + + item_proxy = self.tx.output.create_proxy( + "call_method", + "item", + (proxy,), + {}, + ) + # Do NOT do conventional fake tensor prop + + shape_env = self.tx.output.shape_env + item_symbol = shape_env.create_unspecified_symbol( + value, + # Interesting! Normally if you do compute on a Variable (the + # compute in this case being an item() call), you end up with a + # new variable that doesn't have source, but in this case, we can + # still put a source on it. + source=self.source, + # If we put in a Tensor input, definitely dynamic (if you wanted + # it to be static, gotta bail out earlier) + dynamic_dim=DimDynamic.DYNAMIC, + ) + item_example_value = shape_env.create_symfloatnode( + item_symbol, hint=value, source=self.source + ) + set_example_value(item_proxy.node, item_example_value) + + self.tx.output.tracked_fakes.append( + TrackedFake(item_example_value, self.source, None) + ) + + item_unspec_var = SymNodeVariable( + item_proxy, + item_example_value, + source=self.get_source(), # Interesting as above! 
+ ) + + return item_unspec_var + def wrap_unspecialized_primitive(self, value): if self.name in self.tx.output.unspec_variable_map: return self.tx.output.unspec_variable_map[self.name] diff --git a/torch/_dynamo/variables/builtin.py b/torch/_dynamo/variables/builtin.py index 91a1da13db895..791d19ffb4c1a 100644 --- a/torch/_dynamo/variables/builtin.py +++ b/torch/_dynamo/variables/builtin.py @@ -1187,8 +1187,10 @@ def _call_iter_tuple_list(self, tx, obj=None, *args, **kwargs): obj.source.make_guard(GuardBuilder.TUPLE_ITERATOR_LEN) ) else: - if getattr(obj, "source", False) and isinstance( - obj, ConstDictVariable + if ( + getattr(obj, "source", False) + and isinstance(obj, ConstDictVariable) + and not istype(obj, SetVariable) ): tx.output.guard_on_key_order.add(obj.source.name()) diff --git a/torch/_dynamo/variables/constant.py b/torch/_dynamo/variables/constant.py index 29a3a72a6f86f..c4502cca6bbe3 100644 --- a/torch/_dynamo/variables/constant.py +++ b/torch/_dynamo/variables/constant.py @@ -39,7 +39,7 @@ def create(value, **kwargs) -> VariableTracker: assert not isinstance(value, disallowed_type), reason # Routing for list and tuple literals. - if is_literal and isinstance(value, (list, tuple)): + if is_literal and isinstance(value, (list, tuple, set, frozenset)): items = [] for i, x in enumerate(value): item_source = GetItemSource(source, i) if source else None @@ -51,7 +51,11 @@ def create(value, **kwargs) -> VariableTracker: source=item_source, ) ) - return variables.BaseListVariable.cls_for(type(value))(items, **kwargs) + if isinstance(value, (list, tuple)): + return variables.BaseListVariable.cls_for(type(value))(items, **kwargs) + else: + assert isinstance(value, (set, frozenset)), type(value) + return variables.SetVariable(items) return ConstantVariable(value, **kwargs) diff --git a/torch/_dynamo/variables/dicts.py b/torch/_dynamo/variables/dicts.py index 60fda2146432f..77da855b69aec 100644 --- a/torch/_dynamo/variables/dicts.py +++ b/torch/_dynamo/variables/dicts.py @@ -9,7 +9,7 @@ from torch._subclasses.fake_tensor import is_fake -from .. import variables +from .. 
import polyfill, variables from ..bytecode_transformation import ( create_call_function, create_call_method, @@ -17,7 +17,6 @@ create_load_method, ) from ..eval_frame import skip_code - from ..exc import unimplemented from ..guards import GuardBuilder, install_guard from ..source import AttrSource, GetItemSource @@ -401,6 +400,12 @@ def call_method( result = self.set_items.pop().vt super().call_method(tx, name, (result,), kwargs) return result + elif name == "isdisjoint": + assert not kwargs + assert len(args) == 1 + return variables.UserFunctionVariable( + polyfill.set_isdisjoint + ).call_function(tx, [self, args[0]], {}) return super().call_method(tx, name, args, kwargs) def getitem_const(self, arg: VariableTracker): diff --git a/torch/_dynamo/variables/lists.py b/torch/_dynamo/variables/lists.py index d51b4daff3471..7802ddbb3390b 100644 --- a/torch/_dynamo/variables/lists.py +++ b/torch/_dynamo/variables/lists.py @@ -529,6 +529,9 @@ def get_item_dyn(self, tx, arg: VariableTracker): assert isinstance(index, (int, torch.SymInt)) return self.items[index] + def call_hasattr(self, tx, name: str) -> "VariableTracker": + return variables.ConstantVariable.create(hasattr(torch.Size, name)) + class NamedTupleVariable(TupleVariable): _nonvar_fields = { diff --git a/torch/_dynamo/variables/tensor.py b/torch/_dynamo/variables/tensor.py index e928a9e0ea6ed..e1d5cee368dac 100644 --- a/torch/_dynamo/variables/tensor.py +++ b/torch/_dynamo/variables/tensor.py @@ -958,7 +958,9 @@ def set_name_hint(self, name: str): class SymNodeVariable(VariableTracker): """ - Represents a symbolic size, e.g., as returned by tensor.size(0) + Represents a symbolic scalar, either int, float or bool. This is most commonly used to + handle symbolic size computation, e.g., tensor.size(0), but it is also used to + handle logic like float_tensor.item() or unspecialized float inputs. """ _nonvar_fields = { @@ -986,6 +988,7 @@ def __init__(self, proxy, sym_num, **kwargs): self.proxy = proxy # TODO: Should we allow non SymTypes here? 
Today it is allowed self.sym_num = sym_num + self._tensor_var = None def python_type(self): if isinstance(self.sym_num, SymTypes): @@ -996,6 +999,15 @@ def python_type(self): def as_proxy(self): return self.proxy + def as_tensor(self, tx): + if self._tensor_var is None: + from .builder import SourcelessBuilder + + self._tensor_var = SourcelessBuilder.create( + tx, torch.scalar_tensor + ).call_function(tx, [self], {}) + return self._tensor_var + def evaluate_expr(self, output_graph=None): try: return guard_scalar(self.sym_num) diff --git a/torch/_dynamo/variables/torch.py b/torch/_dynamo/variables/torch.py index 47705cdc07e1f..8e7089f080595 100644 --- a/torch/_dynamo/variables/torch.py +++ b/torch/_dynamo/variables/torch.py @@ -83,6 +83,7 @@ torch._assert, torch._utils._get_device_index, torch._C._get_cublas_allow_tf32, + torch._C._is_any_autocast_enabled, torch.cuda.get_device_properties, torch.cuda.is_available, torch.distributed.is_available, diff --git a/torch/_export/passes/_node_metadata_hook.py b/torch/_export/passes/_node_metadata_hook.py index 529690ed934f2..9ca554349fab3 100644 --- a/torch/_export/passes/_node_metadata_hook.py +++ b/torch/_export/passes/_node_metadata_hook.py @@ -37,7 +37,7 @@ def _node_metadata_hook(node: torch.fx.Node, stack_trace: str) -> None: node.meta["val"] = fake_res node.meta["stack_trace"] = stack_trace - node.meta["nn_module_stack"] = arg_meta["nn_module_stack"] + node.meta["nn_module_stack"] = arg_meta.get("nn_module_stack", {}) node.meta["torch_fn"] = ( f"{node.target.__name__}_0", f"{node.target.__class__.__name__}.{node.target.__name__}", diff --git a/torch/_export/passes/replace_set_grad_with_hop_pass.py b/torch/_export/passes/replace_set_grad_with_hop_pass.py index e362ee3547715..91104c17c38d7 100644 --- a/torch/_export/passes/replace_set_grad_with_hop_pass.py +++ b/torch/_export/passes/replace_set_grad_with_hop_pass.py @@ -60,6 +60,12 @@ def _replace_with_hop(node: torch.fx.Node): set_grad_node.meta.get("nn_module_stack", {}) ) output_node = next(iter(reversed(sub_gm.graph.nodes)), None) + # Split_module pass intentially doesn't add output node + # if the graph doesn't return anything. + # TODO (tmanlaibaatar) Figure out if this is right behaviour + # for split_module + if isinstance(output_node, torch.fx.Node) and output_node.op != "output": + output_node = None if output_node is not None: assert len(output_node.args) == 1 output_args = output_node.args[0] @@ -106,9 +112,7 @@ def _replace_with_hop(node: torch.fx.Node): f"repalce_set_grad_with_hop_pass doesnt' support output type {type(output_args)}" ) else: - raise NotImplementedError( - "Cannot replace a call_module with a hop if it has no output. This module will gets DCEed." 
- ) + node.graph.erase_node(node) sub_graph.erase_node(set_grad_node) @@ -164,6 +168,7 @@ def _maybe_inline_or_replace_with_hop(node: torch.fx.Node): else node ), ) + new_gm.recompile() return new_gm return gm diff --git a/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py b/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py index f0d39fd1e858e..320a899e6b646 100644 --- a/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py +++ b/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py @@ -88,13 +88,18 @@ def _compute_output_meta_with_inductor_strides(fw_module, fwd_output_strides): # will only be set for inductor if not fwd_output_strides: return out - with TracingContext.get().fake_mode.shape_env.suppress_guards(): - for i in range(len(out)): - if not isinstance(out[i], Tensor): - continue - if all(s1 == s2 for s1, s2 in zip(out[i].stride(), fwd_output_strides[i])): - continue - out[i] = out[i].as_strided(out[i].shape, fwd_output_strides[i]) + + from torch.fx.experimental.symbolic_shapes import statically_known_true + + for i in range(len(out)): + if not isinstance(out[i], Tensor): + continue + if all( + statically_known_true(s1 == s2) + for s1, s2 in zip(out[i].stride(), fwd_output_strides[i]) + ): + continue + out[i] = out[i].as_strided(out[i].shape, fwd_output_strides[i]) return out @@ -141,7 +146,6 @@ def aot_dispatch_base( ( fw_module, updated_flat_args, - aot_config, fw_metadata, ) = fakified_out_wrapper.pre_compile( fw_module, updated_flat_args, aot_config, fw_metadata=fw_metadata @@ -150,7 +154,6 @@ def aot_dispatch_base( ( fw_module, updated_flat_args, - aot_config, fw_metadata, ) = functionalized_rng_wrapper.pre_compile( fw_module, updated_flat_args, aot_config, fw_metadata=fw_metadata @@ -187,12 +190,12 @@ def aot_dispatch_base( # Create a wrapper to set up the rng functionalize and fakified out bits compiled_fw = functionalized_rng_wrapper.post_compile( - compiled_fw, aot_config, fw_metadata=fw_metadata + compiled_fw, aot_config, runtime_metadata=fw_metadata ) compiled_fw = fakified_out_wrapper.post_compile( compiled_fw, aot_config, - fw_metadata=fw_metadata, + runtime_metadata=fw_metadata, ) # Why do we need to pass in num_fw_outs_saved_for_bw? 
# See Note: [Partitioner handling for Subclasses, Part 2] @@ -205,7 +208,7 @@ def aot_dispatch_base( ).post_compile( compiled_fw, aot_config, # not used - fw_metadata=fw_metadata, + runtime_metadata=fw_metadata, ) if not hasattr(compiled_fw_func, "_boxed_call"): @@ -218,7 +221,7 @@ def aot_dispatch_base( ).post_compile( compiled_fw_func, aot_config, - fw_metadata=fw_metadata, + runtime_metadata=fw_metadata, ) return compiled_fn @@ -420,7 +423,6 @@ def aot_dispatch_autograd( ( fw_module, adjusted_flat_args, - aot_config, fw_metadata, ) = fakified_out_wrapper.pre_compile( fw_module, adjusted_flat_args, aot_config, fw_metadata=fw_metadata @@ -432,7 +434,6 @@ def aot_dispatch_autograd( ( fw_module, adjusted_flat_args, - aot_config, fw_metadata, ) = functionalized_rng_wrapper.pre_compile( fw_module, adjusted_flat_args, aot_config, fw_metadata=fw_metadata @@ -457,16 +458,16 @@ def aot_dispatch_autograd( ).post_compile( compiled_fw_func, aot_config, # not used - fw_metadata=fw_metadata, + runtime_metadata=fw_metadata, ) compiled_fw_func = functionalized_rng_wrapper.post_compile( - compiled_fw_func, aot_config, fw_metadata=fw_metadata + compiled_fw_func, aot_config, runtime_metadata=fw_metadata ) compiled_fw_func = fakified_out_wrapper.post_compile( compiled_fw_func, aot_config, - fw_metadata=fw_metadata, + runtime_metadata=fw_metadata, ) # NB: It's important to compile backwards ahead of time, as this may @@ -1032,7 +1033,7 @@ def backward(ctx, *args): ).post_compile( CompiledFunction.apply, aot_config, - fw_metadata=fw_metadata, + runtime_metadata=fw_metadata, ) if not config.debug_assert: diff --git a/torch/_functorch/_aot_autograd/runtime_wrappers.py b/torch/_functorch/_aot_autograd/runtime_wrappers.py index 934b783f6fc8d..a1fb2980ed1d4 100644 --- a/torch/_functorch/_aot_autograd/runtime_wrappers.py +++ b/torch/_functorch/_aot_autograd/runtime_wrappers.py @@ -73,7 +73,7 @@ def pre_compile( aot_config: AOTConfig, *, fw_metadata: ViewAndMutationMeta, - ): + ) -> Tuple[Callable, List[Tensor], ViewAndMutationMeta]: """ Process the inputs to the compiler_fn. You can pass in extra metadata via kwargs. Args: @@ -82,15 +82,15 @@ def pre_compile( aot_config: AOTConfig passed in at compile time fw_metadata: ViewAndMutationMeta generated from flat_fn and flat_args """ - return flat_fn, flat_args, aot_config, fw_metadata + return flat_fn, flat_args, fw_metadata - def post_compile(self, compiled_fn, aot_config, *, fw_metadata): + def post_compile(self, compiled_fn, aot_config, *, runtime_metadata) -> Callable: """ Given an output of the compiler, wrap it with information received from prologue. Args: compiled_fn: Callable after calling compiler_fn aot_config: AOTConfig after calling prologue - fw_metadata: ViewAndMutationMeta after calling prologue + runtime_metadata: ViewAndMutationMeta after calling all wrappers's pre_compile steps. 
Example: def wrapped_compiled_fn(args): @@ -101,28 +101,6 @@ def wrapped_compiled_fn(args): """ return compiled_fn - def create( - self, - flat_fn, - flat_args: List[Tensor], - aot_config: AOTConfig, - *, - fw_metadata: ViewAndMutationMeta, - compiler_fn, - ): - ( - wrapped_flat_fn, - new_flat_args, - new_aot_config, - new_fw_metadata, - ) = self.pre_compile(flat_fn, flat_args, aot_config, fw_metadata=fw_metadata) - compiled_fn = compiler_fn( - wrapped_flat_fn, new_flat_args, new_aot_config, fw_metadata=new_fw_metadata - ) - return self.post_compile( - compiled_fn, new_aot_config, fw_metadata=new_fw_metadata - ) - # The wrapper created by this function handles all of the runtime aliasing and mutation "epilogue" logic # that needs to run after the compiled function. @@ -143,11 +121,11 @@ def post_compile( compiled_fn, aot_config: AOTConfig, *, - fw_metadata: ViewAndMutationMeta, + runtime_metadata: ViewAndMutationMeta, ): return _create_runtime_wrapper( compiled_fn, - runtime_metadata=fw_metadata, + runtime_metadata=runtime_metadata, indices_of_inps_to_detach=self.indices_of_inps_to_detach, trace_joint=self.trace_joint, keep_input_mutations=aot_config.keep_inference_input_mutations, @@ -421,7 +399,7 @@ def pre_compile( aot_config, *, fw_metadata, - ): + ) -> Tuple[Callable, List[Tensor], ViewAndMutationMeta]: if config.functionalize_rng_ops: # Update example inputs for the fw_compiler fake_mode = detect_fake_mode() @@ -430,27 +408,27 @@ def pre_compile( # We are not clearing flat_args here because # 1) There is a check in the debug compiler at the end # 2) It does not matter as these are fake tensors - return flat_fn, flat_args, aot_config, fw_metadata + return flat_fn, flat_args, fw_metadata def post_compile( self, compiled_fn, aot_config: AOTConfig, *, - fw_metadata: ViewAndMutationMeta, + runtime_metadata: ViewAndMutationMeta, ): @wraps(compiled_fn) def wrapper(runtime_args: List[Any]): - if fw_metadata.is_rng_op_functionalized: + if runtime_metadata.is_rng_op_functionalized: # Add the seed and offset to args seed, offset = CUDARngStateHelper.get_torch_state_as_tuple() runtime_args.extend([seed, offset]) out = compiled_fn(runtime_args) out = self._functionalized_rng_runtime_epilogue( - fw_metadata, + runtime_metadata, out, # TODO: this won't be right for the backward when we convert the call_compiled_backward to use the wrapper - fw_metadata.num_forward_returns, + runtime_metadata.num_forward_returns, ) return out return compiled_fn(runtime_args) @@ -493,7 +471,7 @@ def pre_compile( aot_config, *, fw_metadata, - ): + ) -> Tuple[Callable, List[Tensor], ViewAndMutationMeta]: tracing_context = torch._guards.TracingContext.try_get() if tracing_context and tracing_context.fakify_first_call: self.out_metas = [ @@ -501,7 +479,7 @@ def pre_compile( ] else: self.needs_post_compile = False - return fw_module, flat_args, aot_config, fw_metadata + return fw_module, flat_args, fw_metadata def _compute_output_meta_with_inductor_strides(self): out = self.out_metas @@ -528,7 +506,7 @@ def post_compile( compiled_fn, aot_config: AOTConfig, *, - fw_metadata: ViewAndMutationMeta, + runtime_metadata: ViewAndMutationMeta, ): if self.needs_post_compile: assert self.fwd_output_strides is not None @@ -575,19 +553,19 @@ def pre_compile( fw_only=self.fw_only, # type: ignore[arg-type] ) self.maybe_subclass_meta = subclass_meta - return new_flat_fn, new_flat_args, aot_config, fw_metadata + return new_flat_fn, new_flat_args, fw_metadata def post_compile( self, compiled_fn, _aot_config: AOTConfig, *, - fw_metadata: 
ViewAndMutationMeta, + runtime_metadata: ViewAndMutationMeta, ): if self.maybe_subclass_meta is None: return compiled_fn - subclass_metas = fw_metadata.subclass_fw_graph_out_meta + subclass_metas = runtime_metadata.subclass_fw_graph_out_meta @wraps(compiled_fn) def inner_fn(args: List[Any]): @@ -713,7 +691,7 @@ def pre_compile( aot_config: AOTConfig, *, fw_metadata: ViewAndMutationMeta, - ): + ) -> Tuple[Callable, List[Tensor], ViewAndMutationMeta]: # Use information about whether or not flat_fn mutates its arguments # or not to handle dupe args @@ -740,7 +718,7 @@ def pre_compile( if ok: self.needs_post_compile = False - return flat_fn, leaf_flat_args, aot_config, fw_metadata + return flat_fn, leaf_flat_args, fw_metadata if requires_subclass_dispatch(leaf_flat_args, fw_metadata): raise RuntimeError( @@ -865,14 +843,14 @@ def wrapped_flat_fn(*args): ref_fw_metadata == updated_fw_metadata ), f"ref_metadata={str(ref_fw_metadata)}, actual_metadata={str(updated_fw_metadata)}" - return wrapped_flat_fn, deduped_flat_args, aot_config, updated_fw_metadata + return wrapped_flat_fn, deduped_flat_args, updated_fw_metadata def post_compile( self, compiled_fn, aot_config: AOTConfig, *, - fw_metadata: ViewAndMutationMeta, + runtime_metadata: ViewAndMutationMeta, ): if not self.needs_post_compile: return compiled_fn @@ -932,6 +910,8 @@ def debugged_compiled_fn(args): # would cause us to hit that path more frequently). @dataclass class AOTSyntheticBaseWrapper(CompilerWrapper): + # Currently, the only reason we need to plumb this bool is because + # the synthetic base code prohibits more cases in the autograd case than the inference case. trace_joint: bool # TODO: refactor trace_joint needs_post_compile: bool = True aliased_arg_idx_with_metadata_mutations: List[int] = field(default_factory=list) @@ -943,7 +923,7 @@ def pre_compile( aot_config: AOTConfig, *, fw_metadata: ViewAndMutationMeta, - ): + ) -> Tuple[Callable, List[Tensor], ViewAndMutationMeta]: is_inference = not self.trace_joint flat_args_with_synthetic_bases, synthetic_base_info = merge_view_inputs( flat_args, @@ -954,7 +934,7 @@ def pre_compile( # Happy path: we don't need synthetic bases if synthetic_base_info is None: self.needs_post_compile = False - return flat_fn, flat_args, aot_config, fw_metadata + return flat_fn, flat_args, fw_metadata # export path: ban synthetic bases for now, add later if requested. 
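The export-path check the comment above refers to continues just below; first, an aside on the contract these runtime_wrappers.py hunks migrate to: CompilerWrapper.pre_compile now returns a (flat_fn, flat_args, fw_metadata) triple with aot_config threaded through unchanged, and post_compile receives the finalized metadata as runtime_metadata. A torch-free toy sketch of how such a chain composes (LoggingWrapper and toy_compiler are illustrative stand-ins, not the real AOTDedupeWrapper, AOTSyntheticBaseWrapper, or compiler_fn):

class LoggingWrapper:
    """Illustrative stand-in for a CompilerWrapper; not a real torch class."""
    def __init__(self, tag):
        self.tag = tag

    def pre_compile(self, fn, args, metadata):
        # May rewrite any of the three; here we just record that we ran.
        return fn, args, {**metadata, self.tag: "pre_compile ran"}

    def post_compile(self, compiled_fn, *, runtime_metadata):
        def wrapped(*args):
            print(f"[{self.tag}] runtime epilogue, metadata:", runtime_metadata)
            return compiled_fn(*args)
        return wrapped

def toy_compiler(fn, args, metadata):
    return fn  # stand-in for the real compiler_fn

wrappers = [LoggingWrapper("dedupe"), LoggingWrapper("synthetic_base")]
fn, args, metadata = (lambda x: x + 1), (41,), {}
for w in wrappers:
    fn, args, metadata = w.pre_compile(fn, args, metadata)
runtime_metadata = metadata  # fixed once every pre_compile has run
compiled = toy_compiler(fn, args, runtime_metadata)
for w in reversed(wrappers):
    compiled = w.post_compile(compiled, runtime_metadata=runtime_metadata)
print(compiled(*args))  # "dedupe" epilogue prints outermost, then 42

pre_compile steps run in declaration order and may rewrite fn, args, and metadata; post_compile runs in reverse, so the first wrapper's runtime epilogue ends up outermost, mirroring the loop added in aot_autograd.py below.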
if requires_subclass_dispatch(flat_args, fw_metadata): @@ -1050,7 +1030,6 @@ def wrapped_flat_fn(*args): return ( wrapped_flat_fn, flat_args_with_synthetic_bases, - aot_config, fw_metadata_updated, ) @@ -1059,7 +1038,7 @@ def post_compile( compiled_fn, aot_config: AOTConfig, *, - fw_metadata: ViewAndMutationMeta, + runtime_metadata: ViewAndMutationMeta, ): if not self.needs_post_compile: return compiled_fn diff --git a/torch/_functorch/aot_autograd.py b/torch/_functorch/aot_autograd.py index f1ba67794bc70..379518fb958c3 100644 --- a/torch/_functorch/aot_autograd.py +++ b/torch/_functorch/aot_autograd.py @@ -670,23 +670,26 @@ def convert(idx, x): aot_dispatch_base_graph if aot_config.is_export else aot_dispatch_base ) - wrappers = [ + # Wrappers that edit fw_metadata + fw_metadata_wrappers = [ AOTDedupeWrapper(), AOTSyntheticBaseWrapper(trace_joint=needs_autograd), # Add more passes here ] - for wrapper in wrappers: - flat_fn, fake_flat_args, aot_config, fw_metadata = wrapper.pre_compile( + for wrapper in fw_metadata_wrappers: + flat_fn, fake_flat_args, fw_metadata = wrapper.pre_compile( flat_fn, fake_flat_args, aot_config, fw_metadata=fw_metadata ) + # Once all fw_metadata_wrappers have run, runtime_metadata is fixed + runtime_metadata = fw_metadata compiled_fn = compiler_fn( - flat_fn, fake_flat_args, aot_config, fw_metadata=fw_metadata + flat_fn, fake_flat_args, aot_config, fw_metadata=runtime_metadata ) - for wrapper in reversed(wrappers): + for wrapper in reversed(fw_metadata_wrappers): compiled_fn = wrapper.post_compile( - compiled_fn, aot_config, fw_metadata=fw_metadata + compiled_fn, aot_config, runtime_metadata=runtime_metadata ) if aot_config.is_export: diff --git a/torch/_higher_order_ops/associative_scan.py b/torch/_higher_order_ops/associative_scan.py index 287e59ea00932..8b406f39a64d7 100644 --- a/torch/_higher_order_ops/associative_scan.py +++ b/torch/_higher_order_ops/associative_scan.py @@ -110,16 +110,12 @@ def add(x: torch.Tensor, y: torch.Tensor): def trace_associative_scan( proxy_mode, func_overload, combine_fn: Callable, input: List[torch.Tensor], dim: int ): - pre_dispatch = getattr(proxy_mode, "pre_dispatch", False) - with disable_proxy_modes_tracing(): sample_inputs = [ torch.full((), False, dtype=x.dtype, device=x.device) for x in itertools.chain(input, input) ] - combine_graph = reenter_make_fx(combine_fn, pre_dispatch=pre_dispatch)( - *sample_inputs - ) + combine_graph = reenter_make_fx(combine_fn)(*sample_inputs) outputs = None for node in combine_graph.graph.nodes: diff --git a/torch/_higher_order_ops/cond.py b/torch/_higher_order_ops/cond.py index 40aee90affccd..359feb192ae5c 100644 --- a/torch/_higher_order_ops/cond.py +++ b/torch/_higher_order_ops/cond.py @@ -29,7 +29,6 @@ from torch._subclasses.fake_tensor import FakeTensorMode from torch.fx.experimental.proxy_tensor import ( _temp_remove_pre_dispatch_torch_function_mode, - disable_proxy_modes_tracing, ProxyTorchDispatchMode, track_tensor_tree, ) @@ -159,11 +158,8 @@ def trace_cond(proxy_mode, func_overload, pred, true_fn, false_fn, operands): isinstance(o, torch.Tensor) for o in operands ), "Cond operands must be a list of tensors" - pre_dispatch = getattr(proxy_mode, "pre_dispatch", False) - - with disable_proxy_modes_tracing(): - true_graph = reenter_make_fx(true_fn, pre_dispatch)(*operands) - false_graph = reenter_make_fx(false_fn, pre_dispatch)(*operands) + true_graph = reenter_make_fx(true_fn)(*operands) + false_graph = reenter_make_fx(false_fn)(*operands) true_outs = [] false_outs = [] diff --git 
a/torch/_higher_order_ops/flex_attention.py b/torch/_higher_order_ops/flex_attention.py index 664bfe1c4dd0a..b5e1385da346b 100644 --- a/torch/_higher_order_ops/flex_attention.py +++ b/torch/_higher_order_ops/flex_attention.py @@ -6,6 +6,7 @@ from torch._higher_order_ops.utils import ( _has_potential_branch_input_mutation, autograd_not_implemented, + reenter_make_fx, UnsupportedAliasMutationException, ) from torch._ops import HigherOrderOperator @@ -178,7 +179,7 @@ def trace_flex_attention( torch.zeros((), dtype=query.dtype, requires_grad=query.requires_grad) ] + [torch.zeros((), dtype=torch.int) for _ in range(4)] with TransformGetItemToIndex(): - score_graph = make_fx(score_mod)(*example_vals, *other_buffers) + score_graph = reenter_make_fx(score_mod)(*example_vals, *other_buffers) qualname = proxy_mode.tracer.get_fresh_qualname("sdpa_score") proxy_mode.tracer.root.register_module(qualname, score_graph) node_args = (query, key, value, score_graph, *other_buffers) diff --git a/torch/_higher_order_ops/map.py b/torch/_higher_order_ops/map.py index 6bef897dfa511..2bf88ea19565f 100644 --- a/torch/_higher_order_ops/map.py +++ b/torch/_higher_order_ops/map.py @@ -230,8 +230,7 @@ def trace_map(proxy_mode, func_overload, f, xs, pos_args): example_input = _unstack_pytree(xs)[0] body_graph = f - pre_dispatch = getattr(proxy_mode, "pre_dispatch", False) - body_graph = reenter_make_fx(body_graph, pre_dispatch)(*example_input, *pos_args) + body_graph = reenter_make_fx(body_graph)(*example_input, *pos_args) next_name = proxy_mode.tracer.get_fresh_qualname("body_graph_") diff --git a/torch/_higher_order_ops/utils.py b/torch/_higher_order_ops/utils.py index 32bb465041ce5..0fcf22bcc3388 100644 --- a/torch/_higher_order_ops/utils.py +++ b/torch/_higher_order_ops/utils.py @@ -1,3 +1,4 @@ +import functools from contextlib import contextmanager from dataclasses import dataclass from typing import Any, Callable @@ -76,16 +77,19 @@ def graph_with_interpreter(*args): return maybe_interpreted_fn -# We'll use the current decomposition table to make sure operators in subgraphs are -# decomposed properly. 
-# We also need to maybe run with interpreter for propagating stack_trace -def reenter_make_fx(fn, pre_dispatch=False): - decomp_table = torch.fx.experimental.proxy_tensor.CURRENT_DECOMPOSITION_TABLE - return make_fx( - _maybe_run_with_interpreter(fn), - decomposition_table=decomp_table, - pre_dispatch=pre_dispatch, - ) +def reenter_make_fx(fn): + from torch.fx.experimental.proxy_tensor import _CURRENT_MAKE_FX_TRACER + + @functools.wraps(fn) + def wrapped(*args): + assert ( + _CURRENT_MAKE_FX_TRACER is not None + ), "Cannot reenter make_fx when we're not under a make_fx tracing session" + return _CURRENT_MAKE_FX_TRACER.trace_subgraph( + _maybe_run_with_interpreter(fn), *args + ) + + return wrapped @contextmanager diff --git a/torch/_higher_order_ops/while_loop.py b/torch/_higher_order_ops/while_loop.py index 15bacb4bc1942..b0ab00bdfac45 100644 --- a/torch/_higher_order_ops/while_loop.py +++ b/torch/_higher_order_ops/while_loop.py @@ -15,11 +15,7 @@ ) from torch._ops import HigherOrderOperator from torch._subclasses.fake_tensor import FakeTensorMode -from torch.fx.experimental.proxy_tensor import ( - disable_proxy_modes_tracing, - ProxyTorchDispatchMode, - track_tensor_tree, -) +from torch.fx.experimental.proxy_tensor import ProxyTorchDispatchMode, track_tensor_tree class WhileLoopOp(HigherOrderOperator): @@ -189,14 +185,8 @@ def while_loop_tracing(mode, cond_fn, body_fn, carried_inputs, additional_inputs def _trace_while_loop( proxy_mode, while_loop_op, cond_fn, body_fn, carried_inputs, additional_inputs ): - pre_dispatch = getattr(proxy_mode, "pre_dispatch", False) - with disable_proxy_modes_tracing(): - cond_graph = reenter_make_fx(cond_fn, pre_dispatch)( - *carried_inputs, *additional_inputs - ) - body_graph = reenter_make_fx(body_fn, pre_dispatch)( - *carried_inputs, *additional_inputs - ) + cond_graph = reenter_make_fx(cond_fn)(*carried_inputs, *additional_inputs) + body_graph = reenter_make_fx(body_fn)(*carried_inputs, *additional_inputs) next_name = None i = 0 diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py index 13a61e10689b2..5ac418b847f85 100644 --- a/torch/_inductor/codecache.py +++ b/torch/_inductor/codecache.py @@ -564,23 +564,27 @@ def get_str(obj) -> str: @functools.lru_cache(None) -def get_inductor_code_hash() -> bytes: +def torch_key(): """ - Compute a hash of all inductor code modules. Used by the FxGraph cache - so any inductor code changes would result in new cache keys. 
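A rough sketch of the "reenter the ambient tracer" pattern that the new reenter_make_fx relies on: a module-level reference points at the currently active tracer, and nested helpers assert it exists and delegate subgraph tracing to it. The tracer class below is a toy stand-in for _CURRENT_MAKE_FX_TRACER, not the real implementation.

import functools

_CURRENT_TRACER = None  # toy stand-in for _CURRENT_MAKE_FX_TRACER

class ToyTracer:
    def __init__(self):
        self.subgraphs = []

    def trace_subgraph(self, fn, *args):
        # The real tracer builds an FX subgraph; this one only records calls.
        self.subgraphs.append(fn.__name__)
        return fn(*args)

def reenter_trace(fn):
    @functools.wraps(fn)
    def wrapped(*args):
        assert _CURRENT_TRACER is not None, "not inside a tracing session"
        return _CURRENT_TRACER.trace_subgraph(fn, *args)
    return wrapped

def body(x):
    return x * 2

_CURRENT_TRACER = ToyTracer()
print(reenter_trace(body)(3))      # 6
print(_CURRENT_TRACER.subgraphs)   # ['body']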
+ Compute a key that contains relevant information about torch source files """ - inductor_root = os.path.dirname(__file__) + if not config.is_fbcode(): + inductor_root = os.path.dirname(__file__) - contents: Dict[str, bytes] = {} - for lib in pkgutil.iter_modules([inductor_root]): - spec = lib.module_finder.find_spec(lib.name, None) - assert spec is not None - module = spec.origin - assert module is not None - with open(module, "rb") as f: - contents[module] = f.read() + contents: Dict[str, bytes] = {torch.__version__: b""} + for lib in pkgutil.iter_modules([inductor_root]): + spec = lib.module_finder.find_spec(lib.name, None) + assert spec is not None + module = spec.origin + assert module is not None + with open(module, "rb") as f: + contents[module] = f.read() + + return hashlib.sha256(pickle.dumps(contents)).digest() + + from libfb.py import parutil - return hashlib.sha256(pickle.dumps(contents)).digest() + return parutil.get_file_contents("torch/src_hash.txt").rstrip() @dataclasses.dataclass @@ -645,11 +649,9 @@ def __init__( ) # Also hash on various system info (including the triton compiler version). - self.torch_version = torch.__version__ + self.torch_version = torch_key() self.system_info = CacheBase.get_system() - # And the inductor configuration and code. - self.inductor_code_hash = get_inductor_code_hash() try: self.inductor_config = config.save_config() except (TypeError, AttributeError) as e: diff --git a/torch/_inductor/codegen/common.py b/torch/_inductor/codegen/common.py index 2b7d6c65704e7..0d90e474d04b5 100644 --- a/torch/_inductor/codegen/common.py +++ b/torch/_inductor/codegen/common.py @@ -1717,7 +1717,14 @@ def rename_indexing(self, index) -> sympy.Expr: replacements = { x: self.args.size(x) for x in sorted_symbols - if symbol_is_type(x, (SymT.UNBACKED_INT, SymT.SIZE, SymT.PRECOMPUTED_SIZE)) + if symbol_is_type( + x, + ( + SymT.UNBACKED_INT, + SymT.SIZE, + SymT.PRECOMPUTED_SIZE, + ), + ) } return sympy_subs(index, replacements) diff --git a/torch/_inductor/codegen/cpp_micro_gemm.py b/torch/_inductor/codegen/cpp_micro_gemm.py index 7d54bd8605ec4..353562923c91c 100644 --- a/torch/_inductor/codegen/cpp_micro_gemm.py +++ b/torch/_inductor/codegen/cpp_micro_gemm.py @@ -344,7 +344,7 @@ def create_from_config(cls, config: CppMicroGemmConfig): assert isinstance(n, int) or n.is_number, n assert isinstance(k, int) or k.is_number, k - m = V.graph.sizevars.size_hint(m) if isinstance(m, sympy.Expr) else m + m = V.graph.sizevars.size_hint(m, fallback=1) if isinstance(m, sympy.Expr) else m assert isinstance(m, int), m if output_dtype is None: output_dtype = input_dtype diff --git a/torch/_inductor/codegen/cpp_prefix.h b/torch/_inductor/codegen/cpp_prefix.h index c034522b83332..45f874fc4d269 100644 --- a/torch/_inductor/codegen/cpp_prefix.h +++ b/torch/_inductor/codegen/cpp_prefix.h @@ -152,6 +152,28 @@ inline at::vec::Vectorized vec_shuffle_down(at::vec::Vectorized x, } #endif +#ifdef CPU_CAPABILITY_AVX512 +inline at::vec::Vectorized vec_shuffle_down(at::vec::Vectorized x, size_t n) { + using vec_t = at::vec::Vectorized; +#define SHUFFLE_MASK(z, y, x, w) ((z << 6) | (y << 4) | (x << 2) | w) + switch (n) { + case 1: + return vec_t(_mm512_permute_ps(x, SHUFFLE_MASK(1, 1, 3, 3))); + case 2: + return vec_t(_mm512_permute_ps(x, SHUFFLE_MASK(2, 2, 2, 2))); + case 4: + return vec_t(_mm512_permutexvar_ps( + _mm512_set_epi32( + 12, 12, 12, 12, 12, 12, 12, 12, 4, 4, 4, 4, 4, 4, 4, 4), + x)); + case 8: + return vec_t(_mm512_permutexvar_ps( + _mm512_set_epi32(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 
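The new torch_key() in the codecache hunk above folds torch.__version__ plus the contents of every module under torch/_inductor into a single digest, so any source change invalidates FxGraph cache entries. The same idea applied to an arbitrary package root (the usage at the bottom is just an example and assumes torch is importable):

import hashlib
import os
import pickle
import pkgutil
from typing import Dict

def package_source_key(pkg_root: str, version: str) -> bytes:
    """Digest of all module sources under pkg_root, plus a version string."""
    contents: Dict[str, bytes] = {version: b""}
    for lib in pkgutil.iter_modules([pkg_root]):
        spec = lib.module_finder.find_spec(lib.name, None)
        if spec is None or spec.origin is None:
            continue
        with open(spec.origin, "rb") as f:
            contents[spec.origin] = f.read()
    return hashlib.sha256(pickle.dumps(contents)).digest()

import torch, torch._inductor
print(package_source_key(os.path.dirname(torch._inductor.__file__),
                         torch.__version__).hex()[:16])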
8, 8, 8, 8, 8), x)); + } + TORCH_CHECK(false, "Unhandled vec_shuffle_down value ", n); +} +#endif + template Welford welford_vec_reduce_all(Welford> acc) { using Vec = at::vec::Vectorized; diff --git a/torch/_inductor/codegen/cpp_utils.py b/torch/_inductor/codegen/cpp_utils.py index 5e27f99f181d7..a3b4fd3206b6b 100644 --- a/torch/_inductor/codegen/cpp_utils.py +++ b/torch/_inductor/codegen/cpp_utils.py @@ -107,16 +107,19 @@ def _print_Pow(self, expr): if exp == 0.5 or exp == -0.5: return f"std::sqrt({base})" if exp == 0.5 else f"1.0/std::sqrt({base})" - assert exp.is_integer - exp = int(exp) - if exp > 0: - r = "*".join([self.paren(base)] * exp) - elif exp < 0: - r = "1.0/" + self.paren("*".join([self.paren(base)] * abs(exp))) - else: # exp == 0 - r = "1.0" - - return f"static_cast<{INDEX_TYPE}>({r})" if expr.is_integer else r + if exp.is_integer: + exp = int(exp) + if exp > 0: + r = "*".join([self.paren(base)] * exp) + elif exp < 0: + r = "1.0/" + self.paren("*".join([self.paren(base)] * abs(exp))) + else: # exp == 0 + r = "1.0" + + return f"static_cast<{INDEX_TYPE}>({r})" if expr.is_integer else r + else: + # TODO: float vs double + return f"std::pow({base}, {float(exp)})" def _print_Rational(self, expr): # Uses float constants to perform FP div diff --git a/torch/_inductor/codegen/cpp_wrapper_cpu.py b/torch/_inductor/codegen/cpp_wrapper_cpu.py index fafd176dc26a5..6ce230714632a 100644 --- a/torch/_inductor/codegen/cpp_wrapper_cpu.py +++ b/torch/_inductor/codegen/cpp_wrapper_cpu.py @@ -1158,7 +1158,9 @@ def generate_c_shim_extern_kernel_call(self, kernel, args): # so just avoid wrapping integers. # Name matching is to find tensor is hacky, but fixing all the # ArrayRefTensor issues is not a priority for now. - if isinstance(piece, str) and piece.startswith(("buf", "arg")): + if isinstance(piece, str) and piece.startswith( + ("buf", "arg", "wrap_with_raii_handle_if_needed") + ): piece = f"convert_arrayref_tensor_to_tensor({piece})" wrapped_args.append(piece) diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py index 5ad5f791a9023..ac6699675af1f 100644 --- a/torch/_inductor/codegen/triton.py +++ b/torch/_inductor/codegen/triton.py @@ -1681,7 +1681,15 @@ def indexing( cse_var = self.cse.varname_map[var.name] mask_vars.update(cse_var.mask_vars) elif symbol_is_type( - var, (SymT.UNBACKED_INT, SymT.SIZE, SymT.PRECOMPUTED_SIZE, SymT.INDEX) + var, + ( + SymT.UNBACKED_INT, + SymT.SIZE, + SymT.PRECOMPUTED_SIZE, + SymT.INDEX, + SymT.FLOAT, + SymT.UNBACKED_FLOAT, + ), ): pass else: @@ -2755,6 +2763,7 @@ def inductor_meta_common(): "autotune_local_cache": config.autotune_local_cache, "autotune_pointwise": config.triton.autotune_pointwise, "autotune_remote_cache": config.autotune_remote_cache, + "force_disable_caches": config.force_disable_caches, "dynamic_scale_rblock": config.dynamic_scale_rblock, "max_autotune": config.max_autotune, "max_autotune_pointwise": config.max_autotune_pointwise, diff --git a/torch/_inductor/compile_fx.py b/torch/_inductor/compile_fx.py index cf37b7b9dbe32..bdbfef2eee28f 100644 --- a/torch/_inductor/compile_fx.py +++ b/torch/_inductor/compile_fx.py @@ -39,6 +39,7 @@ from torch._inductor.utils import ( BoxedBool, count_tangents, + fresh_inductor_cache, should_assume_input_aligned, tensor_is_aligned, ) @@ -414,6 +415,15 @@ def get_patched_config_dict(config_patches=None) -> Dict[str, Any]: return config.get_config_copy() +@functools.wraps +def with_fresh_cache_if_config(f): + if config.force_disable_caches: + with fresh_inductor_cache(): + 
return f + else: + return f + + @DebugContext.wrap @torch.utils._python_dispatch._disable_current_modes() @time_and_log(attr="compilation time (in seconds)") @@ -422,6 +432,7 @@ def get_patched_config_dict(config_patches=None) -> Dict[str, Any]: # compile_fx return and we may want to use the _LazyGraphModule for compiling # the backward graph as well. @_use_lazy_graph_module(dynamo_config.use_lazy_graph_module) +@with_fresh_cache_if_config @dynamo_utils.dynamo_timed(phase_name="inductor_compile") def compile_fx_inner( gm: torch.fx.GraphModule, @@ -494,7 +505,11 @@ def compile_fx_inner( start = time.time() fx_graph_remote_cache = should_use_remote_fx_graph_cache() - if (config.fx_graph_cache or fx_graph_remote_cache) and not aot_mode: + if ( + not config.force_disable_caches + and (config.fx_graph_cache or fx_graph_remote_cache) + and not aot_mode + ): compiled_graph = FxGraphCache.load( fx_codegen_and_compile, gm, @@ -1413,7 +1428,6 @@ def partition_fn(graph, joint_inputs, **kwargs): @compile_time_strobelight_meta(phase_name="bw_compiler") @dynamo_utils.dynamo_timed - @dynamo_utils.maybe_cprofile def bw_compiler(model: torch.fx.GraphModule, example_inputs: List[torch.Tensor]): user_visible_outputs = {} diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py index 9968ce460cc02..79af641514bd6 100644 --- a/torch/_inductor/config.py +++ b/torch/_inductor/config.py @@ -33,6 +33,9 @@ def is_fbcode(): # enable autotune remote cache autotune_remote_cache = os.environ.get("TORCHINDUCTOR_AUTOTUNE_REMOTE_CACHE") == "1" +# Force disabled all inductor level caching -- This will override any other caching flag +force_disable_caches = os.environ.get("TORCHINDUCTOR_FORCE_DISABLE_CACHES") == "1" + # use cpp wrapper instead of python wrapper cpp_wrapper = os.environ.get("TORCHINDUCTOR_CPP_WRAPPER", "0") == "1" @@ -315,15 +318,13 @@ def is_fbcode(): benchmark_fusion = os.environ.get("TORCHINDUCTOR_BENCHMARK_FUSION") == "1" enabled_metric_tables = os.environ.get("TORCHINDUCTOR_ENABLED_METRIC_TABLES", "") -benchmark_multi_templates = ( - os.environ.get( - "TORCHINDUCTOR_BENCHMARK_MULTI_TEMPLATES", "0" if is_fbcode() else "1" - ) - == "1" +# For Triton Templates, select fastest of best template + epilogue vs best template + separate epilogue kernel +benchmark_epilogue_fusion = ( + os.environ.get("TORCHINDUCTOR_BENCHMARK_EPILOGUE_FUSION", "1") == "1" ) # Take how many of the top triton kernels to benchmark epilogue -max_epilogue_benchmarked_choices = 3 +max_epilogue_benchmarked_choices = 1 # how many nodes to allow into a single fusion max_fusion_size = 64 @@ -456,6 +457,9 @@ def decide_compile_threads(): # For user visible outputs, inductor will make sure the stride matches with eager. bw_outputs_user_visible = True +# Whether to always use shape padding if it is enabled and possible +force_shape_pad: bool = False + # Fx-based linear/matmul/bmm + permute/transpose vertical fusion permute_fusion = os.environ.get("TORCHINDUCTOR_PERMUTE_FUSION", "0") == "1" diff --git a/torch/_inductor/decomposition.py b/torch/_inductor/decomposition.py index a4fd1a9191c1c..01803af152608 100644 --- a/torch/_inductor/decomposition.py +++ b/torch/_inductor/decomposition.py @@ -28,7 +28,11 @@ ) from . 
import config, inductor_prims -from .utils import needs_fallback_due_to_atomic_add_limitations, use_scatter_fallback +from .utils import ( + is_gpu, + needs_fallback_due_to_atomic_add_limitations, + use_scatter_fallback, +) log = logging.getLogger(__name__) aten = torch.ops.aten @@ -167,7 +171,7 @@ def convolution_backward( groups, output_mask, ): - if not output_mask[2] or grad_output.device.type != "cuda": + if not output_mask[2] or not is_gpu(grad_output.device.type): return NotImplemented grad_bias = aten.sum(grad_output, [0] + list(range(2, grad_output.dim()))) grad_inp, grad_weight, _ = aten.convolution_backward( @@ -593,7 +597,7 @@ def select_decomp_table(): @register_decomposition(aten.masked_scatter) def masked_scatter(self, mask, source): - if self.device.type == "cuda": + if is_gpu(self.device.type): # This two-step algorithm is the same as eager CUDA, for eager CPU we # use a 1-shot serial iteration. self, mask = aten.broadcast_tensors([self, mask]) diff --git a/torch/_inductor/fx_passes/joint_graph.py b/torch/_inductor/fx_passes/joint_graph.py index 3713583e69eee..3302dfd632921 100644 --- a/torch/_inductor/fx_passes/joint_graph.py +++ b/torch/_inductor/fx_passes/joint_graph.py @@ -1,11 +1,14 @@ +import itertools import logging import typing from collections import Counter -from typing import Dict, List, Set +from typing import Dict, List, Set, Union import torch import torch._guards from torch._inductor.constant_folding import ConstantFolder +from torch._inductor.virtualized import V +from torch.fx.experimental.symbolic_shapes import statically_known_true from torch.multiprocessing.reductions import StorageWeakRef from .. import config @@ -14,6 +17,7 @@ init_once_fakemode, KeywordArg, Match, + MULTIPLE, PatternMatcherPass, register_graph_pattern, stable_topological_sort, @@ -22,6 +26,13 @@ log = logging.getLogger(__name__) patterns = PatternMatcherPass() +aten = torch.ops.aten +prims = torch.ops.prims + +pass_patterns = [ + patterns, + PatternMatcherPass(), +] @init_once_fakemode @@ -40,7 +51,6 @@ def remove_no_ops( gm: torch.fx.GraphModule, zeros: Set[torch.fx.Node], ones: Set[torch.fx.Node] ): "Removes no-ops: (+ 0, - 0, * 1, / 1)" - aten = torch.ops.aten graph = gm.graph def fake_tensors_eq(t1, t2, fields=("shape", "dtype", "device")): @@ -308,7 +318,8 @@ def joint_graph_passes(graph: torch.fx.GraphModule): constant_fold_uniform_value(graph) if config.pattern_matcher: - count += patterns.apply(graph.graph) # type: ignore[arg-type] + for patterns in pass_patterns: + count += patterns.apply(graph.graph) # type: ignore[arg-type] if not config.fallback_random: count += replace_random_passes(graph) @@ -362,3 +373,131 @@ def pointless_view(match: Match, arg, size): if size == arg_size: node.replace_all_uses_with(node.args[0]) match.erase_nodes(graph) + + +# When softmax is used with temperature or other scaling, we get the pattern +# +# scale(x) - scale(x).amax(dim, keepdim=True) +# +# which is expected to be at most zero, but we may end up with numerical +# discrepancies # between the recomputed values of scale(x) inside and out +# of the reduction, # depending on compiler optimizations, e.g. use of fma +# instructions. +# +# Here we replace it with the mathematically equivalent, +# +# scale(x - x.amax(dim, keepdim=True)) +# +# which is more stable as we only compute the scaling once. +# +# NOTE: This pattern must come after fused attention matching! 
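The intent of the comment above, written out directly: scaling after the max subtraction means the scaled values are computed only once, so recompute effects (e.g. fma) cannot push the difference above zero, while the two forms agree mathematically. The sketch below simplifies to a positive scalar scale; the actual pattern also multiplies by the sign of the scale so that the argmax is unchanged for negative scales.

import torch

def scaled_logits_unstable(x, scale, dim=-1):
    # scale(x) - scale(x).amax(...): scale(x) appears twice and may be
    # recomputed slightly differently inside and outside the reduction.
    s = x * scale
    return s - s.amax(dim=dim, keepdim=True)

def scaled_logits_stable(x, scale, dim=-1):
    # scale(x - x.amax(...)): the scaling is applied exactly once.
    return (x - x.amax(dim=dim, keepdim=True)) * scale

x = torch.randn(2, 5)
print(torch.allclose(scaled_logits_unstable(x, 0.5),
                     scaled_logits_stable(x, 0.5), atol=1e-6))  # True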
+ + +def _partial_softmax_pattern(linear_func, reverse=False, to_dtype=False): + # Allow matching inp * other and other * input + if reverse: + scaled = CallFunction( + linear_func, KeywordArg("other"), KeywordArg("inp"), _users=MULTIPLE + ) + else: + scaled = CallFunction( + linear_func, KeywordArg("inp"), KeywordArg("other"), _users=MULTIPLE + ) + if to_dtype: + scaled = CallFunction( + prims.convert_element_type, scaled, KeywordArg("dtype"), _users=MULTIPLE + ) + amax = CallFunction( + aten.amax.default, scaled, KeywordArg("dim"), KeywordArg("keepdim") + ) + return CallFunction(aten.sub.Tensor, scaled, amax) + + +def _other_is_broadcasted_in_dim(match): + # Check that the scaling factor is constant across the reduction dim, + # so scaling doesn't change which index corresponds to the maximum value + other = match.kwargs["other"] + if isinstance(other, (int, float)): + return True + + inp = match.kwargs["inp"] + if not all(isinstance(x, torch.fx.Node) for x in (inp, other)): + return False + + inp_example = inp.meta["val"] + other_example = other.meta["val"] + if isinstance(other_example, (torch.SymInt, torch.SymFloat)): + return True + + if not all(isinstance(x, torch.Tensor) for x in (inp_example, other_example)): + return False + + inp_ndim = inp_example.ndim + other_shape = other_example.shape + if inp_ndim < len(other_shape): + return False + + # Pad other_shape to the same ndim as inp + other_shape = [1] * (inp_ndim - len(other_shape)) + list(other_shape) + + dim = match.kwargs["dim"] + if isinstance(dim, int): + dim = (dim,) + + return all(statically_known_true(other_shape[d] == 1) for d in dim) + + +def mul_softmax_pattern(match: Match, *, inp, other, dim, keepdim, dtype=None): + def repl(inp, other): + if dtype is not None: + inp = inp.to(dtype) + + sign: Union[int, float, torch.Tensor] + if isinstance(other, (int, float)): + sign = 1 if other >= 0 else -1 + else: + one = torch.scalar_tensor(1, dtype=inp.dtype, device=inp.device) + sign = torch.where(other >= 0, one, -one) + + inp = inp * sign + max_ = torch.amax(inp, dim=dim, keepdim=keepdim) + return (inp - max_) * (sign * other) + + with V.fake_mode: + match.replace_by_example(repl, [inp, other]) + + +for reverse, to_dtype in itertools.product((False, True), repeat=2): + register_graph_pattern( + _partial_softmax_pattern(aten.mul.Tensor, reverse=reverse, to_dtype=to_dtype), + pass_dict=pass_patterns[1], + extra_check=_other_is_broadcasted_in_dim, + )(mul_softmax_pattern) + + +def div_softmax_pattern(match: Match, *, inp, other, dim, keepdim, dtype=None): + def repl(inp, other): + if dtype is not None: + inp = inp.to(dtype) + + sign: Union[int, float, torch.Tensor] + if isinstance(other, (int, float)): + sign = 1 if other >= 0 else -1 + else: + one = torch.scalar_tensor(1, dtype=inp.dtype, device=inp.device) + sign = torch.where(other >= 0, one, -one) + + inp = inp * sign + max_ = torch.amax(inp, dim=dim, keepdim=keepdim) + return (inp - max_) / (sign * other) + + with V.fake_mode: + match.replace_by_example(repl, [inp, other]) + + +for to_dtype in (False, True): + register_graph_pattern( + _partial_softmax_pattern(aten.div.Tensor, to_dtype=to_dtype), + pass_dict=pass_patterns[1], + extra_check=_other_is_broadcasted_in_dim, + )(div_softmax_pattern) diff --git a/torch/_inductor/fx_passes/pad_mm.py b/torch/_inductor/fx_passes/pad_mm.py index e351d38d96ec0..df282629e2ce7 100644 --- a/torch/_inductor/fx_passes/pad_mm.py +++ b/torch/_inductor/fx_passes/pad_mm.py @@ -1,4 +1,5 @@ import functools +import operator from typing import 
List, Optional, Union import torch @@ -7,7 +8,7 @@ from torch._inductor import utils from torch._subclasses.fake_tensor import FakeTensor from torch.utils._mode_utils import no_dispatch -from torch.utils._triton import has_triton +from ...utils._triton import has_triton from ..pattern_matcher import fwd_only, gen_register_replacement, joint_fwd_bwd, Match @@ -111,32 +112,10 @@ def addmm_pattern( def should_pad_addmm(match: Match) -> bool: mat1, mat2, input = fetch_fake_tensors(match, ("mat1", "mat2", "input")) return should_pad_common(mat1, mat2, input) and should_pad_bench( - mat1, mat2, torch.ops.aten.addmm, input=input + match, mat1, mat2, torch.ops.aten.addmm, input=input ) -def addmm_replace( - input: Optional[Tensor], mat1: Tensor, mat2: Tensor, beta=1.0, alpha=1.0 -) -> Tensor: - m_padded_length = get_padded_length(mat1.shape[0], get_alignment_size(mat1)) - k_padded_length = get_padded_length(mat1.shape[1], get_alignment_size(mat1)) - n_padded_length = get_padded_length(mat2.shape[1], get_alignment_size(mat2)) - - if m_padded_length != 0 or k_padded_length != 0 or n_padded_length != 0: - return pad_addmm( - input, - mat1, - mat2, - m_padded_length, - k_padded_length, - n_padded_length, - beta, - alpha, - ) - - return aten.addmm(input, mat1, mat2, beta=beta, alpha=alpha) - - def pad_addmm( input: Optional[Tensor], mat1: Tensor, @@ -146,36 +125,55 @@ def pad_addmm( n_padded_length: int, beta=1.0, alpha=1.0, + mat1_pre_padded: bool = False, + mat2_pre_padded: bool = False, ): - # addmm decomp with padding will go through pad_addmm multiple times if multiple dimensions are needed to be padded - if k_padded_length != 0: - mat1 = pad_dim(mat1, k_padded_length, 1) - mat2 = pad_dim(mat2, k_padded_length, 0) - elif n_padded_length != 0: - mat2 = pad_dim(mat2, n_padded_length, 1) - elif m_padded_length != 0: - mat1 = pad_dim(mat1, m_padded_length, 0) + # for paddings, dim order is reversed for some reasons + # and for every dim, we need to specify left and right padding + if not mat1_pre_padded: + mat1 = pad_mat1( + mat1, m_padded_length=m_padded_length, k_padded_length=k_padded_length + ) + if not mat2_pre_padded: + mat2 = pad_mat2( + mat2, k_padded_length=k_padded_length, n_padded_length=n_padded_length + ) # the add broadcasts, so we only pad if the dimension != 1 - if input is not None and k_padded_length == 0: + if input is not None: if n_padded_length != 0: if input.dim() == 2 and input.shape[1] != 1: input = pad_dim(input, n_padded_length, 1) elif input.dim() == 1 and input.shape[0] != 1: input = pad_dim(input, n_padded_length, 0) - elif m_padded_length != 0 and input.dim() == 2 and input.shape[0] != 1: + if m_padded_length != 0 and input.dim() == 2 and input.shape[0] != 1: input = pad_dim(input, m_padded_length, 0) - if k_padded_length != 0: - return addmm_replace(input, mat1, mat2, beta=beta, alpha=alpha) - elif n_padded_length != 0: - return addmm_replace(input, mat1, mat2, beta=beta, alpha=alpha)[ - :, :-n_padded_length - ] - else: - return addmm_replace(input, mat1, mat2, beta=beta, alpha=alpha)[ - :-m_padded_length, : - ] + res = aten.addmm(input, mat1, mat2, beta=beta, alpha=alpha) + + if m_padded_length != 0: + res = res[:-m_padded_length, :] + if n_padded_length != 0: + res = res[:, :-n_padded_length] + return res + + +def addmm_replace( + input: Optional[Tensor], mat1: Tensor, mat2: Tensor, beta=1.0, alpha=1.0 +) -> Tensor: + k_padded_length = get_padded_length(mat1.shape[1], get_alignment_size(mat1)) + n_padded_length = get_padded_length(mat2.shape[1], 
get_alignment_size(mat2)) + m_padded_length = get_padded_length(mat1.shape[0], get_alignment_size(mat1)) + return pad_addmm( + input, + mat1, + mat2, + m_padded_length, + k_padded_length, + n_padded_length, + beta, + alpha, + ) def is_mm_compute_bound(M: int, K: int, N: int, dtype: torch.dtype) -> bool: @@ -216,16 +214,29 @@ def get_pad_cache(): return torch._inductor.codecache.LocalCache() -def get_cached_should_pad(key): +def get_cached_should_pad(key: str) -> bool: return get_pad_cache().lookup(key) -def set_cached_should_pad(key, value): +def set_cached_should_pad(key: str, value: bool): + return get_pad_cache().set_value(key, value=value) + + +def get_cached_base_mm_benchmark_time(key: str) -> float: + return get_pad_cache().lookup(key) + + +def set_cached_base_mm_benchmark_time(key: str, value: float): return get_pad_cache().set_value(key, value=value) def should_pad_bench_key( - mat1: Tensor, mat2: Tensor, op, input: Optional[Tensor] = None + match, + mat1: Tensor, + mat2: Tensor, + op, + input: Optional[Tensor] = None, + is_base_time_key=False, ) -> str: def tensor_key(t): return (t.shape, t.stride(), t.dtype) @@ -233,44 +244,80 @@ def tensor_key(t): tf32_key = ( None if mat1.dtype != torch.float32 else torch.backends.cuda.matmul.allow_tf32 ) + + def fmt_pad(name): + if is_base_time_key: + return None + return f"exclude_pad:{should_exclude_padding_time(match, name)}" + key = ( tensor_key(mat1), tensor_key(mat2), + fmt_pad("mat1"), + fmt_pad("mat2"), op, input if input is None else tensor_key(input), tf32_key, ) - return str(key) + key = str(key) + if is_base_time_key: + key = f"base mm time: {key}" + return key -def should_pad_bench( - mat1: Tensor, mat2: Tensor, op, input: Optional[Tensor] = None -) -> bool: - if not has_triton(): +def get_non_view_def(node): + if node.op == operator.getitem: + return get_non_view_def(node.args[0]) + + if ( + node.op == "call_function" + and isinstance(node.target, torch._ops.OpOverload) + and utils.is_view(node.target) + ): + return get_non_view_def(node.all_input_nodes[0]) + + return node + + +def should_exclude_padding_time(match, arg_name): + node_def = get_non_view_def(match.kwargs[arg_name]) + + # constant padding converts tensors to contiguous so even if the input tensor + # can be planned layout transform is not free. TODO - way to pad and preserve layout ? 
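A simplified version of the padding scheme used by pad_mm/pad_addmm/pad_bmm above: pad K on both operands and M/N on one operand each (the pad spec lists left/right pairs starting from the last dim), run the matmul once, then slice the padding back off. get_padded_length and get_alignment_size are the real helpers; the fixed align=8 below is only for illustration.

import torch
import torch.nn.functional as F

def padded_len(n: int, align: int = 8) -> int:
    # Illustrative stand-in for get_padded_length(n, get_alignment_size(...)).
    return 0 if n % align == 0 else align - n % align

def pad_mm_sketch(mat1: torch.Tensor, mat2: torch.Tensor) -> torch.Tensor:
    m_pad = padded_len(mat1.shape[0])
    k_pad = padded_len(mat1.shape[1])
    n_pad = padded_len(mat2.shape[1])
    # Pad spec is (last-dim-left, last-dim-right, second-to-last-left, ...).
    mat1 = F.pad(mat1, [0, k_pad, 0, m_pad])
    mat2 = F.pad(mat2, [0, n_pad, 0, k_pad])
    res = torch.mm(mat1, mat2)
    if m_pad:
        res = res[:-m_pad, :]
    if n_pad:
        res = res[:, :-n_pad]
    return res

a, b = torch.randn(13, 30), torch.randn(30, 17)
print(torch.allclose(pad_mm_sketch(a, b), a @ b, atol=1e-5))  # True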
+ if not fetch_fake_tensors(match, (arg_name,))[0].is_contiguous(): return False + # optimistically assume we should be able to memory plan away + # all non inputs + return node_def.op != "placeholder" + + +def should_pad_bench( + match, mat1: Tensor, mat2: Tensor, op, input: Optional[Tensor] = None +) -> bool: do_bench = functools.partial( torch._inductor.runtime.runtime_utils.do_bench_gpu, warmup=5, ) - + m_padded_length = 0 + n_padded_length = 0 + batchsize = 1 with no_dispatch(): if op is torch.ops.aten.mm or op is torch.ops.aten.addmm: m = mat1.shape[0] k = mat1.shape[1] n = mat2.shape[1] - - m_padded_length = get_padded_length(m, get_alignment_size(mat1)) k_padded_length = get_padded_length(k, get_alignment_size(mat1)) n_padded_length = get_padded_length(n, get_alignment_size(mat2)) + m_padded_length = get_padded_length(m, get_alignment_size(mat1)) elif op is torch.ops.aten.bmm: + batchsize = mat1.shape[0] m = mat1.shape[1] k = mat1.shape[2] n = mat2.shape[2] - - m_padded_length = get_padded_length(m, get_alignment_size(mat1)) k_padded_length = get_padded_length(k, get_alignment_size(mat1)) + m_padded_length = get_padded_length(m, get_alignment_size(mat1)) n_padded_length = get_padded_length(n, get_alignment_size(mat2)) else: return False @@ -278,12 +325,18 @@ def should_pad_bench( if m_padded_length == k_padded_length == n_padded_length == 0: return False + if torch._inductor.config.force_shape_pad: + return True + + if not has_triton(): + return False + if not is_mm_compute_bound(m, k, n, mat1.dtype): return False # We don't want to look up the cache for cases that are trivially false # since it does file io - key = should_pad_bench_key(mat1, mat2, op, input) + key = should_pad_bench_key(match, mat1, mat2, op, input) cached_pad = get_cached_should_pad(key) if cached_pad is not None: @@ -306,19 +359,48 @@ def realize_tensor(t): mat1 = realize_tensor(mat1) mat2 = realize_tensor(mat2) - if op is torch.ops.aten.bmm or op is torch.ops.aten.mm: - ori_time = do_bench( - lambda: op(mat1, mat2), - ) - else: - if input is not None: - input = realize_tensor(input) - ori_time = do_bench( - lambda: op(input, mat1, mat2), + + # since we key on whether or not the inputs can be memory planned, set cache for the + # original time which is unaffected by whether or not the input can be planned + ori_time_key = should_pad_bench_key( + match, mat1, mat2, op, input, is_base_time_key=True + ) + ori_time = get_cached_base_mm_benchmark_time(ori_time_key) + if ori_time is None: + if op is torch.ops.aten.bmm or op is torch.ops.aten.mm: + ori_time = do_bench( + lambda: op(mat1, mat2), + ) + else: + if input is not None: + # realize bias for addmm + input = realize_tensor(input) + ori_time = do_bench( + lambda: op(input, mat1, mat2), + ) + set_cached_base_mm_benchmark_time(ori_time_key, ori_time) + + mat1_pad = mat1 + mat2_pad = mat2 + + is_bmm = op is torch.ops.aten.bmm + mat1_pre_padded = should_exclude_padding_time(match, "mat1") + if mat1_pre_padded: + mat1_pad = pad_mat1( + mat1_pad, + m_padded_length=m_padded_length, + k_padded_length=k_padded_length, + is_bmm=is_bmm, ) - mat1_pad = torch.randn_like(mat1) - mat2_pad = torch.randn_like(mat2) + mat2_pre_padded = should_exclude_padding_time(match, "mat2") + if mat2_pre_padded: + mat2_pad = pad_mat2( + mat2_pad, + k_padded_length=k_padded_length, + n_padded_length=n_padded_length, + is_bmm=is_bmm, + ) if op is torch.ops.aten.addmm: input_pad = None @@ -332,6 +414,8 @@ def realize_tensor(t): m_padded_length, k_padded_length, n_padded_length, + 
mat1_pre_padded=mat1_pre_padded, + mat2_pre_padded=mat2_pre_padded, ), ) elif op is torch.ops.aten.mm: @@ -342,6 +426,8 @@ def realize_tensor(t): m_padded_length, k_padded_length, n_padded_length, + mat1_pre_padded=mat1_pre_padded, + mat2_pre_padded=mat2_pre_padded, ), ) else: @@ -352,6 +438,8 @@ def realize_tensor(t): m_padded_length, k_padded_length, n_padded_length, + mat1_pre_padded=mat1_pre_padded, + mat2_pre_padded=mat2_pre_padded, ), ) @@ -371,16 +459,29 @@ def mm_pattern(mat1: Tensor, mat2: Tensor) -> Tensor: def should_pad_mm(match: Match) -> bool: mat1, mat2 = fetch_fake_tensors(match, ("mat1", "mat2")) return should_pad_common(mat1, mat2) and should_pad_bench( - mat1, mat2, torch.ops.aten.mm + match, mat1, mat2, torch.ops.aten.mm ) -def mm_replace(mat1: Tensor, mat2: Tensor) -> Tensor: - m_padded_length = get_padded_length(mat1.shape[0], get_alignment_size(mat1)) - k_padded_length = get_padded_length(mat1.shape[1], get_alignment_size(mat1)) - n_padded_length = get_padded_length(mat2.shape[1], get_alignment_size(mat2)) +def pad_mat1(mat1, *, m_padded_length, k_padded_length, is_bmm=False): + if k_padded_length != 0 or m_padded_length != 0: + # dim order is reversed for constant_pad_nd, for every dim we specify right and left padding + pad_arg = [0, k_padded_length, 0, m_padded_length] + if is_bmm: + pad_arg.extend((0, 0)) + return aten.constant_pad_nd(mat1, pad_arg) + return mat1 - return pad_mm(mat1, mat2, m_padded_length, k_padded_length, n_padded_length) + +def pad_mat2(mat2, *, k_padded_length, n_padded_length, is_bmm=False): + if k_padded_length != 0 or n_padded_length != 0: + # dim order is reversed for constant_pad_nd, for every dim we specify right and left padding + pad_arg = [0, n_padded_length, 0, k_padded_length] + if is_bmm: + pad_arg.extend((0, 0)) + return aten.constant_pad_nd(mat2, pad_arg) + else: + return mat2 def pad_mm( @@ -389,18 +490,36 @@ def pad_mm( m_padded_length: int, k_padded_length: int, n_padded_length: int, + mat1_pre_padded: bool = False, + mat2_pre_padded: bool = False, ) -> Tensor: - # mm_replace will go through pad_mm multiple times if multiple dimensions are needed to be padded - if k_padded_length != 0: - mat1 = pad_dim(mat1, k_padded_length, 1) - mat2 = pad_dim(mat2, k_padded_length, 0) - return torch.ops.aten.mm(mat1, mat2) - elif n_padded_length != 0: - mat2 = pad_dim(mat2, n_padded_length, 1) - return torch.ops.aten.mm(mat1, mat2)[:, :-n_padded_length] - else: - mat1 = pad_dim(mat1, m_padded_length, 0) - return torch.ops.aten.mm(mat1, mat2)[:-m_padded_length, :] + if not mat1_pre_padded: + mat1 = pad_mat1( + mat1, m_padded_length=m_padded_length, k_padded_length=k_padded_length + ) + if not mat2_pre_padded: + mat2 = pad_mat2( + mat2, k_padded_length=k_padded_length, n_padded_length=n_padded_length + ) + res = aten.mm(mat1, mat2) + if m_padded_length != 0: + res = res[:-m_padded_length, :] + if n_padded_length != 0: + res = res[:, :-n_padded_length] + return res + + +def mm_replace(mat1: Tensor, mat2: Tensor) -> Tensor: + k_padded_length = get_padded_length(mat1.shape[1], get_alignment_size(mat1)) + m_padded_length = get_padded_length(mat1.shape[0], get_alignment_size(mat1)) + n_padded_length = get_padded_length(mat2.shape[1], get_alignment_size(mat2)) + return pad_mm( + mat1, + mat2, + m_padded_length, + k_padded_length, + n_padded_length, + ) def bmm_pattern(mat1: Tensor, mat2: Tensor) -> Tensor: @@ -410,40 +529,52 @@ def bmm_pattern(mat1: Tensor, mat2: Tensor) -> Tensor: def should_pad_bmm(match: Match) -> bool: mat1, mat2 = 
fetch_fake_tensors(match, ("mat1", "mat2")) return should_pad_common(mat1, mat2) and should_pad_bench( - mat1, mat2, torch.ops.aten.bmm + match, mat1, mat2, torch.ops.aten.bmm ) -def bmm_replace(mat1: Tensor, mat2: Tensor) -> Tensor: - m_padded_length = get_padded_length(mat1.shape[1], get_alignment_size(mat1)) - k_padded_length = get_padded_length(mat1.shape[2], get_alignment_size(mat1)) - n_padded_length = get_padded_length(mat2.shape[2], get_alignment_size(mat2)) - - if m_padded_length != 0 or k_padded_length != 0 or n_padded_length != 0: - return pad_bmm(mat1, mat2, m_padded_length, k_padded_length, n_padded_length) - - return aten.bmm(mat1, mat2) - - def pad_bmm( mat1: Tensor, mat2: Tensor, m_padded_length: int, k_padded_length: int, n_padded_length: int, + mat1_pre_padded: bool = False, + mat2_pre_padded: bool = False, ) -> Tensor: - # bmm_replace will go through pad_bmm multiple times if multiple dimensions are needed to be padded - if k_padded_length != 0: - mat1 = pad_dim(mat1, k_padded_length, 2) - mat2 = pad_dim(mat2, k_padded_length, 1) - - return aten.bmm(mat1, mat2) - elif n_padded_length != 0: - mat2 = pad_dim(mat2, n_padded_length, 2) - return aten.bmm(mat1, mat2)[:, :, :-n_padded_length].contiguous() - else: - mat1 = pad_dim(mat1, m_padded_length, 1) - return aten.bmm(mat1, mat2)[:, :-m_padded_length, :].contiguous() + if not mat1_pre_padded: + mat1 = pad_mat1( + mat1, + m_padded_length=m_padded_length, + k_padded_length=k_padded_length, + is_bmm=True, + ) + if not mat2_pre_padded: + mat2 = pad_mat2( + mat2, + k_padded_length=k_padded_length, + n_padded_length=n_padded_length, + is_bmm=True, + ) + res = aten.bmm(mat1, mat2) + if m_padded_length != 0: + res = res[:, :-m_padded_length, :] + if n_padded_length != 0: + res = res[:, :, :-n_padded_length] + return res + + +def bmm_replace(mat1: Tensor, mat2: Tensor) -> Tensor: + k_padded_length = get_padded_length(mat1.shape[2], get_alignment_size(mat1)) + n_padded_length = get_padded_length(mat2.shape[2], get_alignment_size(mat2)) + m_padded_length = get_padded_length(mat1.shape[1], get_alignment_size(mat1)) + return pad_bmm( + mat1, + mat2, + m_padded_length, + k_padded_length, + n_padded_length, + ) @functools.lru_cache(None) diff --git a/torch/_inductor/index_propagation.py b/torch/_inductor/index_propagation.py index 6bc5def57d650..ea22955edbb6a 100644 --- a/torch/_inductor/index_propagation.py +++ b/torch/_inductor/index_propagation.py @@ -82,6 +82,10 @@ def to_dtype( ) -> TypedExpr: return TypedExpr(value.expr, dtype) + @staticmethod + def abs(x: TypedExpr) -> TypedExpr: + return TypedExpr(abs(x.expr), x.dtype) # type: ignore[arg-type] + @staticmethod def square(x: TypedExpr) -> TypedExpr: return TypedExpr(x.expr * x.expr, x.dtype) diff --git a/torch/_inductor/kernel/bmm.py b/torch/_inductor/kernel/bmm.py index 143c616fcb84e..a8650cd32c3f0 100644 --- a/torch/_inductor/kernel/bmm.py +++ b/torch/_inductor/kernel/bmm.py @@ -59,8 +59,15 @@ def bmm_grid(b, m, n, meta): rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) - rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + if (stride_am == 1 and stride_ak == M) or (stride_am == K and stride_ak == 1): + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + else: + ram = rm % M + if (stride_bk == 1 and stride_bn == K) or (stride_bk == N and stride_bn == 1): + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + else: + rbn = rn % 
N + rk = tl.arange(0, BLOCK_K) idx_q = tl.program_id(1) # batch dimension for BMM diff --git a/torch/_inductor/kernel/conv.py b/torch/_inductor/kernel/conv.py index 37a760a90e1e0..44aef074457a2 100644 --- a/torch/_inductor/kernel/conv.py +++ b/torch/_inductor/kernel/conv.py @@ -5,6 +5,7 @@ from typing import cast, List, Optional, Sequence, Tuple, TYPE_CHECKING, TypedDict import torch + from .. import config, ir from ..lowering import ( @@ -245,11 +246,11 @@ def conv_layout( ir.ir_node_to_tensor(x, guard_shape=True), ir.ir_node_to_tensor(weight, guard_shape=True), ir.ir_node_to_tensor(bias, guard_shape=True), - stride, - tuple(V.graph.sizevars.size_hint(p) for p in padding), # type: ignore[arg-type] + V.graph.sizevars.size_hints(stride), # type: ignore[arg-type] + V.graph.sizevars.size_hints(padding), # type: ignore[arg-type] dilation, transposed, - tuple(V.graph.sizevars.size_hint(p) for p in output_padding), # type: ignore[arg-type] + V.graph.sizevars.size_hints(output_padding), # type: ignore[arg-type] groups, ) sizes = ir.convert_shape_to_inductor(output.size()) diff --git a/torch/_inductor/kernel/flex_attention.py b/torch/_inductor/kernel/flex_attention.py index 15a99faa7b37f..a780d3709cb0c 100644 --- a/torch/_inductor/kernel/flex_attention.py +++ b/torch/_inductor/kernel/flex_attention.py @@ -162,7 +162,7 @@ def sdpa_grid(batch_size, num_heads, num_queries, d_model, meta): # TODO generalize and add proper mask support mask = (idx_m != -1) & (idx_d != -1) - {{store_output(("idx_z", "idx_h", "idx_m", "idx_d"), "acc")}} + {{store_output(("idx_z", "idx_h", "idx_m", "idx_d"), "acc", "mask")}} # TODO dont want to write this if we dont require grad if OUTPUT_LOGSUMEXP: diff --git a/torch/_inductor/kernel/mm.py b/torch/_inductor/kernel/mm.py index be1177393df93..fa14b4406de69 100644 --- a/torch/_inductor/kernel/mm.py +++ b/torch/_inductor/kernel/mm.py @@ -65,8 +65,14 @@ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) - rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + if (stride_am == 1 and stride_ak == M) or (stride_am == K and stride_ak == 1): + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + else: + ram = rm % M + if (stride_bk == 1 and stride_bn == K) or (stride_bk == N and stride_bn == 1): + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + else: + rbn = rn % N rk = tl.arange(0, BLOCK_K) A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) diff --git a/torch/_inductor/kernel/mm_common.py b/torch/_inductor/kernel/mm_common.py index 5a7f60e59102f..26d08183b0e55 100644 --- a/torch/_inductor/kernel/mm_common.py +++ b/torch/_inductor/kernel/mm_common.py @@ -178,14 +178,14 @@ def filtered_configs( if config["cond"] ) -# On ROCm convert num_stages to 1 as pipelining provides no benefit +# On ROCm convert num_stages to 0 to enable software pipelining if torch.version.hip: mm_platform_configs = tuple( - (config[0], config[1], config[2], 1, config[4]) + (config[0], config[1], config[2], 0, config[4]) for config in mm_platform_configs ) int8_platform_configs = tuple( - (config[0], config[1], config[2], 1, config[4]) + (config[0], config[1], config[2], 0, config[4]) for config in mm_platform_configs ) diff --git a/torch/_inductor/kernel/mm_plus_mm.py b/torch/_inductor/kernel/mm_plus_mm.py index 95ef6f043dfce..931aa592556bd 100644 --- 
a/torch/_inductor/kernel/mm_plus_mm.py +++ b/torch/_inductor/kernel/mm_plus_mm.py @@ -54,8 +54,19 @@ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) - rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + + if (((stride_am == 1 and stride_ak == M) or (stride_am == K1 and stride_ak == 1)) + and ((stride_cm == 1 and stride_ck == M) or (stride_cm == K1 and stride_ck == 1))): + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + else: + ram = rm % M + + if (((stride_bk == 1 and stride_bn == K1) or (stride_bk == N and stride_bn == 1)) + and ((stride_dk == 1 and stride_dn == K1) or (stride_dk == N and stride_dn == 1))): + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + else: + rbn = rn % N + rk = tl.arange(0, BLOCK_K) A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py index e4001aa3b27b8..389ff16e39025 100644 --- a/torch/_inductor/lowering.py +++ b/torch/_inductor/lowering.py @@ -1342,21 +1342,6 @@ def unwrap_tensor(x: Union[TensorBox, ir.StorageBox]) -> ir.IRNode: return x - def should_lower_cat_input(x) -> bool: - # Unrealized inputs will not be storage and layouts, and we dont want to realize - # them in case we want to fuse - if ir.is_storage_and_layout(x): - storage, _ = ir.as_storage_and_layout(x, freeze=False) - return not ir.ConcatKernel.can_realize_into_without_copy(storage) - - if isinstance(x, (TensorBox, ir.StorageBox)): - return should_lower_cat_input(unwrap_tensor(x)) - - if isinstance(x, ir.Pointwise): - return True - - return False - def is_reduction(t): return isinstance(t, ir.ComputedBuffer) and isinstance(t.data, ir.Reduction) @@ -1375,9 +1360,24 @@ def can_fuse_reduction(t): # fusing reducutions into computed concat buffer can cause regressions. fusable_reduction = any(can_fuse_reduction(t) for t in inputs) + def should_lower_cat_input(x) -> bool: + # Unrealized inputs will not be storage and layouts, and we dont want to realize + # them in case we want to fuse + if ir.is_storage_and_layout(x): + storage, _ = ir.as_storage_and_layout(x, freeze=False) + return not ir.ConcatKernel.can_realize_into_without_copy(storage) + + if isinstance(x, (TensorBox, ir.StorageBox)): + return should_lower_cat_input(unwrap_tensor(x)) + + if isinstance(x, ir.Pointwise): + return True + + return False + # TODO: We observed negative performance impact of pointwise_cat optimization on CPU so disabled it. # We will revisit this later after enabling vectorization on index_expr. - if cpu_device or fusable_reduction: + if cpu_device: return TensorBox(ir.ConcatKernel.create(inputs, dim)) def op_count(x): @@ -1406,10 +1406,18 @@ def op_count(x): and all(op_count(t) <= MAX_SIMPLE_OP_COUNT for t in inputs) ): pointwise_uses = all(is_pointwise_use(use) for use in V.current_node.users) - all_pointwise_inputs = all(should_lower_cat_input(inp) for inp in inputs) - any_pointwise_inputs = any(should_lower_cat_input(inp) for inp in inputs) + # fuse in case we will be used in a pointwise node, and there are any inputs we + # we can prevent materialization of. + fuse_pointwise_use = ( + any(should_lower_cat_input(inp) for inp in inputs) and pointwise_uses + ) - if all_pointwise_inputs or (any_pointwise_inputs and pointwise_uses): + # horizontal fuse in case all inputs will require a copy kernel anyway. 
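The stride guards added to the mm, bmm and mm_plus_mm Triton templates above only emit the tl.max_contiguous / tl.multiple_of hints when an operand is laid out row- or column-major; otherwise they fall back to a plain modulo. Expressed outside Triton, the condition for a single (rows, cols) operand is simply:

def is_row_or_col_major_2d(stride_row, stride_col, n_rows, n_cols) -> bool:
    # Mirrors e.g. the guard for A of shape (M, K):
    # (stride_am == 1 and stride_ak == M) or (stride_am == K and stride_ak == 1)
    return (stride_row == 1 and stride_col == n_rows) or (
        stride_row == n_cols and stride_col == 1
    )

# A contiguous (8, 16) tensor has strides (16, 1):
print(is_row_or_col_major_2d(16, 1, n_rows=8, n_cols=16))  # True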
+ # only horizontally fuse pointwise kernels + horizontal_fuse_cat = all( + should_lower_cat_input(inp) for inp in inputs + ) and not any(can_fuse_reduction(t) for t in inputs) + if fuse_pointwise_use or (horizontal_fuse_cat and not fusable_reduction): return pointwise_cat(inputs, dim) return TensorBox(ir.ConcatKernel.create(inputs, dim)) diff --git a/torch/_inductor/runtime/triton_heuristics.py b/torch/_inductor/runtime/triton_heuristics.py index 53b2790df20c2..d7fb163cd589f 100644 --- a/torch/_inductor/runtime/triton_heuristics.py +++ b/torch/_inductor/runtime/triton_heuristics.py @@ -1086,17 +1086,18 @@ def cached_autotune( ) best_config = None - if cache_filename is not None and os.path.exists(cache_filename): - with open(cache_filename) as fd: - best_config = json.loads(fd.read()) - elif remote_cache is not None and remote_cache_key is not None: - best_config = remote_cache.get(remote_cache_key) - - best_config = load_cached_autotuning( - best_config, configs_hash, configs, inductor_meta - ) - if best_config: - configs = [best_config] + if not inductor_meta.get("force_disable_caches", False): + if cache_filename is not None and os.path.exists(cache_filename): + with open(cache_filename) as fd: + best_config = json.loads(fd.read()) + elif remote_cache is not None and remote_cache_key is not None: + best_config = remote_cache.get(remote_cache_key) + + best_config = load_cached_autotuning( + best_config, configs_hash, configs, inductor_meta + ) + if best_config: + configs = [best_config] def save_cache_hook(cfg, time_taken_ns, found_by_coordesc=False): data = { diff --git a/torch/_inductor/select_algorithm.py b/torch/_inductor/select_algorithm.py index 0bcc166982a18..d1550529bb8ee 100644 --- a/torch/_inductor/select_algorithm.py +++ b/torch/_inductor/select_algorithm.py @@ -1499,7 +1499,7 @@ def autotune_select_algorithm(*args, **kwargs): if "return_multi_template" not in kwargs: kwargs[ "return_multi_template" - ] = torch._inductor.config.benchmark_multi_templates + ] = torch._inductor.config.benchmark_epilogue_fusion return _ALGORITHM_SELECTOR_CACHE(*args, **kwargs) diff --git a/torch/_inductor/utils.py b/torch/_inductor/utils.py index b770d51d67cc1..917dbfc3dd193 100644 --- a/torch/_inductor/utils.py +++ b/torch/_inductor/utils.py @@ -8,6 +8,7 @@ import inspect import io import itertools +import json import logging import math import operator @@ -21,6 +22,7 @@ import unittest from datetime import datetime from io import StringIO +from pathlib import Path from typing import ( Any, Callable, @@ -32,6 +34,7 @@ Optional, Protocol, Set, + Tuple, TypeVar, Union, ValuesView, @@ -42,6 +45,8 @@ from typing_extensions import Concatenate, ParamSpec import torch +import torch._export +import torch.utils._pytree as pytree from torch._dynamo.device_interface import get_interface_for_device from torch._dynamo.utils import detect_fake_mode from torch.autograd import DeviceType @@ -51,7 +56,7 @@ from torch.utils._sympy.symbol import make_symbol, SymT from torch.utils._sympy.value_ranges import bound_sympy, ValueRanges from . import config -from .runtime.runtime_utils import ceildiv as runtime_ceildiv +from .runtime.runtime_utils import cache_dir, ceildiv as runtime_ceildiv log = logging.getLogger(__name__) @@ -1524,7 +1529,7 @@ def should_assume_input_aligned(example_input: torch.Tensor): # See Note: [Input Alignment handling in Inductor] # right now, we only care about alignment for cuda tensors. 
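The cache sites touched above (FxGraphCache.load in compile_fx, the best-config lookup in cached_autotune, and the fresh-cache decorator) all key off the same switch, TORCHINDUCTOR_FORCE_DISABLE_CACHES. A minimal sketch of that gating pattern; the cache file and lookup below are illustrative only, not the real autotune cache:

import json
import os

FORCE_DISABLE_CACHES = os.environ.get("TORCHINDUCTOR_FORCE_DISABLE_CACHES") == "1"

def load_best_config(cache_filename):
    if FORCE_DISABLE_CACHES:
        return None  # skip every cache lookup and force re-autotuning
    if cache_filename is not None and os.path.exists(cache_filename):
        with open(cache_filename) as fd:
            return json.loads(fd.read())
    return None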
- if example_input.device.type != "cuda": + if not is_gpu(example_input.device.type): return False return config.assume_aligned_inputs or tensor_is_aligned(example_input) @@ -1544,3 +1549,140 @@ def maybe_get_suppress_shape_guards_ctx(): return contextlib.nullcontext() return shape_env.suppress_guards() + + +def aoti_eager_cache_dir(namespace: str, device: str): + return Path(cache_dir()) / "aoti_eager" / namespace / device + + +def aoti_eager_op_conf_lock(op_func_name_with_overload: str): + from filelock import FileLock + + # Avoid circular import + from torch._inductor.codecache import get_lock_dir, LOCK_TIMEOUT + + op_conf_lock_file = f"{op_func_name_with_overload}.lock" + lock_dir = get_lock_dir() + return FileLock(os.path.join(lock_dir, op_conf_lock_file), timeout=LOCK_TIMEOUT) + + +def load_aoti_eager_cache(ns: str, op_func_name_with_overload: str, device_type: str): + device_kernel_cache = aoti_eager_cache_dir(ns, device_type) + op_conf = device_kernel_cache / f"{op_func_name_with_overload}.json" + if not op_conf.exists(): + return [] + + with aoti_eager_op_conf_lock(op_func_name_with_overload): + with open(op_conf) as f: + json_data = json.load(f) + for item in json_data: + # Get absolution path for kernel library + kernel_lib_abs_path = device_kernel_cache / item["kernel_path"] + item["kernel_path"] = kernel_lib_abs_path.as_posix() + + # Check if the kernel library exists + if not kernel_lib_abs_path.exists(): + return [] + + for metadata in item["meta_info"]: + assert not metadata[ + "is_dynamic" + ], "Only support static shape for now" + if metadata["device_type"] == "cpu": + metadata["device_index"] = -1 + metadata["dtype"] = getattr(torch, metadata["dtype"].split(".")[-1]) + + return json_data + + +def aoti_compile_with_persistent_cache( + ns: str, + op_func_name_with_overload: str, + device_type: str, + dynamic: bool, + f: Callable[..., Any], + args: Tuple[Any], + kwargs: Dict[str, Any], + *, + dynamic_shapes: Optional[Dict[str, Any]] = None, + options: Optional[Dict[str, Any]] = None, + remove_runtime_assertions: bool = False, + disable_constraint_solver: bool = False, +): + """ + Compile the given function with persistent cache for AOTI eager mode. 
+ """ + flattened_inputs = pytree.arg_tree_leaves(*args, **kwargs) + assert all( + isinstance(input, torch.Tensor) for input in flattened_inputs + ), "Only support tensor for now" + assert not dynamic, "Only support static shape for now" + + persistent_cache = aoti_eager_cache_dir(ns, device_type) + persistent_cache.mkdir(parents=True, exist_ok=True) + persistent_cache_lib = persistent_cache / "lib" + persistent_cache_lib.mkdir(parents=True, exist_ok=True) + + with mock.patch.dict( + os.environ, + {"TORCHINDUCTOR_CACHE_DIR": persistent_cache_lib.absolute().as_posix()}, + ): + try: + kernel_lib_path = torch._export.aot_compile( + f, + args, + kwargs, + dynamic_shapes=dynamic_shapes, + options=options, + remove_runtime_assertions=remove_runtime_assertions, + disable_constraint_solver=disable_constraint_solver, + ) + + kernel_metadata_items = [] + for input_tensor in flattened_inputs: + # TODO(Eikan): To add dynamic support + metadata: Dict[str, Any] = {} + metadata["is_dynamic"] = dynamic + metadata["device_type"] = f"{input_tensor.device.type}" + if is_cpu_device([input_tensor]): + metadata["device_index"] = -1 + else: + metadata["device_index"] = input_tensor.device.index + metadata["dtype"] = f"{input_tensor.dtype}" + metadata["sizes"] = list(input_tensor.size()) + metadata["strides"] = list(input_tensor.stride()) + kernel_metadata_items.append(metadata) + + kernel_meta_info: Dict[str, Any] = {} + kernel_meta_info["meta_info"] = kernel_metadata_items + kernel_meta_info["kernel_path"] = ( + Path(kernel_lib_path).relative_to(persistent_cache).as_posix() + ) + + json_data = [] + update_json = True + op_conf = persistent_cache / f"{op_func_name_with_overload}.json" + mode = "r" if op_conf.exists() else "w" + with aoti_eager_op_conf_lock(op_func_name_with_overload): + with open(op_conf, mode) as op_conf_file: + try: + json_data = json.load(op_conf_file) + except Exception as e: + json_data = [] + + assert isinstance(json_data, list) + for item in json_data: + assert isinstance(item, dict) + # Same kernel meta info already exists in the json file + if item["meta_info"] == kernel_metadata_items: + update_json = False + break + + if update_json: + json_data.append(kernel_meta_info) + with open(op_conf, "w") as op_conf_file: + json.dump(json_data, op_conf_file, indent=4) + + return kernel_lib_path + except Exception as e: + return "" diff --git a/torch/_ops.py b/torch/_ops.py index f5d7313591dbd..0b19c75a51aa6 100644 --- a/torch/_ops.py +++ b/torch/_ops.py @@ -412,14 +412,22 @@ def key_extractor(tensors, key_mask): # Mode stack for PreDispatchKey -# it should always have two keys with +# it should always have three keys with # priority given to FunctionalTensorMode and # then ProxyTorchDispatchMode. It means that # slot 0 belongs to ProxyTorchDispatchMode and # slot 1 belongs to FunctionalTensorMode. +# +# SchemaCheckMode is separate from the other 2, +# and is only valid when the stack is empty. +# SchemaCheckMode is for testing purposes, and +# is meant to run in eager mode on concrete inputs, +# checking for incorrect schemas in regards to +# aliasing or mutating ops. 
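Stepping back to the aoti_compile_with_persistent_cache / load_aoti_eager_cache routines above: each compiled kernel library gets a JSON entry recording the input tensors it was compiled for, and duplicate entries are skipped. A reduced sketch of that metadata record; the paths are hypothetical and the per-op file lock (filelock.FileLock via get_lock_dir) is omitted for brevity:

import json
from pathlib import Path
from typing import Any, Dict, List

import torch

def tensor_metadata(t: torch.Tensor) -> Dict[str, Any]:
    # Mirrors the per-input record written by aoti_compile_with_persistent_cache.
    return {
        "is_dynamic": False,
        "device_type": t.device.type,
        "device_index": -1 if t.device.type == "cpu" else t.device.index,
        "dtype": str(t.dtype),
        "sizes": list(t.size()),
        "strides": list(t.stride()),
    }

def register_kernel(op_conf: Path, kernel_path: str, inputs: List[torch.Tensor]):
    entry = {"meta_info": [tensor_metadata(t) for t in inputs],
             "kernel_path": kernel_path}
    op_conf.parent.mkdir(parents=True, exist_ok=True)
    data = json.loads(op_conf.read_text()) if op_conf.exists() else []
    if entry["meta_info"] not in [item["meta_info"] for item in data]:
        data.append(entry)
        op_conf.write_text(json.dumps(data, indent=4))

# Example usage with a hypothetical cache location:
register_kernel(Path("/tmp/aoti_eager/aten/cpu/add.Tensor.json"),
                "lib/add.so", [torch.randn(4), torch.randn(4)])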
class _ModeStackStateForPreDispatch: def __init__(self): self.__infra_modes = [None, None] + self._schema_check_mode = None def set(self, index, mode): assert index < len(self.__infra_modes) @@ -430,28 +438,36 @@ def get(self, index): return self.__infra_modes[index] def count(self): - return len([i for i in self.__infra_modes if i is not None]) + return len([i for i in self.__infra_modes if i is not None]) + int( + self._schema_check_mode is not None + ) _mode_stack_state_for_pre_dispatch = _ModeStackStateForPreDispatch() -def unset_mode_pre_dispatch(mode_key): +def unset_mode_pre_dispatch(mode_key, schema_check=False): current_mode_stack_pre_dispatch = mode_stack_state_for_pre_dispatch() - assert mode_key in ( + assert mode_key is None or mode_key in ( torch._C._TorchDispatchModeKey.PROXY, torch._C._TorchDispatchModeKey.FUNCTIONAL, ) + if schema_check: + assert mode_key is None def _unset_mode(): if mode_key == torch._C._TorchDispatchModeKey.PROXY: current_mode = current_mode_stack_pre_dispatch.get(0) mode_stack_state_for_pre_dispatch().set(0, None) return current_mode - else: + elif mode_key == torch._C._TorchDispatchModeKey.FUNCTIONAL: current_mode = current_mode_stack_pre_dispatch.get(1) mode_stack_state_for_pre_dispatch().set(1, None) return current_mode + else: + current_mode = mode_stack_state_for_pre_dispatch()._schema_check_mode + mode_stack_state_for_pre_dispatch()._schema_check_mode = None + return current_mode current_mode = _unset_mode() @@ -470,12 +486,27 @@ def _unset_mode(): def _set_mode_pre_dispatch(mode): from torch._subclasses.functional_tensor import FunctionalTensorMode + from torch._subclasses.schema_check_mode import SchemaCheckMode from torch.fx.experimental.proxy_tensor import ProxyTorchDispatchMode - assert isinstance(mode, (FunctionalTensorMode, ProxyTorchDispatchMode)) + assert isinstance( + mode, + ( + FunctionalTensorMode, + ProxyTorchDispatchMode, + SchemaCheckMode, + ), + ) previous_mode_stack_len = _len_torch_dispatch_stack_pre_dispatch() - if isinstance(mode, FunctionalTensorMode): + if isinstance(mode, SchemaCheckMode): + current_mode = mode_stack_state_for_pre_dispatch()._schema_check_mode + if previous_mode_stack_len > 0: + raise AssertionError( + "SchemaCheckMode for pre-dispatch must be used exclusively, found other modes on the stack" + ) + mode_stack_state_for_pre_dispatch()._schema_check_mode = mode + elif isinstance(mode, FunctionalTensorMode): current_mode = mode_stack_state_for_pre_dispatch().get(1) assert current_mode is None mode_stack_state_for_pre_dispatch().set(1, mode) @@ -501,9 +532,10 @@ def _pop_mode_from_pre_dispatch(): if pre_dispatch_len == 0: raise AssertionError("Trying to pop empty mode stack") + if mode_stack._schema_check_mode is not None: + return unset_mode_pre_dispatch(None, schema_check=True) if mode_stack.get(1) is not None: return unset_mode_pre_dispatch(torch._C._TorchDispatchModeKey.FUNCTIONAL) - if mode_stack.get(0) is not None: return unset_mode_pre_dispatch(torch._C._TorchDispatchModeKey.PROXY) @@ -519,19 +551,23 @@ def _get_dispatch_mode_pre_dispatch(mode_key): ) if mode_key == torch._C._TorchDispatchModeKey.PROXY: return mode_stack_state_for_pre_dispatch().get(0) - return mode_stack_state_for_pre_dispatch().get(1) + else: + return mode_stack_state_for_pre_dispatch().get(1) def _get_current_dispatch_mode_pre_dispatch(): - stack_len = mode_stack_state_for_pre_dispatch().count() - if stack_len == 2: - return mode_stack_state_for_pre_dispatch().get(1) - if stack_len == 1: - return ( - 
mode_stack_state_for_pre_dispatch().get(1) - if mode_stack_state_for_pre_dispatch().get(1) is not None - else mode_stack_state_for_pre_dispatch().get(0) - ) + if mode_stack_state_for_pre_dispatch()._schema_check_mode is not None: + return mode_stack_state_for_pre_dispatch()._schema_check_mode + else: + stack_len = mode_stack_state_for_pre_dispatch().count() + if stack_len == 2: + return mode_stack_state_for_pre_dispatch().get(1) + if stack_len == 1: + return ( + mode_stack_state_for_pre_dispatch().get(1) + if mode_stack_state_for_pre_dispatch().get(1) is not None + else mode_stack_state_for_pre_dispatch().get(0) + ) return None diff --git a/torch/_streambase.py b/torch/_streambase.py index 5a0df2c22ba95..b06946523fa3b 100644 --- a/torch/_streambase.py +++ b/torch/_streambase.py @@ -5,27 +5,27 @@ class _StreamBase(ABC): r"""Base stream class abstraction for multi backends Stream to herit from""" @abstractmethod - def wait_event(self, event): + def wait_event(self, event) -> None: raise NotImplementedError @abstractmethod - def wait_stream(self, stream): + def wait_stream(self, stream) -> None: raise NotImplementedError @abstractmethod - def record_event(self, event=None): + def record_event(self, event=None) -> None: raise NotImplementedError @abstractmethod - def query(self): + def query(self) -> bool: raise NotImplementedError @abstractmethod - def synchronize(self): + def synchronize(self) -> None: raise NotImplementedError @abstractmethod - def __eq__(self, stream): + def __eq__(self, stream) -> bool: raise NotImplementedError @@ -33,13 +33,13 @@ class _EventBase(ABC): r"""Base Event class abstraction for multi backends Event to herit from""" @abstractmethod - def wait(self, stream=None): + def wait(self, stream=None) -> None: raise NotImplementedError @abstractmethod - def query(self): + def query(self) -> bool: raise NotImplementedError @abstractmethod - def synchronize(self): + def synchronize(self) -> None: raise NotImplementedError diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py index 3fdc4fc01e6b9..79c8e951edfcc 100644 --- a/torch/_subclasses/fake_tensor.py +++ b/torch/_subclasses/fake_tensor.py @@ -215,8 +215,8 @@ def tensor_memo(self): meta_converter: MetaConverter constant_storage_mapping: Dict[StorageWeakRef, List[ReferenceType]] - def __init__(self): - self.meta_converter = MetaConverter() + def __init__(self, *, copy_data=False): + self.meta_converter = MetaConverter(copy_data=copy_data) # map from to storage to corresponding constant tensors self.constant_storage_mapping = {} @@ -294,8 +294,6 @@ def from_real_tensor( assert not make_constant def mk_fake_tensor(make_meta_t): - from torch._dynamo.utils import clone_input - # NB: don't use in_kernel_invocation_manager. to # ensure FakeTensor can internally do constant computation # as necessary. Invocation manager is "more correct" as @@ -311,18 +309,6 @@ def mk_fake_tensor(make_meta_t): # TODO: callback might be used in recursive contexts, in # which case using t is wrong! BUG! constant=t if make_constant else None, - # TODO: This won't preserve aliasing relationships, so if - # there is mutation you won't see it reflect elsewhere. 
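For the torch/_ops.py changes above: the pre-dispatch stack keeps two fixed infra slots (0 = proxy, 1 = functional) plus a separate schema-check slot that must be used exclusively, and count() has to include all three. A toy model of that invariant, not the real class:

class ToyPreDispatchStack:
    def __init__(self):
        self.infra = [None, None]   # slot 0: proxy, slot 1: functional
        self.schema_check = None

    def count(self):
        return sum(m is not None for m in self.infra) + (
            self.schema_check is not None
        )

    def push_schema_check(self, mode):
        assert self.count() == 0, "schema-check mode must be used exclusively"
        self.schema_check = mode

stack = ToyPreDispatchStack()
stack.push_schema_check("SchemaCheckMode")
print(stack.count())  # 1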
- # This is fine because propagate_real_tensors isn't - # intended to give you exact results and some inaccuracy - # is OK, although if its use case expands we would want to - # do something similar to meta converter, but poking in - # real tensors at the storage cloning phase - real_tensor=( - (t if make_constant else clone_input(t)) - if fake_mode.propagate_real_tensors - else None - ), ) out = self.meta_converter( @@ -870,23 +856,26 @@ def __init__( ): log.debug("create_mode 0x%x", id(self)) self.allow_fallback_kernels = allow_fallback_kernels - self.fake_tensor_converter = FakeTensorConverter() + + import torch._dynamo.config + import torch._functorch.config + + self.propagate_real_tensors = ( + torch._functorch.config.fake_tensor_propagate_real_tensors + ) + self.fake_tensor_converter = FakeTensorConverter( + copy_data=self.propagate_real_tensors + ) + if static_shapes is not None: self.static_shapes = static_shapes else: self.static_shapes = shape_env is None - import torch._dynamo.config - import torch._functorch.config - # This is temporarily patched to True in Dynamo to grandfather in some # places where we unconditionally allow scalar outputs, TO BE REMOVED self.allow_scalar_outputs = False - self.propagate_real_tensors = ( - torch._functorch.config.fake_tensor_propagate_real_tensors - ) - self._allow_unsafe_data_ptr_access = ( torch._functorch.config.fake_tensor_allow_unsafe_data_ptr_access ) @@ -1552,7 +1541,7 @@ def maybe_to_real_tensor(t): func, flat_arg_fake_tensors, flat_args, - self.shape_env.unbacked_var_to_val, + self.shape_env.unbacked_var_to_val if self.shape_env else None, ) def maybe_propagate_real_tensors(fake_out): diff --git a/torch/_subclasses/functional_tensor.py b/torch/_subclasses/functional_tensor.py index 1762059eedf22..dfef5951ab26f 100644 --- a/torch/_subclasses/functional_tensor.py +++ b/torch/_subclasses/functional_tensor.py @@ -17,6 +17,27 @@ not_implemented_log = torch._logging.getArtifactLogger(__name__, "not_implemented") +# NOTE Some special handling for tensor conversion during export is needed. +# Normally, when tracing through the model with tensor.to(), the maybe-aliasing +# relationship between input and output tensors will be baked into the graph. +# For example, if we got a tensor with device cpu and call tensor.to("cpu"), +# it will become a no-op in the graph. For a whole graph capture, this is not +# sound so we need to do something different. Instead, in export we will try to +# preserve the tensor conversion by forcing a non-semantic-breaking aten::_to_copy +# operator to be traced in the graph, and subsequently banning mutations on all +# such converted tensors. +# In addition to patching .to() method call in functionalization, we will have to +# patch other similar methods like float() and cpu(), because they intentionally +# don't fall back to .to() methods, but have the same behavior as .to() according to +# pytorch document. https://pytorch.org/docs/stable/generated/torch.Tensor.float.html +# thus we simply force them to go through .to() call. 
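Aside (not part of this patch): under the scheme described in the note above, a conversion that is a no-op in eager mode should still surface as an explicit copy in an exported graph. A minimal sketch, assuming torch.export is available and the graph is inspected by eye:

import torch

class Convert(torch.nn.Module):
    def forward(self, x):
        # x is already float32 on CPU, so eagerly both calls are no-ops;
        # with the forced .to(..., copy=True) path they are expected to be
        # recorded as aten._to_copy nodes instead of vanishing from the graph.
        return x.float().cpu()

ep = torch.export.export(Convert(), (torch.randn(2),))
print(ep.graph)  # expect _to_copy nodes for the two conversions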
+def _conversion_method_template(**extra_kwargs): + def _(self, *args, **kwargs): + return self.to(*args, **{**kwargs, **extra_kwargs}) + + return _ + + class FunctionalTensor(torch.Tensor): """ Functional tensors represent tensors that will remove mutations @@ -225,6 +246,24 @@ def to(self, *args, **kwargs): return super().to(*args, **{**kwargs, "copy": True}) return super().to(*args, **kwargs) + def cuda(self, device=None, *args, **kwargs): + device = device or torch.cuda.current_device() + if len(args) > 0: + return self.to(device, *args, **kwargs) + else: + return self.to(device=device, **kwargs) + + char = _conversion_method_template(dtype=torch.int8) + cpu = _conversion_method_template(device=torch.device("cpu")) + bfloat16 = _conversion_method_template(dtype=torch.bfloat16) + byte = _conversion_method_template(dtype=torch.uint8) + double = _conversion_method_template(dtype=torch.float64) + float = _conversion_method_template(dtype=torch.float32) + bool = _conversion_method_template(dtype=torch.bool) + half = _conversion_method_template(dtype=torch.float16) + int = _conversion_method_template(dtype=torch.int32) + long = _conversion_method_template(dtype=torch.int64) + class FunctionalTensorMode(TorchDispatchMode): def __init__(self, pre_dispatch=False, export=False, _allow_token_discovery=False): diff --git a/torch/_subclasses/meta_utils.py b/torch/_subclasses/meta_utils.py index c674120a22ff6..780ec54888da9 100644 --- a/torch/_subclasses/meta_utils.py +++ b/torch/_subclasses/meta_utils.py @@ -34,6 +34,7 @@ maybe_get_level, peek_interpreter_stack, ) +from torch.utils._mode_utils import no_dispatch from torch.utils._python_dispatch import is_traceable_wrapper_subclass from torch.utils.weak import WeakIdKeyDictionary @@ -153,13 +154,14 @@ class MetaTensorDescriber: the same ID when we see the same tensor/storage. """ - def __init__(self): + def __init__(self, *, copy_data=False): self.next_tensor_id: MetaTensorId = 0 self.next_storage_id: MetaStorageId = 0 # Tensor -> int self.lookup_tensor = WeakIdKeyDictionary() # Storage -> int self.lookup_storage = WeakIdKeyDictionary() + self.copy_data = copy_data def get_tensor_id(self, t: torch.Tensor): if t not in self.lookup_tensor: @@ -180,6 +182,9 @@ def describe_storage(self, s: torch.UntypedStorage): return MetaStorageDesc( id=self.get_storage_id(s), size=s.size(), + # NB: We don't do the copy yet; copy happens when we start + # creating the new storages + data=s if self.copy_data else None, ) def describe_tensor(self, t: torch.Tensor, recurse: bool = True): @@ -354,6 +359,7 @@ def describe_tensor(self, t: torch.Tensor, recurse: bool = True): functorch_stack=maybe_functorch_stack, autograd_meta_from=autograd_meta_from, current_level=current_level, + data=t if self.copy_data else None, ) @@ -361,6 +367,9 @@ def describe_tensor(self, t: torch.Tensor, recurse: bool = True): class MetaStorageDesc: id: MetaStorageId size: int + # NB: this is only populated with copy_data True, it is not directly + # serializable in JSON, you want to do something special here anyway + data: Optional[torch.UntypedStorage] @dataclass(frozen=True) @@ -388,7 +397,9 @@ class MetaTensorDesc: # NB: Sometimes, size, stride and storage_offset contain SymInt, in which # case this is NOT serializable. That only happens when you're # re-fakeifying a fake tensor with an existing ShapeEnv... maybe we - # can get rid of this use case entirely + # can get rid of this use case entirely. 
Notably, even if we are + # fakeifying a real tensor into a fake tensor with symbolic shapes, the + # size here is NOT dynamic # NB: size could potentially be None as you can override it and make it # throw an error, but we don't currently have any subclasses that do this # except C++ nested tensor but we're going to have nested int to make this @@ -434,6 +445,11 @@ class MetaTensorDesc: functorch_stack: Optional[List[CInterpreter]] = None autograd_meta_from: Optional[torch.Tensor] = None + # This is only populated on copy_data, and typically is not used at all, + # except for some of our meta-ification paths that don't properly use + # storage (pro-tip: you should use storage) + data: Optional[torch.Tensor] = None + # Faithfully serializing functorch tensors will not be too difficult. # We only need to consider grad/vmap interpreters, and their internal # state is only bools (mostly what the grad enabled/disabled state @@ -457,7 +473,7 @@ def shape(self): # meta storages. This class will hold weak references to cached tenosrs # and tensor storages. class MetaConverter: - def __init__(self): + def __init__(self, *, copy_data: bool = False): # Maps MetaStorageId to UntypedStorage self.storage_memo: weakref.WeakValueDictionary = weakref.WeakValueDictionary() # Maps MetaTensorId to torch.Tensor (typically a meta tensor or @@ -467,7 +483,12 @@ def __init__(self): self.miss = 0 self.del_hook = None self.arg_cnt = 0 - self.describer = MetaTensorDescriber() + # Ensures real_storage/real_tensor are populated on the resulting + # metaified storage/tensor. The naming of this attribute is load + # bearing: FakeTensor relies on real tensor being set to exactly this + # value + self.copy_data = copy_data + self.describer = MetaTensorDescriber(copy_data=copy_data) def successful(self): return self.hit > 0 and self.miss == 0 @@ -489,8 +510,12 @@ def meta_storage(self, s: MetaStorageDesc, callback): # Need to make sure to resize the meta storage too. if self.get_storage_memo(s) is None: r_s = callback( - lambda: torch.empty(s.size, dtype=torch.uint8, device="meta") + lambda: torch.empty(s.size, dtype=torch.uint8, device="meta"), ).untyped_storage() + if self.copy_data: + with torch.no_grad(), no_dispatch(): + assert s.data is not None + r_s.real_storage = s.data.clone() self.set_storage_memo(s, r_s) return r_s else: @@ -640,8 +665,8 @@ def empty_create_subclass( outer_size = outer_size if outer_size is not None else t.size outer_stride = outer_stride if outer_stride is not None else t.stride - transformed_tensors_dict = { - attr: callback( + def transform(attr, inner_t): + r = callback( lambda: empty_create( inner_t, AttrSource(source, attr), @@ -652,7 +677,29 @@ def empty_create_subclass( ), ) ) - for attr, inner_t in t.attrs.items() + # Note [Inaccessible data is not copied] + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # A more faithful reproduction would do a copy on the entire + # storage, but this needs to be done carefully because the + # underlying storage could have larger extent than is implied + # by size/stride. The real fix is to properly call + # meta_storage recursively here. 
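Aside (not part of this patch), illustrating the caveat in the note above: copying through empty_strided + copy_ only reproduces the elements reachable via size/stride, so any extra extent in the source storage is not carried over.

import torch

base = torch.arange(10.0)     # storage holds 10 floats (40 bytes)
view = base[:4]               # only 4 elements reachable via size/stride
copied = torch.empty_strided(view.size(), view.stride(), dtype=view.dtype)
copied.copy_(view)
# the view shares the full 40-byte storage; the copy's storage is only 16 bytes
print(view.untyped_storage().nbytes(), copied.untyped_storage().nbytes())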
+ if self.copy_data: + with torch.no_grad(), no_dispatch(): + r.real_tensor = torch.empty_strided( + inner_t.size, + inner_t.stride, + dtype=inner_t.dtype, + device=inner_t.device, + ) + assert inner_t.data is not None + r.real_tensor.copy_( + inner_t.data + ) # Note [Inaccessible data is not copied] + return r + + transformed_tensors_dict = { + attr: transform(attr, inner_t) for attr, inner_t in t.attrs.items() } sub = t.type.__tensor_unflatten__( @@ -892,6 +939,11 @@ def tensor_visitor_fn( device="meta", ) ) + if self.copy_data: + # Pray that sparse clone doesn't lose information + assert t.data is not None + with torch.no_grad(), no_dispatch(): + r.real_tensor = t.data.clone() assert safe_is_leaf(r), "the callback you passed in doesn't detach" # Note [is_coalesced is dispatched] # Strangely enough, is_coalesced() is a dispatched operator, @@ -939,6 +991,11 @@ def tensor_visitor_fn( device="meta", ) ) + if self.copy_data: + # Pray sparse clone doesn't lose information + assert t.data is not None + with torch.no_grad(), no_dispatch(): + r.real_tensor = t.data.clone() assert safe_is_leaf(r), "the callback you passed in doesn't detach" if t.requires_grad: r.requires_grad = True @@ -961,11 +1018,24 @@ def tensor_visitor_fn( sizes, strides, _storage_offset = sym_sizes_strides_storage_offset( t, source ) + # TODO: This doesn't seem right, where's the MKLDNN'ness + # lol r = callback( lambda: torch.empty_strided( sizes, strides, dtype=t.dtype, device="meta" ) ) + if self.copy_data: + with torch.no_grad(), no_dispatch(): + assert t.size is not None + assert t.stride is not None + r.real_tensor = torch.empty_strided( + t.size, t.stride, dtype=t.dtype, device=t.device + ) + assert t.data is not None + r.real_tensor.copy_( + t.data + ) # Note [Inaccessible data is not copied] assert safe_is_leaf(r), "the callback you passed in doesn't detach" if t.requires_grad: r.requires_grad = True @@ -1056,6 +1126,19 @@ def _to_fake_tensor(t: MetaTensorDesc): device="meta", ) ) + if self.copy_data: + with torch.no_grad(), no_dispatch(): + r.real_tensor = torch.empty_strided( # type: ignore[attr-defined] + t.size, + t.stride, + dtype=t.dtype, + device=t.device, + ) + assert t.data is not None + # Note [Inaccessible data is not copied] + r.real_tensor.copy_( # type: ignore[attr-defined] + t.data + ) return r r = _to_fake_tensor(t) @@ -1211,6 +1294,13 @@ def is_c_of_r(complex_dtype, real_dtype): device="meta", ) ) + if self.copy_data: + with torch.no_grad(), no_dispatch(): + assert t.size is not None + assert t.stride is not None + r.real_tensor = torch.empty_strided( + t.size, t.stride, dtype=t.dtype, device=t.device + ) assert safe_is_leaf(r), "the callback you passed in doesn't detach" if t.requires_grad: @@ -1248,6 +1338,12 @@ def is_c_of_r(complex_dtype, real_dtype): ): # You're normal and happy, install the fresh storage into the memo self.set_storage_memo(s, r.untyped_storage()) + if self.copy_data: + with torch.no_grad(), no_dispatch(): + r.real_tensor.untyped_storage().copy_(s.data) + r.untyped_storage().real_storage = ( + r.real_tensor.untyped_storage() + ) else: # You're in crazy town; somehow you gave us a tensor # that wasn't a view, but had nonzero storage offset, @@ -1286,8 +1382,17 @@ def is_c_of_r(complex_dtype, real_dtype): mb_fake_mode = maybe_get_fake_mode(r) if mb_fake_mode is not None: maybe_fake_mgr = in_kernel_invocation_manager(mb_fake_mode) - with maybe_fake_mgr, torch.no_grad(), maybe_suppress(): - r.set_(r_s, storage_offset, sizes, strides) + with torch.no_grad(), maybe_suppress(): + 
with maybe_fake_mgr: + r.set_(r_s, storage_offset, sizes, strides) + if self.copy_data: + with torch.no_grad(), no_dispatch(): + r.real_tensor.set_( + r_s.real_storage, + t.storage_offset, + t.size, + t.stride, + ) if t.grad is not None: from torch._dynamo.source import AttrSource diff --git a/torch/_subclasses/schema_check_mode.py b/torch/_subclasses/schema_check_mode.py index 72a2082a162df..3ddb611e5e412 100644 --- a/torch/_subclasses/schema_check_mode.py +++ b/torch/_subclasses/schema_check_mode.py @@ -6,7 +6,6 @@ import torch from torch.fx.operator_schemas import normalize_function -from torch.testing._internal.jit_utils import clone_inputs from torch.utils import _pytree as pytree from torch.utils._python_dispatch import TorchDispatchMode from torch.utils._pytree import tree_map @@ -27,6 +26,38 @@ # - Checks for aliasing on all inputs +# move these 2 functions here to avoid numpy dependency in testing/_internal/common_utils.py + + +def is_iterable_of_tensors(iterable): + # Tensor itself is iterable so we check this first + if isinstance(iterable, torch.Tensor): + return False + try: + if len(iterable) == 0: + return False + for t in iter(iterable): + if not isinstance(t, torch.Tensor): + return False + except TypeError as te: + return False + return True + + +def clone_inputs(args): + inputs = [] + + for arg in args: + if isinstance(arg, torch.Tensor): + inputs.append(arg.detach().clone()) + elif is_iterable_of_tensors(arg): + inputs.append([t.detach().clone() for t in arg]) + else: + inputs.append(arg) + + return inputs + + class SchemaCheckMode(TorchDispatchMode): def __init__(self): # Information recorded for testing purposes. For example: diff --git a/torch/ao/quantization/fx/_decomposed.py b/torch/ao/quantization/fx/_decomposed.py index 8feafafea2fd4..f2e774590be3f 100644 --- a/torch/ao/quantization/fx/_decomposed.py +++ b/torch/ao/quantization/fx/_decomposed.py @@ -768,7 +768,7 @@ def quantize_per_token( _quant_min_max_bounds_check(quant_min, quant_max, dtype) _per_token_quant_qparam_dim_check(input, scales, zero_points) input = ( - torch.round(input / scales + zero_points).clamp(quant_min, quant_max).to(dtype) + input.mul(1.0 / scales).add(zero_points).round().clamp(quant_min, quant_max).to(dtype) ) return input @@ -875,7 +875,7 @@ def quantize_per_channel_group( zero_points = zero_points.reshape(-1, 1) input_int8 = ( - to_quant.div(scales) + to_quant.mul(1.0 / scales) .add(zero_points) .round() .clamp_(quant_min, quant_max) diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index e053d89d79835..5da75b608a82a 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -195,7 +195,7 @@ def __init__( self, enabled=True, *, - use_cuda=False, + use_cuda=False, # Deprecated use_device=None, record_shapes=False, with_flops=False, diff --git a/torch/cpu/__init__.py b/torch/cpu/__init__.py index 14794627d752b..2f2561b69c1c1 100644 --- a/torch/cpu/__init__.py +++ b/torch/cpu/__init__.py @@ -11,6 +11,7 @@ from .. import device as _device from . import amp + __all__ = [ "is_available", "synchronize", @@ -49,7 +50,6 @@ def synchronize(device: _device_t = None) -> None: N.B. This function only exists to facilitate device-agnostic code. """ - pass class Stream: @@ -57,7 +57,7 @@ class Stream: N.B. 
This class only exists to facilitate device-agnostic code """ - def __init__(self, priority: int = -1): + def __init__(self, priority: int = -1) -> None: pass def wait_stream(self, stream) -> None: @@ -68,13 +68,13 @@ class Event: def query(self) -> bool: return True - def record(self, stream=None): + def record(self, stream=None) -> None: pass - def synchronize(self): + def synchronize(self) -> None: pass - def wait(self, stream=None): + def wait(self, stream=None) -> None: pass @@ -100,6 +100,7 @@ class StreamContext(AbstractContextManager): N.B. This class only exists to facilitate device-agnostic code """ + cur_stream: Optional[Stream] def __init__(self, stream): @@ -115,7 +116,7 @@ def __enter__(self): self.prev_stream = _current_stream _current_stream = cur_stream - def __exit__(self, type: Any, value: Any, traceback: Any): + def __exit__(self, type: Any, value: Any, traceback: Any) -> None: cur_stream = self.stream if cur_stream is None: return @@ -146,7 +147,6 @@ def set_device(device: _device_t) -> None: N.B. This function only exists to facilitate device-agnostic code """ - pass def current_device() -> str: diff --git a/torch/csrc/Exceptions.h b/torch/csrc/Exceptions.h index 6b8d923f40909..4f8d614e16dcf 100644 --- a/torch/csrc/Exceptions.h +++ b/torch/csrc/Exceptions.h @@ -15,7 +15,7 @@ #include #include -#if defined(USE_DISTRIBUTED) && defined(USE_C10D) +#if defined(USE_DISTRIBUTED) #include #endif diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index 22a257909bf12..3be764220e0de 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -168,12 +168,14 @@ static PyObject* THPModule_initExtension( PyObject* shm_manager_path) { HANDLE_TH_ERRORS #if !defined(FBCODE_CAFFE2) - if (torch::get_cpp_stacktraces_enabled() && !torch::get_disable_addr2line()) { + if (torch::get_cpp_stacktraces_enabled()) { c10::SetStackTraceFetcher([]() -> std::string { auto tb = torch::CapturedTraceback::gather(false, false, true); - LOG(WARNING) - << "symbolizing C++ stack trace for exception; if this hangs, rerun with TORCH_DISABLE_ADDR2LINE=1..." - << std::endl; + if (torch::get_symbolize_mode() == torch::unwind::Mode::addr2line) { + LOG(WARNING) + << "symbolizing C++ stack trace for exception; if this hangs, rerun with TORCH_DISABLE_ADDR2LINE=1..." + << std::endl; + } auto s_tbs = torch::symbolize({tb.get()}); std::stringstream oss; oss << "C++ CapturedTraceback:" << std::endl; @@ -396,10 +398,10 @@ PyObject* THPModule_swap_tensor_impl(PyObject* _unused, PyObject* args) { // The TensorImpls contain PyObjectSlots that have a reference to the PyObject // associated with the TensorImpl. Swap this field as well. 
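Usage-level aside (not part of this patch): the PyObject-slot swapping handled here appears to be what backs torch.utils.swap_tensors on the Python side. A hedged sketch, assuming that helper is available in the build:

import torch

a, b = torch.zeros(2), torch.ones(3)
torch.utils.swap_tensors(a, b)   # exchanges payloads (including PyObject slots) in place
print(a.shape, b.shape)          # torch.Size([3]) torch.Size([2])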
- c10::optional mb_obj_a = + std::optional mb_obj_a = a->cdata->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj( getPyInterpreter(), /*ignore_hermetic_tls=*/false); - c10::optional mb_obj_b = + std::optional mb_obj_b = b->cdata->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj( getPyInterpreter(), /*ignore_hermetic_tls=*/false); TORCH_INTERNAL_ASSERT( @@ -1803,7 +1805,7 @@ Call this whenever a new thread is created in order to propagate values from "_select_conv_backend", [](const at::Tensor& input, const at::Tensor& weight, - const c10::optional& bias_opt, + const std::optional& bias_opt, at::SymIntArrayRef stride_, at::SymIntArrayRef padding_, at::SymIntArrayRef dilation_, @@ -1837,14 +1839,14 @@ Call this whenever a new thread is created in order to propagate values from "_select_conv_backend", [](const at::Tensor& input, const at::Tensor& weight, - const c10::optional& bias, + const std::optional& bias, at::SymIntArrayRef stride_, at::SymIntArrayRef padding_, at::SymIntArrayRef dilation_, bool transposed_, at::SymIntArrayRef output_padding_, c10::SymInt groups_, - c10::optional> bias_sizes_opt) { + std::optional> bias_sizes_opt) { c10::OptionalArrayRef ref = c10::nullopt; if (bias_sizes_opt) { ref = (*bias_sizes_opt); @@ -1883,7 +1885,7 @@ Call this whenever a new thread is created in order to propagate values from .def(py::init([](at::Tensor const& query, at::Tensor const& key, at::Tensor const& value, - c10::optional attn_mask, + std::optional attn_mask, double dropout, bool is_causal) { return sdp::sdp_params{ @@ -2034,7 +2036,7 @@ Call this whenever a new thread is created in order to propagate values from py_module.def( "_get_accelerator", - [](c10::optional check = c10::nullopt) { + [](std::optional check = c10::nullopt) { return c10::Device( at::getAccelerator(check.value_or(false)) .value_or(c10::DeviceType::CPU), @@ -2175,7 +2177,7 @@ Call this whenever a new thread is created in order to propagate values from _DeviceDtypeHasher>; py_module.def( "_group_tensors_by_device_and_dtype", - [](const std::vector>>& + [](const std::vector>>& nested_tensorlist, const bool with_indices) { _FlatMap map; diff --git a/torch/csrc/PyInterpreter.cpp b/torch/csrc/PyInterpreter.cpp index 4582cb2a8340c..a7e5c5e9fb873 100644 --- a/torch/csrc/PyInterpreter.cpp +++ b/torch/csrc/PyInterpreter.cpp @@ -592,7 +592,7 @@ static void set_tensor_attr_with_capsule( const c10::TensorImpl* tensor, py::capsule& capsule, const char* attr_name) { - c10::optional mb_obj = tensor->pyobj_slot()->check_pyobj( + std::optional mb_obj = tensor->pyobj_slot()->check_pyobj( getPyInterpreter(), /*ignore_hermetic_tls=*/false); TORCH_CHECK( mb_obj.has_value(), "Tensor subclass's PyInterpreter has no value"); @@ -620,7 +620,7 @@ static c10::ArrayRef get_set_cached_attr( const c10::TensorImpl* tensor, const char* base_attr_name, const py::object& obj) { - c10::optional mb_obj = + std::optional mb_obj = tensor->pyobj_slot()->check_pyobj(getPyInterpreter()); TORCH_CHECK( mb_obj.has_value(), "Tensor subclass's PyInterpreter has no value"); diff --git a/torch/csrc/Storage.cpp b/torch/csrc/Storage.cpp index a3f8263303782..aa5584abd39e4 100644 --- a/torch/csrc/Storage.cpp +++ b/torch/csrc/Storage.cpp @@ -108,7 +108,7 @@ PyObject* THPStorage_Wrap(c10::Storage storage) { c10::newStorageImplFromRefcountedDataPtr(storage), c10::impl::PyInterpreterStatus::DEFINITELY_UNINITIALIZED); } - c10::optional maybe_pyobj = pyobj_slot->check_pyobj( + std::optional maybe_pyobj = pyobj_slot->check_pyobj( getPyInterpreter(), 
/*ignore_hermetic_tls=*/false); c10::impl::PyInterpreterStatus status = c10::impl::PyInterpreterStatus::TAGGED_BY_US; @@ -236,7 +236,7 @@ static void THPStorage_subclass_dealloc(PyObject* self) { if (type->tp_del) { PyObject_GC_Track(self); type->tp_del(self); - if (self->ob_refcnt > 0) { + if (Py_REFCNT(self) > 0) { // Resurrected (see above comment about resurrection from `__del__`) return; } @@ -316,8 +316,8 @@ static PyObject* THPStorage_pynew( device_arg_idx = 2; } - c10::optional allocator_opt = r.toInt64Optional(allocator_arg_idx); - c10::optional device_opt = r.deviceOptional(device_arg_idx); + std::optional allocator_opt = r.toInt64Optional(allocator_arg_idx); + std::optional device_opt = r.deviceOptional(device_arg_idx); TORCH_CHECK( !allocator_opt.has_value() || !device_opt.has_value(), @@ -498,7 +498,7 @@ static PyObject* THPStorage_get(THPStorage* self, PyObject* index) { at::StorageImpl* old_storage_impl = storage.unsafeGetStorageImpl(); c10::raw::intrusive_ptr::incref(old_storage_impl); - c10::optional device_opt = old_storage_impl->device(); + std::optional device_opt = old_storage_impl->device(); auto new_storage_impl = make_storage_impl( c10::StorageImpl::use_byte_size_t(), #ifdef THQUANTIZED diff --git a/torch/csrc/Stream.cpp b/torch/csrc/Stream.cpp index 06dac515c1a5e..179f4f1390aff 100644 --- a/torch/csrc/Stream.cpp +++ b/torch/csrc/Stream.cpp @@ -82,7 +82,7 @@ static PyObject* THPStream_pynew( // It requires other device backends override getNewStream method. How the new // stream is created is backend specific. Backend should be able to correctly // manage the lifetime of streams. - c10::optional stream_opt; + std::optional stream_opt; if (r.idx == 0) { c10::impl::VirtualGuardImpl impl{static_cast(device_type)}; stream_opt = impl.getNewStream( diff --git a/torch/csrc/api/include/torch/expanding_array.h b/torch/csrc/api/include/torch/expanding_array.h index aa4fecf4ff37c..f77c05119ebf7 100644 --- a/torch/csrc/api/include/torch/expanding_array.h +++ b/torch/csrc/api/include/torch/expanding_array.h @@ -104,15 +104,15 @@ std::ostream& operator<<( } /// A utility class that accepts either a container of `D`-many -/// `c10::optional` values, or a single `c10::optional` value, which is +/// `std::optional` values, or a single `c10::optional` value, which is /// internally repeated `D` times. It has the additional ability to accept /// containers of the underlying type `T` and convert them to a container of -/// `c10::optional`. +/// `std::optional`. template class ExpandingArrayWithOptionalElem - : public ExpandingArray> { + : public ExpandingArray> { public: - using ExpandingArray>::ExpandingArray; + using ExpandingArray>::ExpandingArray; /// Constructs an `ExpandingArrayWithOptionalElem` from an `initializer_list` /// of the underlying type `T`. The extent of the length is checked against @@ -130,7 +130,7 @@ class ExpandingArrayWithOptionalElem /// the underlying type `T`. The extent of the length is checked against the /// `ExpandingArrayWithOptionalElem`'s extent parameter `D` at runtime. /*implicit*/ ExpandingArrayWithOptionalElem(at::ArrayRef values) - : ExpandingArray>(0) { + : ExpandingArray>(0) { // clang-format off TORCH_CHECK( values.size() == D, @@ -145,7 +145,7 @@ class ExpandingArrayWithOptionalElem /// underlying type `T`, which is repeated `D` times (where `D` is the extent /// parameter of the `ExpandingArrayWithOptionalElem`). 
/*implicit*/ ExpandingArrayWithOptionalElem(T single_size) - : ExpandingArray>(0) { + : ExpandingArray>(0) { for (const auto i : c10::irange(this->values_.size())) { this->values_[i] = single_size; } @@ -154,7 +154,7 @@ class ExpandingArrayWithOptionalElem /// Constructs an `ExpandingArrayWithOptionalElem` from a correctly sized /// `std::array` of the underlying type `T`. /*implicit*/ ExpandingArrayWithOptionalElem(const std::array& values) - : ExpandingArray>(0) { + : ExpandingArray>(0) { for (const auto i : c10::irange(this->values_.size())) { this->values_[i] = values[i]; } diff --git a/torch/csrc/api/include/torch/fft.h b/torch/csrc/api/include/torch/fft.h index da1f7e518ae54..d9a3430a7a249 100644 --- a/torch/csrc/api/include/torch/fft.h +++ b/torch/csrc/api/include/torch/fft.h @@ -15,9 +15,9 @@ namespace fft { /// ``` inline Tensor fft( const Tensor& self, - c10::optional n = c10::nullopt, + std::optional n = c10::nullopt, int64_t dim = -1, - c10::optional norm = c10::nullopt) { + std::optional norm = c10::nullopt) { return torch::fft_fft_symint(self, n, dim, norm); } @@ -31,9 +31,9 @@ inline Tensor fft( /// ``` inline Tensor ifft( const Tensor& self, - c10::optional n = c10::nullopt, + std::optional n = c10::nullopt, int64_t dim = -1, - c10::optional norm = c10::nullopt) { + std::optional norm = c10::nullopt) { return torch::fft_ifft_symint(self, n, dim, norm); } @@ -49,7 +49,7 @@ inline Tensor fft2( const Tensor& self, OptionalIntArrayRef s = c10::nullopt, IntArrayRef dim = {-2, -1}, - c10::optional norm = c10::nullopt) { + std::optional norm = c10::nullopt) { return torch::fft_fft2(self, s, dim, norm); } @@ -65,7 +65,7 @@ inline Tensor ifft2( const Tensor& self, at::OptionalIntArrayRef s = c10::nullopt, IntArrayRef dim = {-2, -1}, - c10::optional norm = c10::nullopt) { + std::optional norm = c10::nullopt) { return torch::fft_ifft2(self, s, dim, norm); } @@ -81,7 +81,7 @@ inline Tensor fftn( const Tensor& self, at::OptionalIntArrayRef s = c10::nullopt, at::OptionalIntArrayRef dim = c10::nullopt, - c10::optional norm = c10::nullopt) { + std::optional norm = c10::nullopt) { return torch::fft_fftn(self, s, dim, norm); } @@ -97,7 +97,7 @@ inline Tensor ifftn( const Tensor& self, at::OptionalIntArrayRef s = c10::nullopt, at::OptionalIntArrayRef dim = c10::nullopt, - c10::optional norm = c10::nullopt) { + std::optional norm = c10::nullopt) { return torch::fft_ifftn(self, s, dim, norm); } @@ -112,9 +112,9 @@ inline Tensor ifftn( /// ``` inline Tensor rfft( const Tensor& self, - c10::optional n = c10::nullopt, + std::optional n = c10::nullopt, int64_t dim = -1, - c10::optional norm = c10::nullopt) { + std::optional norm = c10::nullopt) { return torch::fft_rfft_symint(self, n, dim, norm); } @@ -131,9 +131,9 @@ inline Tensor rfft( /// ``` inline Tensor irfft( const Tensor& self, - c10::optional n = c10::nullopt, + std::optional n = c10::nullopt, int64_t dim = -1, - c10::optional norm = c10::nullopt) { + std::optional norm = c10::nullopt) { return torch::fft_irfft_symint(self, n, dim, norm); } @@ -149,7 +149,7 @@ inline Tensor rfft2( const Tensor& self, at::OptionalIntArrayRef s = c10::nullopt, IntArrayRef dim = {-2, -1}, - c10::optional norm = c10::nullopt) { + std::optional norm = c10::nullopt) { return torch::fft_rfft2(self, s, dim, norm); } @@ -165,7 +165,7 @@ inline Tensor irfft2( const Tensor& self, at::OptionalIntArrayRef s = c10::nullopt, IntArrayRef dim = {-2, -1}, - c10::optional norm = c10::nullopt) { + std::optional norm = c10::nullopt) { return torch::fft_irfft2(self, s, dim, 
norm); } @@ -181,7 +181,7 @@ inline Tensor rfftn( const Tensor& self, at::OptionalIntArrayRef s = c10::nullopt, at::OptionalIntArrayRef dim = c10::nullopt, - c10::optional norm = c10::nullopt) { + std::optional norm = c10::nullopt) { return torch::fft_rfftn(self, s, dim, norm); } @@ -197,7 +197,7 @@ inline Tensor irfftn( const Tensor& self, at::OptionalIntArrayRef s = c10::nullopt, at::OptionalIntArrayRef dim = c10::nullopt, - c10::optional norm = c10::nullopt) { + std::optional norm = c10::nullopt) { return torch::fft_irfftn(self, s, dim, norm); } @@ -215,9 +215,9 @@ inline Tensor irfftn( /// ``` inline Tensor hfft( const Tensor& self, - c10::optional n = c10::nullopt, + std::optional n = c10::nullopt, int64_t dim = -1, - c10::optional norm = c10::nullopt) { + std::optional norm = c10::nullopt) { return torch::fft_hfft_symint(self, n, dim, norm); } @@ -234,9 +234,9 @@ inline Tensor hfft( /// ``` inline Tensor ihfft( const Tensor& self, - c10::optional n = c10::nullopt, + std::optional n = c10::nullopt, int64_t dim = -1, - c10::optional norm = c10::nullopt) { + std::optional norm = c10::nullopt) { return torch::fft_ihfft_symint(self, n, dim, norm); } @@ -255,7 +255,7 @@ inline Tensor hfft2( const Tensor& self, at::OptionalIntArrayRef s = c10::nullopt, IntArrayRef dim = {-2, -1}, - c10::optional norm = c10::nullopt) { + std::optional norm = c10::nullopt) { return torch::fft_hfft2(self, s, dim, norm); } @@ -275,7 +275,7 @@ inline Tensor ihfft2( const Tensor& self, at::OptionalIntArrayRef s = c10::nullopt, IntArrayRef dim = {-2, -1}, - c10::optional norm = c10::nullopt) { + std::optional norm = c10::nullopt) { return torch::fft_ihfft2(self, s, dim, norm); } @@ -294,7 +294,7 @@ inline Tensor hfftn( const Tensor& self, at::OptionalIntArrayRef s = c10::nullopt, IntArrayRef dim = {-2, -1}, - c10::optional norm = c10::nullopt) { + std::optional norm = c10::nullopt) { return torch::fft_hfftn(self, s, dim, norm); } @@ -314,7 +314,7 @@ inline Tensor ihfftn( const Tensor& self, at::OptionalIntArrayRef s = c10::nullopt, IntArrayRef dim = {-2, -1}, - c10::optional norm = c10::nullopt) { + std::optional norm = c10::nullopt) { return torch::fft_ihfftn(self, s, dim, norm); } diff --git a/torch/csrc/api/include/torch/linalg.h b/torch/csrc/api/include/torch/linalg.h index 38010fbfcd4d2..3b398fa935b91 100644 --- a/torch/csrc/api/include/torch/linalg.h +++ b/torch/csrc/api/include/torch/linalg.h @@ -118,8 +118,8 @@ inline std::tuple lu_out( inline std::tuple lstsq( const Tensor& self, const Tensor& b, - c10::optional cond, - c10::optional driver) { + std::optional cond, + std::optional driver) { return torch::linalg_lstsq(self, b, cond, driver); } @@ -245,16 +245,16 @@ inline Tensor matrix_rank( inline Tensor matrix_rank( const Tensor& input, - c10::optional atol, - c10::optional rtol, + std::optional atol, + std::optional rtol, bool hermitian) { return torch::linalg_matrix_rank(input, atol, rtol, hermitian); } inline Tensor matrix_rank( const Tensor& input, - const c10::optional& atol, - const c10::optional& rtol, + const std::optional& atol, + const std::optional& rtol, bool hermitian) { return torch::linalg_matrix_rank(input, atol, rtol, hermitian); } @@ -278,8 +278,8 @@ inline Tensor& matrix_rank_out( inline Tensor& matrix_rank_out( Tensor& result, const Tensor& input, - c10::optional atol, - c10::optional rtol, + std::optional atol, + std::optional rtol, bool hermitian) { return torch::linalg_matrix_rank_out(result, input, atol, rtol, hermitian); } @@ -287,8 +287,8 @@ inline Tensor& matrix_rank_out( 
inline Tensor& matrix_rank_out( Tensor& result, const Tensor& input, - const c10::optional& atol, - const c10::optional& rtol, + const std::optional& atol, + const std::optional& rtol, bool hermitian) { return torch::linalg_matrix_rank_out(result, input, atol, rtol, hermitian); } @@ -382,7 +382,7 @@ inline Tensor& solve_triangular_out( inline std::tuple svd( const Tensor& input, bool full_matrices, - c10::optional driver) { + std::optional driver) { return torch::linalg_svd(input, full_matrices, driver); } @@ -392,20 +392,20 @@ inline std::tuple svd_out( Tensor& Vh, const Tensor& input, bool full_matrices, - c10::optional driver) { + std::optional driver) { return torch::linalg_svd_out(U, S, Vh, input, full_matrices, driver); } inline Tensor svdvals( const Tensor& input, - c10::optional driver) { + std::optional driver) { return torch::linalg_svdvals(input, driver); } inline Tensor& svdvals_out( Tensor& result, const Tensor& input, - c10::optional driver) { + std::optional driver) { return torch::linalg_svdvals_out(result, input, driver); } @@ -561,8 +561,8 @@ inline Tensor& householder_product_out( inline std::tuple lstsq( const Tensor& self, const Tensor& b, - c10::optional cond, - c10::optional driver) { + std::optional cond, + std::optional driver) { return detail::lstsq(self, b, cond, driver); } @@ -773,16 +773,16 @@ inline Tensor matrix_rank( inline Tensor matrix_rank( const Tensor& input, - c10::optional atol, - c10::optional rtol, + std::optional atol, + std::optional rtol, bool hermitian) { return detail::matrix_rank(input, atol, rtol, hermitian); } inline Tensor matrix_rank( const Tensor& input, - const c10::optional& atol, - const c10::optional& rtol, + const std::optional& atol, + const std::optional& rtol, bool hermitian) { return detail::matrix_rank(input, atol, rtol, hermitian); } @@ -806,8 +806,8 @@ inline Tensor& matrix_rank_out( inline Tensor& matrix_rank_out( Tensor& result, const Tensor& input, - c10::optional atol, - c10::optional rtol, + std::optional atol, + std::optional rtol, bool hermitian) { return detail::matrix_rank_out(result, input, atol, rtol, hermitian); } @@ -815,8 +815,8 @@ inline Tensor& matrix_rank_out( inline Tensor& matrix_rank_out( Tensor& result, const Tensor& input, - const c10::optional& atol, - const c10::optional& rtol, + const std::optional& atol, + const std::optional& rtol, bool hermitian) { return detail::matrix_rank_out(result, input, atol, rtol, hermitian); } @@ -976,7 +976,7 @@ inline Tensor& solve_triangular_out( inline std::tuple svd( const Tensor& input, bool full_matrices, - c10::optional driver) { + std::optional driver) { return detail::svd(input, full_matrices, driver); } @@ -986,7 +986,7 @@ inline std::tuple svd_out( Tensor& Vh, const Tensor& input, bool full_matrices, - c10::optional driver) { + std::optional driver) { return detail::svd_out(U, S, Vh, input, full_matrices, driver); } @@ -995,14 +995,14 @@ inline std::tuple svd_out( /// See https://pytorch.org/docs/main/linalg.html#torch.linalg.svdvals inline Tensor svdvals( const Tensor& input, - c10::optional driver) { + std::optional driver) { return detail::svdvals(input, driver); } inline Tensor& svdvals_out( Tensor& result, const Tensor& input, - c10::optional driver) { + std::optional driver) { return detail::svdvals_out(result, input, driver); } diff --git a/torch/csrc/api/include/torch/nested.h b/torch/csrc/api/include/torch/nested.h index 524b4d433186c..780aab4230472 100644 --- a/torch/csrc/api/include/torch/nested.h +++ b/torch/csrc/api/include/torch/nested.h @@ -72,8 
+72,8 @@ inline at::Tensor nested_tensor( /// ``` inline at::Tensor as_nested_tensor( at::TensorList list, - c10::optional dtype = c10::nullopt, - c10::optional device = c10::nullopt) { + std::optional dtype = c10::nullopt, + std::optional device = c10::nullopt) { return at::_nested_tensor_from_tensor_list( list, dtype, c10::nullopt, device, c10::nullopt); } diff --git a/torch/csrc/api/include/torch/nn/functional/activation.h b/torch/csrc/api/include/torch/nn/functional/activation.h index 9c100287f9559..89e596f71d143 100644 --- a/torch/csrc/api/include/torch/nn/functional/activation.h +++ b/torch/csrc/api/include/torch/nn/functional/activation.h @@ -233,7 +233,7 @@ namespace detail { inline Tensor softmax( const Tensor& input, int64_t dim, - c10::optional dtype) { + std::optional dtype) { Tensor ret; if (dtype == c10::nullopt) { @@ -270,7 +270,7 @@ namespace detail { inline Tensor softmin( const Tensor& input, int64_t dim, - c10::optional dtype) { + std::optional dtype) { Tensor ret; if (dtype == c10::nullopt) { @@ -307,7 +307,7 @@ namespace detail { inline Tensor log_softmax( const Tensor& input, int64_t dim, - c10::optional dtype) { + std::optional dtype) { Tensor ret; if (dtype == c10::nullopt) { diff --git a/torch/csrc/api/include/torch/nn/functional/batchnorm.h b/torch/csrc/api/include/torch/nn/functional/batchnorm.h index 487bd78ad44fe..bc6f141281b39 100644 --- a/torch/csrc/api/include/torch/nn/functional/batchnorm.h +++ b/torch/csrc/api/include/torch/nn/functional/batchnorm.h @@ -17,7 +17,7 @@ inline Tensor batch_norm( Tensor weight, Tensor bias, bool training, - c10::optional momentum, + std::optional momentum, double eps) { TORCH_CHECK( input.dim() >= 2, diff --git a/torch/csrc/api/include/torch/nn/functional/embedding.h b/torch/csrc/api/include/torch/nn/functional/embedding.h index 99432c09d36be..b06b0a3dc1e85 100644 --- a/torch/csrc/api/include/torch/nn/functional/embedding.h +++ b/torch/csrc/api/include/torch/nn/functional/embedding.h @@ -24,8 +24,8 @@ inline void _no_grad_embedding_renorm_( inline Tensor embedding( const Tensor& input, const Tensor& weight, - c10::optional padding_idx, - c10::optional max_norm, + std::optional padding_idx, + std::optional max_norm, double norm_type, bool scale_grad_by_freq, bool sparse) { @@ -90,14 +90,14 @@ inline Tensor embedding_bag( const Tensor& input, const Tensor& weight, const Tensor& offsets, - c10::optional max_norm, + std::optional max_norm, double norm_type, bool scale_grad_by_freq, EmbeddingBagMode mode, bool sparse, const Tensor& per_sample_weights, bool include_last_offset, - c10::optional padding_idx) { + std::optional padding_idx) { auto input_ = input; auto offsets_ = offsets; auto per_sample_weights_ = per_sample_weights; diff --git a/torch/csrc/api/include/torch/nn/functional/loss.h b/torch/csrc/api/include/torch/nn/functional/loss.h index 17fa2be1afc7a..c4124c2b23859 100644 --- a/torch/csrc/api/include/torch/nn/functional/loss.h +++ b/torch/csrc/api/include/torch/nn/functional/loss.h @@ -346,7 +346,7 @@ inline Tensor smooth_l1_loss( const Tensor& input, const Tensor& target, SmoothL1LossFuncOptions::reduction_t reduction, - c10::optional beta_opt = c10::nullopt) { + std::optional beta_opt = c10::nullopt) { if (target.sizes() != input.sizes()) { TORCH_WARN( "Using a target size (", @@ -656,7 +656,7 @@ inline Tensor triplet_margin_with_distance_loss( const Tensor& anchor, const Tensor& positive, const Tensor& negative, - c10::optional + std::optional distance_function, double margin, bool swap, diff --git 
a/torch/csrc/api/include/torch/nn/functional/normalization.h b/torch/csrc/api/include/torch/nn/functional/normalization.h index a45fec6ca34f9..53bd61839f745 100644 --- a/torch/csrc/api/include/torch/nn/functional/normalization.h +++ b/torch/csrc/api/include/torch/nn/functional/normalization.h @@ -16,7 +16,7 @@ inline Tensor normalize( double p, int64_t dim, double eps, - c10::optional out) { + std::optional out) { if (out == c10::nullopt) { auto denom = input.norm(p, dim, true).clamp_min(eps).expand_as(input); return input / denom; diff --git a/torch/csrc/api/include/torch/nn/functional/padding.h b/torch/csrc/api/include/torch/nn/functional/padding.h index d4b81fb53f26a..1bb6f95382904 100644 --- a/torch/csrc/api/include/torch/nn/functional/padding.h +++ b/torch/csrc/api/include/torch/nn/functional/padding.h @@ -27,7 +27,7 @@ inline Tensor pad( TORCH_CHECK(false, "Unrecognised padding mode"); }(); - c10::optional fill_value; + std::optional fill_value; if (value != 0.0) { fill_value = value; } diff --git a/torch/csrc/api/include/torch/nn/functional/pooling.h b/torch/csrc/api/include/torch/nn/functional/pooling.h index 9f9708ce657ec..be3009f62201a 100644 --- a/torch/csrc/api/include/torch/nn/functional/pooling.h +++ b/torch/csrc/api/include/torch/nn/functional/pooling.h @@ -57,7 +57,7 @@ inline Tensor avg_pool2d( ExpandingArray<2> padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { return torch::avg_pool2d( input, kernel_size, @@ -104,7 +104,7 @@ inline Tensor avg_pool3d( ExpandingArray<3> padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { return torch::avg_pool3d( input, kernel_size, @@ -632,7 +632,7 @@ inline std::vector _unpool_output_size( const IntArrayRef& kernel_size, const IntArrayRef& stride, const IntArrayRef& padding, - const c10::optional>& output_size) { + const std::optional>& output_size) { auto input_size = input.sizes(); std::vector default_size; for (const auto d : c10::irange(kernel_size.size())) { @@ -688,7 +688,7 @@ inline Tensor max_unpool1d( ExpandingArray<1> kernel_size, ExpandingArray<1> stride, ExpandingArray<1> padding, - const c10::optional>& output_size) { + const std::optional>& output_size) { auto output_size_ = _unpool_output_size(input, kernel_size, stride, padding, output_size); output_size_.push_back(1); @@ -733,7 +733,7 @@ inline Tensor max_unpool2d( ExpandingArray<2> kernel_size, ExpandingArray<2> stride, ExpandingArray<2> padding, - const c10::optional>& output_size) { + const std::optional>& output_size) { auto output_size_ = _unpool_output_size(input, kernel_size, stride, padding, output_size); @@ -776,7 +776,7 @@ inline Tensor max_unpool3d( ExpandingArray<3> kernel_size, ExpandingArray<3> stride, ExpandingArray<3> padding, - const c10::optional>& output_size) { + const std::optional>& output_size) { auto output_size_ = _unpool_output_size(input, kernel_size, stride, padding, output_size); @@ -817,8 +817,8 @@ namespace detail { inline std::tuple fractional_max_pool2d_with_indices( const Tensor& input, const ExpandingArray<2>& kernel_size, - const c10::optional>& output_size, - const c10::optional>& output_ratio, + const std::optional>& output_size, + const std::optional>& output_ratio, const Tensor& _random_samples) { if (output_size == c10::nullopt && output_ratio == c10::nullopt) { TORCH_CHECK( @@ -826,7 +826,7 @@ inline std::tuple fractional_max_pool2d_with_indices( "fractional_max_pool2d requires specifying 
either ", "an output_size or an output_ratio"); } - c10::optional> output_size_ = output_size; + std::optional> output_size_ = output_size; if (output_size_ == c10::nullopt) { TORCH_INTERNAL_ASSERT(output_ratio != c10::nullopt); output_size_ = { @@ -875,8 +875,8 @@ namespace detail { inline Tensor fractional_max_pool2d( const Tensor& input, ExpandingArray<2> kernel_size, - c10::optional> output_size, - c10::optional> output_ratio, + std::optional> output_size, + std::optional> output_ratio, const Tensor& _random_samples) { return std::get<0>(fractional_max_pool2d_with_indices( input, kernel_size, output_size, output_ratio, _random_samples)); @@ -910,8 +910,8 @@ namespace detail { inline std::tuple fractional_max_pool3d_with_indices( const Tensor& input, const ExpandingArray<3>& kernel_size, - const c10::optional>& output_size, - const c10::optional>& output_ratio, + const std::optional>& output_size, + const std::optional>& output_ratio, const Tensor& _random_samples) { if (output_size == c10::nullopt && output_ratio == c10::nullopt) { TORCH_CHECK( @@ -920,7 +920,7 @@ inline std::tuple fractional_max_pool3d_with_indices( "an output_size or an output_ratio"); } - c10::optional> output_size_ = output_size; + std::optional> output_size_ = output_size; if (output_size_ == c10::nullopt) { TORCH_INTERNAL_ASSERT(output_ratio != c10::nullopt); output_size_ = { @@ -971,8 +971,8 @@ namespace detail { inline Tensor fractional_max_pool3d( const Tensor& input, ExpandingArray<3> kernel_size, - c10::optional> output_size, - c10::optional> output_ratio, + std::optional> output_size, + std::optional> output_ratio, const Tensor& _random_samples) { return std::get<0>(fractional_max_pool3d_with_indices( input, kernel_size, output_size, output_ratio, _random_samples)); diff --git a/torch/csrc/api/include/torch/nn/functional/upsampling.h b/torch/csrc/api/include/torch/nn/functional/upsampling.h index 8fe1b3f00f85d..38c5c51f9a475 100644 --- a/torch/csrc/api/include/torch/nn/functional/upsampling.h +++ b/torch/csrc/api/include/torch/nn/functional/upsampling.h @@ -15,9 +15,9 @@ inline std::vector _interp_output_size( int64_t dim, std::tuple< Tensor, - c10::optional>, - c10::optional>, - c10::optional> closed_over_args) { + std::optional>, + std::optional>, + std::optional> closed_over_args) { auto [input, size, scale_factor, recompute_scale_factor] = closed_over_args; if (size == c10::nullopt && scale_factor == c10::nullopt) { TORCH_CHECK(false, "either size or scale_factor should be defined"); @@ -75,11 +75,11 @@ inline std::vector _interp_output_size( namespace detail { inline Tensor interpolate( const Tensor& input, - const c10::optional>& size, - const c10::optional>& scale_factor, + const std::optional>& size, + const std::optional>& scale_factor, InterpolateFuncOptions::mode_t mode, - c10::optional align_corners, - c10::optional recompute_scale_factor, + std::optional align_corners, + std::optional recompute_scale_factor, bool antialias) { if (std::holds_alternative(mode) || std::get_if(&mode)) { @@ -113,7 +113,7 @@ inline Tensor interpolate( ")"); auto scale_factor_len = input.dim() - 2; - std::vector> scale_factor_list( + std::vector> scale_factor_list( scale_factor_len, c10::nullopt); if (scale_factor != c10::nullopt && !recompute_scale_factor.value_or(false)) { auto _scale_factor_repeated = *scale_factor; diff --git a/torch/csrc/api/include/torch/nn/functional/vision.h b/torch/csrc/api/include/torch/nn/functional/vision.h index e9cb1eb11ac0f..a6c53e0c0a9ad 100644 --- 
a/torch/csrc/api/include/torch/nn/functional/vision.h +++ b/torch/csrc/api/include/torch/nn/functional/vision.h @@ -59,7 +59,7 @@ inline Tensor grid_sample( const Tensor& grid, GridSampleFuncOptions::mode_t mode, GridSampleFuncOptions::padding_mode_t padding_mode, - c10::optional align_corners) { + std::optional align_corners) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t mode_enum, padding_mode_enum; diff --git a/torch/csrc/api/include/torch/nn/modules/conv.h b/torch/csrc/api/include/torch/nn/modules/conv.h index 65a2d6905c0a9..9c55254ddb910 100644 --- a/torch/csrc/api/include/torch/nn/modules/conv.h +++ b/torch/csrc/api/include/torch/nn/modules/conv.h @@ -315,7 +315,7 @@ class ConvTransposeNdImpl : public ConvNdImpl { std::vector _output_padding( const Tensor& input, - const c10::optional& output_size, + const std::optional& output_size, const ExpandingArray& stride, const ExpandingArray& padding, const ExpandingArray& kernel_size); @@ -350,10 +350,10 @@ class TORCH_API ConvTranspose1dImpl explicit ConvTranspose1dImpl(ConvTranspose1dOptions options_); Tensor forward( const Tensor& input, - const c10::optional& output_size = c10::nullopt); + const std::optional& output_size = c10::nullopt); protected: - FORWARD_HAS_DEFAULT_ARGS({1, AnyValue(c10::optional())}) + FORWARD_HAS_DEFAULT_ARGS({1, AnyValue(std::optional())}) }; /// A `ModuleHolder` subclass for `ConvTranspose1dImpl`. @@ -392,10 +392,10 @@ class TORCH_API ConvTranspose2dImpl explicit ConvTranspose2dImpl(ConvTranspose2dOptions options_); Tensor forward( const Tensor& input, - const c10::optional& output_size = c10::nullopt); + const std::optional& output_size = c10::nullopt); protected: - FORWARD_HAS_DEFAULT_ARGS({1, AnyValue(c10::optional())}) + FORWARD_HAS_DEFAULT_ARGS({1, AnyValue(std::optional())}) }; /// A `ModuleHolder` subclass for `ConvTranspose2dImpl`. @@ -434,10 +434,10 @@ class TORCH_API ConvTranspose3dImpl explicit ConvTranspose3dImpl(ConvTranspose3dOptions options_); Tensor forward( const Tensor& input, - const c10::optional& output_size = c10::nullopt); + const std::optional& output_size = c10::nullopt); protected: - FORWARD_HAS_DEFAULT_ARGS({1, AnyValue(c10::optional())}) + FORWARD_HAS_DEFAULT_ARGS({1, AnyValue(std::optional())}) }; /// A `ModuleHolder` subclass for `ConvTranspose3dImpl`. diff --git a/torch/csrc/api/include/torch/nn/modules/pooling.h b/torch/csrc/api/include/torch/nn/modules/pooling.h index a9db131b0dd08..6bcdca463b1ba 100644 --- a/torch/csrc/api/include/torch/nn/modules/pooling.h +++ b/torch/csrc/api/include/torch/nn/modules/pooling.h @@ -507,10 +507,10 @@ class TORCH_API MaxUnpool1dImpl : public MaxUnpoolImpl<1, MaxUnpool1dImpl> { Tensor forward( const Tensor& input, const Tensor& indices, - const c10::optional>& output_size = c10::nullopt); + const std::optional>& output_size = c10::nullopt); protected: - FORWARD_HAS_DEFAULT_ARGS({2, AnyValue(c10::optional>())}) + FORWARD_HAS_DEFAULT_ARGS({2, AnyValue(std::optional>())}) }; /// A `ModuleHolder` subclass for `MaxUnpool1dImpl`. @@ -539,10 +539,10 @@ class TORCH_API MaxUnpool2dImpl : public MaxUnpoolImpl<2, MaxUnpool2dImpl> { Tensor forward( const Tensor& input, const Tensor& indices, - const c10::optional>& output_size = c10::nullopt); + const std::optional>& output_size = c10::nullopt); protected: - FORWARD_HAS_DEFAULT_ARGS({2, AnyValue(c10::optional>())}) + FORWARD_HAS_DEFAULT_ARGS({2, AnyValue(std::optional>())}) }; /// A `ModuleHolder` subclass for `MaxUnpool2dImpl`. 
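Aside (not part of this patch): the optional output_size carried by these unpool signatures mirrors the Python API, where it disambiguates the recovered spatial shape. A small sketch:

import torch

pool = torch.nn.MaxPool2d(2, stride=2, return_indices=True)
unpool = torch.nn.MaxUnpool2d(2, stride=2)
x = torch.randn(1, 1, 5, 5)
out, indices = pool(x)
# without output_size the unpooled result would be 4x4; passing the original
# size recovers the odd 5x5 input shape
y = unpool(out, indices, output_size=x.size())
print(y.shape)  # torch.Size([1, 1, 5, 5])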
@@ -571,10 +571,10 @@ class TORCH_API MaxUnpool3dImpl : public MaxUnpoolImpl<3, MaxUnpool3dImpl> { Tensor forward( const Tensor& input, const Tensor& indices, - const c10::optional>& output_size = c10::nullopt); + const std::optional>& output_size = c10::nullopt); protected: - FORWARD_HAS_DEFAULT_ARGS({2, AnyValue(c10::optional>())}) + FORWARD_HAS_DEFAULT_ARGS({2, AnyValue(std::optional>())}) }; /// A `ModuleHolder` subclass for `MaxUnpool3dImpl`. diff --git a/torch/csrc/api/include/torch/nn/modules/utils.h b/torch/csrc/api/include/torch/nn/modules/utils.h index 6d3d383465f33..869027a241492 100644 --- a/torch/csrc/api/include/torch/nn/modules/utils.h +++ b/torch/csrc/api/include/torch/nn/modules/utils.h @@ -32,7 +32,7 @@ inline std::vector _reverse_repeat_vector( } inline std::vector _list_with_default( - torch::ArrayRef> out_size, + torch::ArrayRef> out_size, torch::IntArrayRef defaults) { TORCH_CHECK( defaults.size() > out_size.size(), diff --git a/torch/csrc/api/include/torch/nn/options/activation.h b/torch/csrc/api/include/torch/nn/options/activation.h index e51805d364852..165212e0e860c 100644 --- a/torch/csrc/api/include/torch/nn/options/activation.h +++ b/torch/csrc/api/include/torch/nn/options/activation.h @@ -252,7 +252,7 @@ struct TORCH_API SoftmaxFuncOptions { /// If specified, the input tensor is casted to `dtype` before the operation /// is performed. This is useful for preventing data type overflows. Default: /// None. - TORCH_ARG(c10::optional, dtype) = c10::nullopt; + TORCH_ARG(std::optional, dtype) = c10::nullopt; }; } // namespace functional @@ -293,7 +293,7 @@ struct TORCH_API SoftminFuncOptions { /// If specified, the input tensor is casted to `dtype` before the operation /// is performed. This is useful for preventing data type overflows. Default: /// None. - TORCH_ARG(c10::optional, dtype) = c10::nullopt; + TORCH_ARG(std::optional, dtype) = c10::nullopt; }; } // namespace functional @@ -334,7 +334,7 @@ struct TORCH_API LogSoftmaxFuncOptions { /// If specified, the input tensor is casted to `dtype` before the operation /// is performed. This is useful for preventing data type overflows. Default: /// None. - TORCH_ARG(c10::optional, dtype) = c10::nullopt; + TORCH_ARG(std::optional, dtype) = c10::nullopt; }; } // namespace functional diff --git a/torch/csrc/api/include/torch/nn/options/batchnorm.h b/torch/csrc/api/include/torch/nn/options/batchnorm.h index cd2d7f164203e..943673e2aae74 100644 --- a/torch/csrc/api/include/torch/nn/options/batchnorm.h +++ b/torch/csrc/api/include/torch/nn/options/batchnorm.h @@ -21,7 +21,7 @@ struct TORCH_API BatchNormOptions { /// A momentum multiplier for the mean and variance. /// Changing this parameter after construction __is effective__. - TORCH_ARG(c10::optional, momentum) = 0.1; + TORCH_ARG(std::optional, momentum) = 0.1; /// Whether to learn a scale and bias that are applied in an affine /// transformation on the input. @@ -82,7 +82,7 @@ struct TORCH_API BatchNormFuncOptions { /// A momentum multiplier for the mean and variance. /// Changing this parameter after construction __is effective__. - TORCH_ARG(c10::optional, momentum) = 0.1; + TORCH_ARG(std::optional, momentum) = 0.1; /// The epsilon value added for numerical stability. /// Changing this parameter after construction __is effective__. 
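Aside (not part of this patch): momentum being an optional is meaningful in the Python API as well, where passing None switches the running statistics to a cumulative moving average instead of an exponential one. A minimal sketch:

import torch

bn = torch.nn.BatchNorm1d(4, momentum=None)  # None => cumulative moving average
bn.train()
for _ in range(3):
    bn(torch.randn(8, 4))
print(bn.running_mean)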
diff --git a/torch/csrc/api/include/torch/nn/options/embedding.h b/torch/csrc/api/include/torch/nn/options/embedding.h index d8d06716308e1..20eacf9073355 100644 --- a/torch/csrc/api/include/torch/nn/options/embedding.h +++ b/torch/csrc/api/include/torch/nn/options/embedding.h @@ -28,10 +28,10 @@ struct TORCH_API EmbeddingOptions { /// Embedding, the embedding vector at `padding_idx` will default to all /// zeros, but can be updated to another value to be used as the padding /// vector. - TORCH_ARG(c10::optional, padding_idx) = c10::nullopt; + TORCH_ARG(std::optional, padding_idx) = c10::nullopt; /// If given, each embedding vector with norm larger than `max_norm` is /// renormalized to have norm `max_norm`. - TORCH_ARG(c10::optional, max_norm) = c10::nullopt; + TORCH_ARG(std::optional, max_norm) = c10::nullopt; /// The p of the p-norm to compute for the `max_norm` option. Default ``2``. TORCH_ARG(double, norm_type) = 2.; /// If given, this will scale gradients by the inverse of frequency of the @@ -55,10 +55,10 @@ struct TORCH_API EmbeddingFromPretrainedOptions { /// If specified, the entries at `padding_idx` do not contribute to the /// gradient; therefore, the embedding vector at `padding_idx` is not updated /// during training, i.e. it remains as a fixed "pad". - TORCH_ARG(c10::optional, padding_idx) = c10::nullopt; + TORCH_ARG(std::optional, padding_idx) = c10::nullopt; /// If given, each embedding vector with norm larger than `max_norm` is /// renormalized to have norm `max_norm`. - TORCH_ARG(c10::optional, max_norm) = c10::nullopt; + TORCH_ARG(std::optional, max_norm) = c10::nullopt; /// The p of the p-norm to compute for the `max_norm` option. Default ``2``. TORCH_ARG(double, norm_type) = 2.; /// If given, this will scale gradients by the inverse of frequency of the @@ -84,10 +84,10 @@ struct TORCH_API EmbeddingFuncOptions { /// If specified, the entries at `padding_idx` do not contribute to the /// gradient; therefore, the embedding vector at `padding_idx` is not updated /// during training, i.e. it remains as a fixed "pad". - TORCH_ARG(c10::optional, padding_idx) = c10::nullopt; + TORCH_ARG(std::optional, padding_idx) = c10::nullopt; /// If given, each embedding vector with norm larger than `max_norm` is /// renormalized to have norm `max_norm`. - TORCH_ARG(c10::optional, max_norm) = c10::nullopt; + TORCH_ARG(std::optional, max_norm) = c10::nullopt; /// The p of the p-norm to compute for the `max_norm` option. Default ``2``. TORCH_ARG(double, norm_type) = 2.; /// If given, this will scale gradients by the inverse of frequency of the @@ -120,7 +120,7 @@ struct TORCH_API EmbeddingBagOptions { TORCH_ARG(int64_t, embedding_dim); /// If given, each embedding vector with norm larger than `max_norm` is /// renormalized to have norm `max_norm`. - TORCH_ARG(c10::optional, max_norm) = c10::nullopt; + TORCH_ARG(std::optional, max_norm) = c10::nullopt; /// The p of the p-norm to compute for the `max_norm` option. Default ``2``. TORCH_ARG(double, norm_type) = 2.; /// If given, this will scale gradients by the inverse of frequency of the @@ -148,7 +148,7 @@ struct TORCH_API EmbeddingBagOptions { /// zeros, but can be updated to another value to be used as the padding /// vector. Note that the embedding vector at `padding_idx` is excluded from /// the reduction. 
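Aside (not part of this patch): the padding_idx semantics documented here match the Python nn.EmbeddingBag behavior, where positions holding the padding index are skipped in the reduction. A small sketch:

import torch

bag = torch.nn.EmbeddingBag(10, 3, mode="sum", padding_idx=0)
out = bag(torch.tensor([[2, 0, 0], [4, 5, 0]]))
# positions holding index 0 contribute nothing: row 0 reduces over one real
# embedding, row 1 over two
print(out.shape)  # torch.Size([2, 3])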
- TORCH_ARG(c10::optional, padding_idx) = c10::nullopt; + TORCH_ARG(std::optional, padding_idx) = c10::nullopt; }; // ============================================================================ @@ -161,7 +161,7 @@ struct TORCH_API EmbeddingBagFromPretrainedOptions { TORCH_ARG(bool, freeze) = true; /// If given, each embedding vector with norm larger than `max_norm` is /// renormalized to have norm `max_norm`. - TORCH_ARG(c10::optional, max_norm) = c10::nullopt; + TORCH_ARG(std::optional, max_norm) = c10::nullopt; /// The p of the p-norm to compute for the `max_norm` option. Default ``2``. TORCH_ARG(double, norm_type) = 2.; /// If given, this will scale gradients by the inverse of frequency of the @@ -184,7 +184,7 @@ struct TORCH_API EmbeddingBagFromPretrainedOptions { /// gradient; therefore, the embedding vector at padding_idx is not updated /// during training, i.e. it remains as a fixed "pad". Note that the embedding /// vector at `padding_idx` is excluded from the reduction. - TORCH_ARG(c10::optional, padding_idx) = c10::nullopt; + TORCH_ARG(std::optional, padding_idx) = c10::nullopt; }; // ============================================================================ @@ -205,7 +205,7 @@ struct TORCH_API EmbeddingBagFuncOptions { TORCH_ARG(torch::Tensor, offsets) = Tensor(); /// If given, each embedding vector with norm larger than `max_norm` is /// renormalized to have norm `max_norm`. - TORCH_ARG(c10::optional, max_norm) = c10::nullopt; + TORCH_ARG(std::optional, max_norm) = c10::nullopt; /// The p of the p-norm to compute for the `max_norm` option. Default ``2``. TORCH_ARG(double, norm_type) = 2.; /// If given, this will scale gradients by the inverse of frequency of the @@ -233,7 +233,7 @@ struct TORCH_API EmbeddingBagFuncOptions { /// gradient; therefore, the embedding vector at padding_idx is not updated /// during training, i.e. it remains as a fixed "pad". Note that the embedding /// vector at `padding_idx` is excluded from the reduction. - TORCH_ARG(c10::optional, padding_idx) = c10::nullopt; + TORCH_ARG(std::optional, padding_idx) = c10::nullopt; }; } // namespace functional diff --git a/torch/csrc/api/include/torch/nn/options/loss.h b/torch/csrc/api/include/torch/nn/options/loss.h index c9eb2b66f3e0b..f1fc7a4d41115 100644 --- a/torch/csrc/api/include/torch/nn/options/loss.h +++ b/torch/csrc/api/include/torch/nn/options/loss.h @@ -450,7 +450,7 @@ struct TORCH_API TripletMarginWithDistanceLossOptions { /// Specifies a nonnegative, real-valued function that quantifies the /// closeness of two tensors. If not specified, `F::pairwise_distance` will /// be used. Default: nullopt - TORCH_ARG(c10::optional, distance_function) = + TORCH_ARG(std::optional, distance_function) = c10::nullopt; /// Specifies a nonnegative margin representing the minimum difference /// between the positive and negative distances required for the loss to be 0. @@ -548,7 +548,7 @@ struct TORCH_API SmoothL1LossOptions { /// Specifies the threshold at which to change between L1 and L2 loss. /// If beta is not specified, a value of 1.0 will be used. 
/// Default: nullopt - TORCH_ARG(c10::optional, beta) = c10::nullopt; + TORCH_ARG(std::optional, beta) = c10::nullopt; }; namespace functional { diff --git a/torch/csrc/api/include/torch/nn/options/normalization.h b/torch/csrc/api/include/torch/nn/options/normalization.h index ae8c206736d50..a1e5b1a0aeab1 100644 --- a/torch/csrc/api/include/torch/nn/options/normalization.h +++ b/torch/csrc/api/include/torch/nn/options/normalization.h @@ -133,7 +133,7 @@ struct TORCH_API NormalizeFuncOptions { TORCH_ARG(double, eps) = 1e-12; /// the output tensor. If `out` is used, this /// operation won't be differentiable. - TORCH_ARG(c10::optional, out) = c10::nullopt; + TORCH_ARG(std::optional, out) = c10::nullopt; }; } // namespace functional diff --git a/torch/csrc/api/include/torch/nn/options/pooling.h b/torch/csrc/api/include/torch/nn/options/pooling.h index 41de605e90fb0..8f6cee99bff6a 100644 --- a/torch/csrc/api/include/torch/nn/options/pooling.h +++ b/torch/csrc/api/include/torch/nn/options/pooling.h @@ -32,7 +32,7 @@ struct AvgPoolOptions { /// if specified, it will be used as divisor, otherwise size of the pooling /// region will be used. - TORCH_ARG(c10::optional, divisor_override) = c10::nullopt; + TORCH_ARG(std::optional, divisor_override) = c10::nullopt; }; /// `AvgPoolOptions` specialized for the `AvgPool1d` module. @@ -401,7 +401,7 @@ struct MaxUnpoolFuncOptions { TORCH_ARG(ExpandingArray, padding) = 0; /// the targeted output size - TORCH_ARG(c10::optional>, output_size) = c10::nullopt; + TORCH_ARG(std::optional>, output_size) = c10::nullopt; }; /// `MaxUnpoolFuncOptions` specialized for @@ -450,12 +450,12 @@ struct FractionalMaxPoolOptions { TORCH_ARG(ExpandingArray, kernel_size); /// the target output size of the image - TORCH_ARG(c10::optional>, output_size) = c10::nullopt; + TORCH_ARG(std::optional>, output_size) = c10::nullopt; /// If one wants to have an output size as a ratio of the input size, this /// option can be given. This has to be a number or tuple in the range (0, 1) using ExpandingArrayDouble = torch::ExpandingArray; - TORCH_ARG(c10::optional, output_ratio) = c10::nullopt; + TORCH_ARG(std::optional, output_ratio) = c10::nullopt; TORCH_ARG(torch::Tensor, _random_samples) = Tensor(); }; diff --git a/torch/csrc/api/include/torch/nn/options/upsampling.h b/torch/csrc/api/include/torch/nn/options/upsampling.h index ca793beb97725..21df2b89998de 100644 --- a/torch/csrc/api/include/torch/nn/options/upsampling.h +++ b/torch/csrc/api/include/torch/nn/options/upsampling.h @@ -20,10 +20,10 @@ namespace nn { /// ``` struct TORCH_API UpsampleOptions { /// output spatial sizes. - TORCH_ARG(c10::optional>, size) = c10::nullopt; + TORCH_ARG(std::optional>, size) = c10::nullopt; /// multiplier for spatial size. - TORCH_ARG(c10::optional>, scale_factor) = c10::nullopt; + TORCH_ARG(std::optional>, scale_factor) = c10::nullopt; /// the upsampling algorithm: one of "nearest", "linear", "bilinear", /// "bicubic" and "trilinear". Default: "nearest" @@ -40,7 +40,7 @@ struct TORCH_API UpsampleOptions { /// aligned, and thus preserving the values at those pixels. This only has /// effect when :attr:`mode` is "linear", "bilinear", "bicubic", or /// "trilinear". Default: "False" - TORCH_ARG(c10::optional, align_corners) = c10::nullopt; + TORCH_ARG(std::optional, align_corners) = c10::nullopt; }; namespace functional { @@ -65,10 +65,10 @@ struct TORCH_API InterpolateFuncOptions { mode_t; /// output spatial sizes. 
- TORCH_ARG(c10::optional>, size) = c10::nullopt; + TORCH_ARG(std::optional>, size) = c10::nullopt; /// multiplier for spatial size. - TORCH_ARG(c10::optional>, scale_factor) = c10::nullopt; + TORCH_ARG(std::optional>, scale_factor) = c10::nullopt; /// the upsampling algorithm: one of "nearest", "linear", "bilinear", /// "bicubic", "trilinear", "area", "nearest-exact". Default: "nearest" @@ -83,7 +83,7 @@ struct TORCH_API InterpolateFuncOptions { /// this operation *independent* of input size when `scale_factor` is /// kept the same. It is *required* when interpolating mode is "linear", /// "bilinear", "bicubic" or "trilinear". Default: "False" - TORCH_ARG(c10::optional, align_corners) = c10::nullopt; + TORCH_ARG(std::optional, align_corners) = c10::nullopt; /// recompute the scale_factor for use in the /// interpolation calculation. When `scale_factor` is passed as a parameter, @@ -95,7 +95,7 @@ struct TORCH_API InterpolateFuncOptions { /// used in the interpolation computation. Note that when `scale_factor` is /// floating-point, the recomputed scale_factor may differ from the one passed /// in due to rounding and precision issues. - TORCH_ARG(c10::optional, recompute_scale_factor) = c10::nullopt; + TORCH_ARG(std::optional, recompute_scale_factor) = c10::nullopt; /// flag to apply anti-aliasing. Using anti-alias /// option together with :attr:`align_corners` equals "False", interpolation diff --git a/torch/csrc/api/include/torch/nn/options/vision.h b/torch/csrc/api/include/torch/nn/options/vision.h index 814f4b6684d96..c012b40d21f69 100644 --- a/torch/csrc/api/include/torch/nn/options/vision.h +++ b/torch/csrc/api/include/torch/nn/options/vision.h @@ -28,7 +28,7 @@ struct TORCH_API GridSampleFuncOptions { /// padding mode for outside grid values. Default: Zeros TORCH_ARG(padding_mode_t, padding_mode) = torch::kZeros; /// Specifies perspective to pixel as point. Default: false - TORCH_ARG(c10::optional, align_corners) = c10::nullopt; + TORCH_ARG(std::optional, align_corners) = c10::nullopt; }; } // namespace functional diff --git a/torch/csrc/api/include/torch/nn/pimpl.h b/torch/csrc/api/include/torch/nn/pimpl.h index d66d83c257ebd..a5a71a01c833c 100644 --- a/torch/csrc/api/include/torch/nn/pimpl.h +++ b/torch/csrc/api/include/torch/nn/pimpl.h @@ -140,27 +140,13 @@ class ModuleHolder : torch::detail::ModuleHolderIndicator { } private: - /// In C++17, the two methods below could be written as the following: - /// if constexpr (std::is_default_constructible_v) { - /// return std::make_shared(); - /// } else { - /// return nullptr; - /// } - /// In C++11, we use SFINAE instead of `if constexpr`. - - template < - typename T = Contained, - typename = torch::enable_if_t::value>> - std::shared_ptr default_construct() { - return std::make_shared(); - } - template - torch::disable_if_t< - std::is_default_constructible::value, - std::shared_ptr> - default_construct() { - return nullptr; + std::shared_ptr default_construct() { + if constexpr (std::is_default_constructible_v) { + return std::make_shared(); + } else { + return nullptr; + } } }; diff --git a/torch/csrc/api/include/torch/nn/utils/clip_grad.h b/torch/csrc/api/include/torch/nn/utils/clip_grad.h index e1023bd1eb5c7..fbb533662c7be 100644 --- a/torch/csrc/api/include/torch/nn/utils/clip_grad.h +++ b/torch/csrc/api/include/torch/nn/utils/clip_grad.h @@ -64,7 +64,7 @@ inline double clip_grad_norm_( // synchronizing the CPU and the gradients' device until the very end to // preserve async execution on the device. 
When checking for finite-ness, this // optional ensures we only sync once. - c10::optional total_norm = c10::nullopt; + std::optional total_norm = c10::nullopt; if (error_if_nonfinite) { total_norm = total_norm_tensor.item().toDouble(); TORCH_CHECK( diff --git a/torch/csrc/api/include/torch/nn/utils/convert_parameters.h b/torch/csrc/api/include/torch/nn/utils/convert_parameters.h index 2ac1d317c9922..6f62d483c4d8b 100644 --- a/torch/csrc/api/include/torch/nn/utils/convert_parameters.h +++ b/torch/csrc/api/include/torch/nn/utils/convert_parameters.h @@ -11,9 +11,9 @@ namespace utils { // in the same device. Currently, the conversion between model parameters // and single vector form is not supported for multiple allocations, // e.g. parameters in different GPUs, or mixture of CPU/GPU. -inline c10::optional _check_param_device( +inline std::optional _check_param_device( const torch::Tensor& param, - c10::optional old_param_device) { + std::optional old_param_device) { // Meet the first parameter if (old_param_device == c10::nullopt) { old_param_device = param.is_cuda() ? param.get_device() : -1; @@ -38,7 +38,7 @@ inline c10::optional _check_param_device( // Convert parameters to one vector inline torch::Tensor parameters_to_vector( const std::vector& parameters) { - c10::optional param_device; + std::optional param_device; std::vector vec; vec.reserve(parameters.size()); @@ -58,7 +58,7 @@ inline void vector_to_parameters( const torch::Tensor& vec, const std::vector& parameters) { // Flag for the device where the parameter is located - c10::optional param_device; + std::optional param_device; // Pointer for slicing the vector for each parameter int64_t pointer = 0; diff --git a/torch/csrc/api/include/torch/nn/utils/rnn.h b/torch/csrc/api/include/torch/nn/utils/rnn.h index eea517a2b60f3..ba8b0db427150 100644 --- a/torch/csrc/api/include/torch/nn/utils/rnn.h +++ b/torch/csrc/api/include/torch/nn/utils/rnn.h @@ -247,7 +247,7 @@ inline std::tuple pad_packed_sequence( PackedSequence sequence, bool batch_first = false, double padding_value = 0.0, - c10::optional total_length = torch::nullopt) { + std::optional total_length = torch::nullopt) { int64_t max_seq_length = sequence.batch_sizes().size(0); if (total_length.has_value()) { int64_t total_length_val = total_length.value(); diff --git a/torch/csrc/api/include/torch/optim/lbfgs.h b/torch/csrc/api/include/torch/optim/lbfgs.h index 99aa35d36e4b5..001b0cd33f259 100644 --- a/torch/csrc/api/include/torch/optim/lbfgs.h +++ b/torch/csrc/api/include/torch/optim/lbfgs.h @@ -17,11 +17,11 @@ struct TORCH_API LBFGSOptions : public OptimizerCloneableOptions { LBFGSOptions(double lr = 1); TORCH_ARG(double, lr) = 1; TORCH_ARG(int64_t, max_iter) = 20; - TORCH_ARG(c10::optional, max_eval) = c10::nullopt; + TORCH_ARG(std::optional, max_eval) = c10::nullopt; TORCH_ARG(double, tolerance_grad) = 1e-7; TORCH_ARG(double, tolerance_change) = 1e-9; TORCH_ARG(int64_t, history_size) = 100; - TORCH_ARG(c10::optional, line_search_fn) = c10::nullopt; + TORCH_ARG(std::optional, line_search_fn) = c10::nullopt; public: void serialize(torch::serialize::InputArchive& archive) override; @@ -45,7 +45,7 @@ struct TORCH_API LBFGSParamState TORCH_ARG(std::deque, old_dirs); TORCH_ARG(std::deque, old_stps); TORCH_ARG(std::deque, ro); - TORCH_ARG(c10::optional>, al) = c10::nullopt; + TORCH_ARG(std::optional>, al) = c10::nullopt; public: void serialize(torch::serialize::InputArchive& archive) override; @@ -82,7 +82,7 @@ class TORCH_API LBFGS : public Optimizer { void 
load(serialize::InputArchive& archive) override; private: - c10::optional _numel_cache; + std::optional _numel_cache; int64_t _numel(); Tensor _gather_flat_grad(); void _add_grad(const double step_size, const Tensor& update); diff --git a/torch/csrc/api/include/torch/serialize/input-archive.h b/torch/csrc/api/include/torch/serialize/input-archive.h index 83d1a543ddacb..f77b34aad0bd4 100644 --- a/torch/csrc/api/include/torch/serialize/input-archive.h +++ b/torch/csrc/api/include/torch/serialize/input-archive.h @@ -76,27 +76,27 @@ class TORCH_API InputArchive final { /// is not specified, the module is loaded to the original device. void load_from( const std::string& filename, - c10::optional device = c10::nullopt); + std::optional device = c10::nullopt); /// Loads the `InputArchive` from a serialized representation stored in the /// given `stream`. Storage are remapped using device option. If device /// is not specified, the module is loaded to the original device. void load_from( std::istream& stream, - c10::optional device = c10::nullopt); + std::optional device = c10::nullopt); // Loads given the specified flat array. void load_from( const char* data, size_t size, - c10::optional device = c10::nullopt); + std::optional device = c10::nullopt); // Loads given the specified read and size functions. void load_from( const std::function& read_func, const std::function& size_func, - c10::optional device = c10::nullopt); + std::optional device = c10::nullopt); // Returns the vector of keys in the input archive. std::vector keys(); diff --git a/torch/csrc/api/include/torch/special.h b/torch/csrc/api/include/torch/special.h index 7ad7e7689ebd6..d8346e1aa1d8c 100644 --- a/torch/csrc/api/include/torch/special.h +++ b/torch/csrc/api/include/torch/special.h @@ -596,7 +596,7 @@ inline Tensor& log1p_out(Tensor& result, const Tensor& self) { inline Tensor log_softmax( const Tensor& self, int64_t dim, - c10::optional dtype) { + std::optional dtype) { return torch::special_log_softmax(self, dim, dtype); } @@ -611,7 +611,7 @@ inline Tensor log_softmax( inline Tensor softmax( const Tensor& self, int64_t dim, - c10::optional dtype) { + std::optional dtype) { return torch::special_softmax(self, dim, dtype); } diff --git a/torch/csrc/api/include/torch/types.h b/torch/csrc/api/include/torch/types.h index 92be710cf4bf4..8a23cd122b8d1 100644 --- a/torch/csrc/api/include/torch/types.h +++ b/torch/csrc/api/include/torch/types.h @@ -39,7 +39,7 @@ namespace torch { using namespace at; // NOLINT using c10::nullopt; -using c10::optional; +using std::optional; using Dtype = at::ScalarType; diff --git a/torch/csrc/api/src/nn/modules/conv.cpp b/torch/csrc/api/src/nn/modules/conv.cpp index 20be11f221838..197c3cf0725cd 100644 --- a/torch/csrc/api/src/nn/modules/conv.cpp +++ b/torch/csrc/api/src/nn/modules/conv.cpp @@ -169,12 +169,12 @@ template class ConvNdImpl<3, Conv3dImpl>; template std::vector ConvTransposeNdImpl::_output_padding( const Tensor& input, - const c10::optional& output_size, + const std::optional& output_size, const ExpandingArray& stride, const ExpandingArray& padding, const ExpandingArray& kernel_size) { std::vector ret; - c10::optional output_size_ = output_size; + std::optional output_size_ = output_size; if (output_size_ == c10::nullopt) { ret = at::IntArrayRef(this->options.output_padding()).vec(); @@ -248,7 +248,7 @@ ConvTranspose1dImpl::ConvTranspose1dImpl(ConvTranspose1dOptions options_) Tensor ConvTranspose1dImpl::forward( const Tensor& input, - const c10::optional& output_size) { + const 
std::optional& output_size) { if (!std::get_if(&options.padding_mode())) { TORCH_CHECK( false, "Only `zeros` padding mode is supported for ConvTranspose1d"); @@ -285,7 +285,7 @@ ConvTranspose2dImpl::ConvTranspose2dImpl(ConvTranspose2dOptions options_) Tensor ConvTranspose2dImpl::forward( const Tensor& input, - const c10::optional& output_size) { + const std::optional& output_size) { if (!std::get_if(&options.padding_mode())) { TORCH_CHECK( false, "Only `zeros` padding mode is supported for ConvTranspose2d"); @@ -322,7 +322,7 @@ ConvTranspose3dImpl::ConvTranspose3dImpl(ConvTranspose3dOptions options_) Tensor ConvTranspose3dImpl::forward( const Tensor& input, - const c10::optional& output_size) { + const std::optional& output_size) { if (!std::get_if(&options.padding_mode())) { TORCH_CHECK( false, "Only `zeros` padding mode is supported for ConvTranspose3d"); diff --git a/torch/csrc/api/src/nn/modules/pooling.cpp b/torch/csrc/api/src/nn/modules/pooling.cpp index 1a3f29e235507..0b11b914dcc1c 100644 --- a/torch/csrc/api/src/nn/modules/pooling.cpp +++ b/torch/csrc/api/src/nn/modules/pooling.cpp @@ -229,7 +229,7 @@ void MaxUnpoolImpl::pretty_print(std::ostream& stream) const { Tensor MaxUnpool1dImpl::forward( const Tensor& input, const Tensor& indices, - const c10::optional>& output_size) { + const std::optional>& output_size) { return F::detail::max_unpool1d( input, indices, @@ -242,7 +242,7 @@ Tensor MaxUnpool1dImpl::forward( Tensor MaxUnpool2dImpl::forward( const Tensor& input, const Tensor& indices, - const c10::optional>& output_size) { + const std::optional>& output_size) { return F::detail::max_unpool2d( input, indices, @@ -255,7 +255,7 @@ Tensor MaxUnpool2dImpl::forward( Tensor MaxUnpool3dImpl::forward( const Tensor& input, const Tensor& indices, - const c10::optional>& output_size) { + const std::optional>& output_size) { return F::detail::max_unpool3d( input, indices, diff --git a/torch/csrc/api/src/optim/lbfgs.cpp b/torch/csrc/api/src/optim/lbfgs.cpp index bf54e9a878618..10739be623869 100644 --- a/torch/csrc/api/src/optim/lbfgs.cpp +++ b/torch/csrc/api/src/optim/lbfgs.cpp @@ -67,7 +67,7 @@ bool if_container_equal(T lhs, T rhs) { } bool operator==(const LBFGSParamState& lhs, const LBFGSParamState& rhs) { - auto isNull = [](const c10::optional>& val) { + auto isNull = [](const std::optional>& val) { return val == c10::nullopt; }; return (lhs.func_evals() == rhs.func_evals()) && @@ -194,7 +194,7 @@ static double _cubic_interpolate( double x2, double f2, double g2, - c10::optional> bounds = c10::nullopt) { + std::optional> bounds = c10::nullopt) { // ported from https://github.com/torch/optim/blob/master/polyinterp.lua // Compute bounds of interpolation area // NOLINTNEXTLINE(cppcoreguidelines-init-variables) diff --git a/torch/csrc/api/src/serialize/input-archive.cpp b/torch/csrc/api/src/serialize/input-archive.cpp index c18a041293aea..852f4eab1b52b 100644 --- a/torch/csrc/api/src/serialize/input-archive.cpp +++ b/torch/csrc/api/src/serialize/input-archive.cpp @@ -93,20 +93,20 @@ void InputArchive::read(const std::string& key, InputArchive& archive) { void InputArchive::load_from( const std::string& filename, - c10::optional device /*= c10::nullopt*/) { + std::optional device /*= c10::nullopt*/) { module_ = torch::jit::load(filename, std::move(device)); } void InputArchive::load_from( std::istream& stream, - c10::optional device /*= c10::nullopt*/) { + std::optional device /*= c10::nullopt*/) { module_ = torch::jit::load(stream, std::move(device)); } void InputArchive::load_from( const 
char* data, size_t size, - c10::optional device /*= c10::nullopt*/) { + std::optional device /*= c10::nullopt*/) { using caffe2::serialize::ReadAdapterInterface; class OurAdapter : public ReadAdapterInterface { public: @@ -136,7 +136,7 @@ void InputArchive::load_from( void InputArchive::load_from( const std::function& read_func, const std::function& size_func, - c10::optional device /*= c10::nullopt*/) { + std::optional device /*= c10::nullopt*/) { using caffe2::serialize::ReadAdapterInterface; class OurAdapter : public ReadAdapterInterface { public: diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp index 4c0c324ad56ec..65c7fbb853610 100644 --- a/torch/csrc/autograd/FunctionsManual.cpp +++ b/torch/csrc/autograd/FunctionsManual.cpp @@ -60,19 +60,19 @@ Tensor apply_loss_reduction(const Tensor& unreduced, int64_t reduction) { return unreduced; } -static bool isDefined(const c10::optional& t) { +static bool isDefined(const std::optional& t) { return t.has_value() && t->defined(); } -Tensor toNonOptTensor(const c10::optional& t) { +Tensor toNonOptTensor(const std::optional& t) { return t.has_value() ? *t : Tensor(); } -Tensor toNonOptFwGrad(const c10::optional& t) { +Tensor toNonOptFwGrad(const std::optional& t) { return (t.has_value() && t->defined()) ? t->_fw_grad(/*level */ 0) : Tensor(); } -Tensor toNonOptPrimal(const c10::optional& t) { +Tensor toNonOptPrimal(const std::optional& t) { if (t.has_value() && t->defined()) { if (t->unsafeGetTensorImpl()->is_wrapped_number()) { return *t; @@ -605,7 +605,7 @@ Tensor div_tensor_self_backward( const Tensor& grad, T other, ScalarType self_st, - const c10::optional& rounding_mode) { + const std::optional& rounding_mode) { if (rounding_mode.has_value()) { return at::zeros_like(grad, grad.options().dtype(self_st)); } @@ -617,12 +617,12 @@ template Tensor div_tensor_self_backward( const Tensor&, Tensor, ScalarType, - const c10::optional&); + const std::optional&); template Tensor div_tensor_self_backward( const Tensor&, Scalar, ScalarType, - const c10::optional&); + const std::optional&); template Tensor div_tensor_self_backward( @@ -639,7 +639,7 @@ Tensor div_tensor_other_backward( const Tensor& grad, const Tensor& self, const Tensor& other, - const c10::optional& rounding_mode) { + const std::optional& rounding_mode) { if (rounding_mode.has_value()) { return at::zeros_like(grad, grad.options().dtype(other.scalar_type())); } @@ -1289,7 +1289,7 @@ Tensor convolution_jvp( at::SymIntArrayRef output_padding, const c10::SymInt& groups) { auto bias_t_opt = - bias_t.defined() ? c10::optional(bias_t) : c10::nullopt; + bias_t.defined() ? std::optional(bias_t) : c10::nullopt; return ( at::convolution_symint( input_t, @@ -1331,7 +1331,7 @@ Tensor _convolution_jvp( bool cudnn_enabled, bool allow_tf32) { auto bias_t_opt = - bias_t.defined() ? c10::optional(bias_t) : c10::nullopt; + bias_t.defined() ? 
std::optional(bias_t) : c10::nullopt; return ( at::_convolution_symint( input_t, @@ -1520,8 +1520,8 @@ static Tensor sparse_mask_like_grad( std::tuple sparse_sampled_addmm_backward( const Tensor& grad, const Tensor& self, - const c10::optional& mat1, - const c10::optional& mat2, + const std::optional& mat1, + const std::optional& mat2, const Scalar& alpha, const Scalar& beta, const std::array& grad_input_mask) { @@ -1819,7 +1819,7 @@ Tensor var_backward( Tensor grad, const Tensor& self, at::OptionalIntArrayRef dim_opt, - const c10::optional& correction_opt, + const std::optional& correction_opt, bool keepdim) { const auto correction = correction_opt.value_or(1).toSymFloat(); if (self.dim() == 0 || !dim_opt.has_value()) { @@ -1852,7 +1852,7 @@ Tensor std_backward( const Tensor& grad, const Tensor& self, at::OptionalIntArrayRef dim, - const c10::optional& correction_opt, + const std::optional& correction_opt, bool keepdim) { auto grad_var = (grad / (result * 2)).masked_fill_(result == 0, 0); return var_backward(std::move(grad_var), self, dim, correction_opt, keepdim); @@ -1863,7 +1863,7 @@ Tensor var_mean_backward( const Tensor& gmean, const Tensor& self, at::OptionalIntArrayRef dim_opt, - const c10::optional& correction_opt, + const std::optional& correction_opt, bool keepdim) { Tensor gself; if (gvar.defined()) { @@ -1887,7 +1887,7 @@ Tensor std_mean_backward( const Tensor& self, const Tensor& std, at::OptionalIntArrayRef dim_opt, - const c10::optional& correction_opt, + const std::optional& correction_opt, bool keepdim) { Tensor gself; if (gstd.defined()) { @@ -2241,7 +2241,7 @@ Tensor infinitely_differentiable_mish_backward( Tensor infinitely_differentiable_logit_backward( const Tensor& grad, const Tensor& self, - c10::optional eps) { + std::optional eps) { if (eps) { const double lo = eps.value(); const double hi = 1.0 - lo; @@ -2262,7 +2262,7 @@ Tensor binary_cross_entropy_target_backward( const Tensor& grad, const Tensor& self, const Tensor& target, - const c10::optional& weight, + const std::optional& weight, int64_t reduction) { auto grad_target = at::logit(self).neg_(); @@ -2295,7 +2295,7 @@ Tensor binary_cross_entropy_double_backward_target( const Tensor& grad_output, const Tensor& self, const Tensor& target, - const c10::optional& weight, + const std::optional& weight, int64_t reduction) { auto res = -grad * grad_output; @@ -2332,8 +2332,8 @@ Tensor binary_cross_entropy_with_logits_backward( const Tensor& grad, const Tensor& input, const Tensor& target, - const c10::optional& weight, - const c10::optional& pos_weight, + const std::optional& weight, + const std::optional& pos_weight, int64_t reduction) { // Trivial case if (grad._is_zerotensor()) { @@ -2387,8 +2387,8 @@ Tensor binary_cross_entropy_with_logits_target_backward( const Tensor& grad_output, const Tensor& self, const Tensor& target, - const c10::optional& weight, - const c10::optional& pos_weight, + const std::optional& weight, + const std::optional& pos_weight, int64_t reduction) { if (grad_output._is_zerotensor()) { return at::_efficientzerotensor(target.sizes(), target.options()); @@ -2479,7 +2479,7 @@ Tensor binary_cross_entropy_double_backward( const Tensor& grad, const Tensor& input, const Tensor& target, - const c10::optional& weight, + const std::optional& weight, int64_t reduction) { auto eps = 1e-12; auto inp_pl_eps = input + eps; @@ -2514,7 +2514,7 @@ Tensor binary_cross_entropy_double_backward_grad_output( const Tensor& grad, const Tensor& input, const Tensor& target, - const c10::optional& weight, + const 
std::optional& weight, int64_t reduction) { auto eps = 1e-12; // gradient wrt grad_output @@ -3186,7 +3186,7 @@ Tensor as_strided_backward( auto storage = grad.new_zeros_symint(c10::SymIntArrayRef(base_size)); // prepare indices tensor if we will do index_add_ later - c10::optional flatten_full_indices; + std::optional flatten_full_indices; if (inp_maybe_overlap || out_maybe_overlap) { flatten_full_indices = // TODO: should we symint-ify arange? Need SymScalar. @@ -3334,8 +3334,8 @@ Tensor slice_backward_wrapper( const at::Tensor& grad, const c10::SymIntArrayRef& input_sizes, int64_t dim, - c10::optional start, - c10::optional end, + std::optional start, + std::optional end, c10::SymInt step) { auto start_val = start.has_value() ? start.value() : 0; auto end_val = end.has_value() ? end.value() : INT64_MAX; @@ -4617,17 +4617,17 @@ static Tensor expand_as_dim1(const Tensor& src, const Tensor& target) { std::tuple batchnorm_double_backward( const Tensor& input, - const c10::optional& gamma, + const std::optional& gamma, const Tensor& ggI, const Tensor& ggG, const Tensor& ggB, const Tensor& gO, - const c10::optional& running_mean, - const c10::optional& running_var, + const std::optional& running_mean, + const std::optional& running_var, bool training, double eps, - const c10::optional& save_mean, - const c10::optional& save_invstd, + const std::optional& save_mean, + const std::optional& save_invstd, std::array output_mask) { bool affine = isDefined(gamma); // TODO: Do we have a ScalarOrTensor type? Would such a thing exist? @@ -4756,7 +4756,7 @@ std::tuple batchnorm_double_backward( std::tuple layer_norm_double_backward( const Tensor& input_t, - const c10::optional& gamma, + const std::optional& gamma, const Tensor& ggI, const Tensor& ggG, const Tensor& ggB, @@ -4905,7 +4905,7 @@ infinitely_differentiable_native_group_norm_backward( const Tensor& X, const Tensor& mean, const Tensor& rstd, - const c10::optional& gamma, + const std::optional& gamma, c10::SymInt N, const c10::SymInt& C, c10::SymInt HxW, @@ -4987,9 +4987,9 @@ infinitely_differentiable_native_group_norm_backward( std::tuple _trilinear_backward( const Tensor& grad_out, - const c10::optional& i1, - const c10::optional& i2, - const c10::optional& i3, + const std::optional& i1, + const std::optional& i2, + const std::optional& i3, IntArrayRef expand1, IntArrayRef expand2, IntArrayRef expand3, @@ -5083,7 +5083,7 @@ Tensor embedding_dense_double_backward_symint( Tensor index_backward( Tensor zeros_like_self, - const torch::List>& indices, + const torch::List>& indices, const Tensor& grad) { return (areAnyTensorSubclassLike({zeros_like_self, grad}) || areAnyOptionalTensorSubclassLike(indices)) @@ -6120,7 +6120,7 @@ static Tensor _norm_jvp( // Computes the jvp for `input * weight + bias` where weight and bias may be // undefined Possibly modifies the input inplace static Tensor _affine_jvp( - const c10::optional& input_p, + const std::optional& input_p, Tensor& input_t, const Tensor& weight_p, const Tensor& weight_t, @@ -6161,8 +6161,8 @@ Tensor batch_norm_jvp( const Tensor& weight_t, const Tensor& bias_p, const Tensor& bias_t, - const c10::optional& running_mean, - const c10::optional& running_var, + const std::optional& running_mean, + const std::optional& running_var, const Tensor& saved_mean, const Tensor& saved_invstd, bool train, @@ -6198,8 +6198,8 @@ Tensor batch_norm_jvp( result_t = input_t * invstd_p; } - c10::optional result_p = weight_p.defined() - ? 
c10::optional((input_p - mean_p) * invstd_p) + std::optional result_p = weight_p.defined() + ? std::optional((input_p - mean_p) * invstd_p) : c10::nullopt; return _affine_jvp( result_p, @@ -6237,8 +6237,8 @@ Tensor layer_norm_jvp( auto invstd_p = saved_invstd.view(view_size); auto result_t = _norm_jvp(input_p, input_t, mean_p, invstd_p, dims, numel); - c10::optional result_p = weight_p.defined() - ? c10::optional((input_p - mean_p) * invstd_p) + std::optional result_p = weight_p.defined() + ? std::optional((input_p - mean_p) * invstd_p) : c10::nullopt; return _affine_jvp( result_p, @@ -6280,7 +6280,7 @@ Tensor group_norm_jvp( /*eps=*/0) .view(input_shape); - c10::optional result_p = c10::nullopt; + std::optional result_p = c10::nullopt; if (weight_p.defined()) { std::vector view_size(input_t_reshaped.dim(), 1); view_size[1] = input_t_reshaped.size(1); @@ -6983,9 +6983,9 @@ mkldnn_rnn_layer_differentiable_backward( const Tensor& output, const Tensor& hy_, const Tensor& cy_, - const c10::optional& grad_output_r_opt, - const c10::optional& grad_hy_r_opt, - const c10::optional& grad_cy_r_opt, + const std::optional& grad_output_r_opt, + const std::optional& grad_hy_r_opt, + const std::optional& grad_cy_r_opt, bool reverse, int64_t mode, int64_t hidden_size, diff --git a/torch/csrc/autograd/FunctionsManual.h b/torch/csrc/autograd/FunctionsManual.h index c78f2b80c806a..dedff70be1ba3 100644 --- a/torch/csrc/autograd/FunctionsManual.h +++ b/torch/csrc/autograd/FunctionsManual.h @@ -31,14 +31,14 @@ struct TORCH_API IndexRangeGenerator { size_t i = 0; }; -TORCH_API Tensor toNonOptFwGrad(const c10::optional& t); -TORCH_API Tensor toNonOptPrimal(const c10::optional& t); -TORCH_API Tensor toNonOptTensor(const c10::optional& t); +TORCH_API Tensor toNonOptFwGrad(const std::optional& t); +TORCH_API Tensor toNonOptPrimal(const std::optional& t); +TORCH_API Tensor toNonOptTensor(const std::optional& t); -TORCH_API inline c10::optional wrap_opt_if( +TORCH_API inline std::optional wrap_opt_if( const Tensor& t, const bool cond) { - using OptTensor = c10::optional; + using OptTensor = std::optional; return cond ? 
OptTensor(t) : static_cast(c10::nullopt); } @@ -154,12 +154,12 @@ at::Tensor div_tensor_self_backward( const Tensor& grad, T other, ScalarType self_st, - const c10::optional& rounding_mode); + const std::optional& rounding_mode); at::Tensor div_tensor_other_backward( const Tensor& grad, const Tensor& self, const Tensor& other, - const c10::optional& rounding_mode); + const std::optional& rounding_mode); at::Tensor mvlgamma_backward( const at::Tensor& grad, const at::Tensor& self, @@ -314,8 +314,8 @@ at::Tensor mm_mat1_sparse_backward( std::tuple sparse_sampled_addmm_backward( const Tensor& grad, const Tensor& self, - const c10::optional& mat1, - const c10::optional& mat2, + const std::optional& mat1, + const std::optional& mat2, const Scalar& alpha, const Scalar& beta, const std::array& grad_input_mask); @@ -367,21 +367,21 @@ at::Tensor var_backward( at::Tensor grad, const at::Tensor& self, at::OptionalIntArrayRef dim, - const c10::optional& correction, + const std::optional& correction, bool keepdim); at::Tensor var_jvp( const at::Tensor& self_t, const at::Tensor& self_p, const at::Tensor& result, at::OptionalIntArrayRef dim_opt, - const c10::optional& correction, + const std::optional& correction, bool keepdim); at::Tensor std_backward( const at::Tensor& result, const at::Tensor& grad, const at::Tensor& self, at::OptionalIntArrayRef dim, - const c10::optional& correction, + const std::optional& correction, bool keepdim); Tensor mean_backward( const Tensor& grad, @@ -394,7 +394,7 @@ Tensor var_mean_backward( const Tensor& gmean, const Tensor& self, at::OptionalIntArrayRef dim_opt, - const c10::optional& correction, + const std::optional& correction, bool keepdim); Tensor std_mean_backward( const Tensor& gstd, @@ -402,7 +402,7 @@ Tensor std_mean_backward( const Tensor& self, const Tensor& std, at::OptionalIntArrayRef dim_opt, - const c10::optional& correction, + const std::optional& correction, bool keepdim); at::Tensor cholesky_backward( const at::Tensor& grad, @@ -465,33 +465,33 @@ at::Tensor infinitely_differentiable_mish_backward( Tensor infinitely_differentiable_logit_backward( const Tensor& grad, const Tensor& self, - c10::optional eps); + std::optional eps); Tensor binary_cross_entropy_target_backward( const Tensor& grad, const Tensor& self, const Tensor& target, - const c10::optional& weight, + const std::optional& weight, int64_t reduction); Tensor binary_cross_entropy_double_backward_target( const Tensor& grad, const Tensor& grad_output, const Tensor& self, const Tensor& target, - const c10::optional& weight, + const std::optional& weight, int64_t reduction); Tensor binary_cross_entropy_with_logits_backward( const Tensor& grad, const Tensor& input, const Tensor& target, - const c10::optional& weight_opt, - const c10::optional& pos_weight_opt, + const std::optional& weight_opt, + const std::optional& pos_weight_opt, int64_t reduction); at::Tensor binary_cross_entropy_with_logits_target_backward( const at::Tensor& grad_output, const at::Tensor& self, const at::Tensor& target, - const c10::optional& weight, - const c10::optional& pos_weight, + const std::optional& weight, + const std::optional& pos_weight, int64_t reduction); at::Tensor log_sigmoid_double_backward( const at::Tensor& grad, @@ -506,13 +506,13 @@ at::Tensor binary_cross_entropy_double_backward( const at::Tensor& grad, const at::Tensor& input, const at::Tensor& target, - const c10::optional& weight, + const std::optional& weight, int64_t reduction); at::Tensor binary_cross_entropy_double_backward_grad_output( const 
at::Tensor& grad, const at::Tensor& input, const at::Tensor& target, - const c10::optional& weight, + const std::optional& weight, int64_t reduction); at::Tensor smooth_l1_loss_double_backward( const at::Tensor& grad, @@ -577,7 +577,7 @@ at::Tensor embedding_dense_double_backward_symint( const c10::SymInt& padding_idx); at::Tensor index_backward( at::Tensor zeros_like_self, - const torch::List>& indices, + const torch::List>& indices, const at::Tensor& grad); at::Tensor _cudnn_ctc_loss_backward( const at::Tensor& grad_out, @@ -611,8 +611,8 @@ Tensor slice_backward_wrapper( const at::Tensor& grad, const c10::SymIntArrayRef& input_sizes, int64_t dim, - c10::optional start, - c10::optional end, + std::optional start, + std::optional end, c10::SymInt step); std::tuple linalg_eig_jvp( const Tensor& dA, @@ -667,9 +667,9 @@ std::tuple linalg_solve_triangular_backward( std::array output_mask); std::tuple _trilinear_backward( const Tensor& grad_out, - const c10::optional& i1, - const c10::optional& i2, - const c10::optional& i3, + const std::optional& i1, + const std::optional& i2, + const std::optional& i3, IntArrayRef expand1, IntArrayRef expand2, IntArrayRef expand3, @@ -692,17 +692,17 @@ Tensor linalg_matrix_exp_differential( bool adjoint); std::tuple batchnorm_double_backward( const Tensor& input, - const c10::optional& gamma, + const std::optional& gamma, const Tensor& ggI, const Tensor& ggG, const Tensor& ggB, const Tensor& gO, - const c10::optional& running_mean, - const c10::optional& running_var, + const std::optional& running_mean, + const std::optional& running_var, bool training, double eps, - const c10::optional& save_mean, - const c10::optional& save_invstd, + const std::optional& save_mean, + const std::optional& save_invstd, std::array output_mask); std::tuple _euclidean_dist_backward( const Tensor& grad, @@ -752,7 +752,7 @@ infinitely_differentiable_native_group_norm_backward( const Tensor& X, const Tensor& mean, const Tensor& rstd, - const c10::optional& gamma, + const std::optional& gamma, c10::SymInt N, const c10::SymInt& C, c10::SymInt HxW, @@ -790,7 +790,7 @@ Tensor amaxamin_jvp( bool keepdim); std::tuple layer_norm_double_backward( const Tensor& input, - const c10::optional& gamma, + const std::optional& gamma, const Tensor& ggI, const Tensor& ggG, const Tensor& ggB, @@ -919,8 +919,8 @@ Tensor batch_norm_jvp( const Tensor& weight_t, const Tensor& bias_p, const Tensor& bias_t, - const c10::optional& running_mean, - const c10::optional& running_var, + const std::optional& running_mean, + const std::optional& running_var, const Tensor& saved_mean, const Tensor& saved_invstd, bool train, @@ -1082,9 +1082,9 @@ mkldnn_rnn_layer_differentiable_backward( const Tensor& output, const Tensor& hy_, const Tensor& cy_, - const c10::optional& grad_output_r_opt, - const c10::optional& grad_hy_r_opt, - const c10::optional& grad_cy_r_opt, + const std::optional& grad_output_r_opt, + const std::optional& grad_hy_r_opt, + const std::optional& grad_cy_r_opt, bool reverse, int64_t mode, int64_t hidden_size, diff --git a/torch/csrc/autograd/TraceTypeManual.cpp b/torch/csrc/autograd/TraceTypeManual.cpp index 4134ef6d992ba..46e4014d8dd13 100644 --- a/torch/csrc/autograd/TraceTypeManual.cpp +++ b/torch/csrc/autograd/TraceTypeManual.cpp @@ -51,7 +51,7 @@ Tensor& copy_(Tensor& self, const Tensor& src, bool non_blocking) { const Tensor& resize_( const Tensor& self, IntArrayRef size, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { if (torch::jit::tracer::isTracing()) { 
if (jit::tracer::ArgumentStash::hasIntArrayRef("size")) { jit::tracer::ArgumentStash::popIntArrayRef("size"); @@ -70,7 +70,7 @@ const Tensor& resize_( const Tensor& resize_as_( const Tensor& self, const Tensor& the_template, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { if (torch::jit::tracer::isTracing()) { jit::tracer::warn("resize_as_", jit::tracer::WARN_RESIZE); jit::tracer::delValueTrace(self); diff --git a/torch/csrc/autograd/VariableTypeManual.cpp b/torch/csrc/autograd/VariableTypeManual.cpp index 38a63640c11e6..20f66694677e8 100644 --- a/torch/csrc/autograd/VariableTypeManual.cpp +++ b/torch/csrc/autograd/VariableTypeManual.cpp @@ -240,7 +240,7 @@ const Tensor& resize_( c10::DispatchKeySet ks, const Tensor& self, SymIntArrayRef size, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { auto& self_ = unpack(self, "self", 0); if (self.requires_grad()) { AT_ERROR("cannot resize variables that require grad"); @@ -262,7 +262,7 @@ const Tensor& resize_as_( c10::DispatchKeySet ks, const Tensor& self, const Tensor& the_template, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { auto& self_ = unpack(self, "self", 0); auto& the_template_ = unpack(the_template, "the_template", 1); if (self.requires_grad()) { @@ -400,7 +400,7 @@ static const Tensor& resize_( c10::DispatchKeySet ks, const Tensor& self, SymIntArrayRef size, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { // Hold sizes to verify if we actually resize `self`. // Explicitly copy data, since resizing can move original data // and make references invalid. @@ -424,7 +424,7 @@ static const Tensor& resize_as_( c10::DispatchKeySet ks, const Tensor& self, const Tensor& the_template, - c10::optional optional_memory_format) { + std::optional optional_memory_format) { // Hold sizes to verify if we actually resize `self`. // Explicitly copy data, since resizing can move original data // and make references invalid. diff --git a/torch/csrc/autograd/VariableTypeUtils.h b/torch/csrc/autograd/VariableTypeUtils.h index b8fa4b6c101a7..d5fe8a70dae17 100644 --- a/torch/csrc/autograd/VariableTypeUtils.h +++ b/torch/csrc/autograd/VariableTypeUtils.h @@ -166,7 +166,7 @@ struct Flatten : IterArgs { void operator()(const at::Tensor& x) { out.emplace_back(x); } - void operator()(const c10::optional& x) { + void operator()(const std::optional& x) { if (x.has_value()) out.emplace_back(x.value()); } @@ -233,8 +233,8 @@ inline at::Tensor as_view( } // If they cannot be shared, create the required view infos - c10::optional new_bw_info; - c10::optional new_fw_info; + std::optional new_bw_info; + std::optional new_fw_info; if (is_bw_differentiable) { auto bw_view_func = view_func ? 
view_func->clone_and_set() : nullptr; @@ -298,7 +298,7 @@ inline void check_no_requires_grad( } inline void check_no_requires_grad( - const c10::optional& tensor, + const std::optional& tensor, const char* name, const char* fn_name = "") { if (tensor.has_value()) { @@ -320,14 +320,14 @@ inline void check_no_requires_grad( } inline void check_no_requires_grad( - const c10::List>& tensors, + const c10::List>& tensors, const char* name, const char* fn_name = "") { // GradMode check is expensive, so check it only once for TensorLists if (!GradMode::is_enabled()) { return; } - for (c10::optional tensor : tensors) { + for (std::optional tensor : tensors) { if (tensor.has_value()) { check_no_requires_grad(*tensor, name, fn_name, /*check_grad_mode*/ false); } @@ -345,11 +345,11 @@ inline std::vector make_saved_variable_list( // Assumed that saved tensor lists are never inplace outputs inline std::vector make_saved_variable_list( - const c10::List>& tensors, + const c10::List>& tensors, const bool is_output = false) { return fmap( tensors, - [&is_output](const c10::optional& tensor) -> SavedVariable { + [&is_output](const std::optional& tensor) -> SavedVariable { if (tensor.has_value()) { return SavedVariable{*tensor, is_output /* is output */}; } else { diff --git a/torch/csrc/autograd/autograd.cpp b/torch/csrc/autograd/autograd.cpp index fd4265619fccd..4a550e7006389 100644 --- a/torch/csrc/autograd/autograd.cpp +++ b/torch/csrc/autograd/autograd.cpp @@ -165,7 +165,7 @@ static variable_list run_backward( void backward( const variable_list& tensors, const variable_list& grad_tensors, - c10::optional retain_graph, + std::optional retain_graph, bool create_graph, const variable_list& inputs) { variable_list gradients = _make_grads(tensors, grad_tensors); @@ -186,7 +186,7 @@ variable_list grad( const variable_list& outputs, const variable_list& inputs, const variable_list& grad_outputs, - c10::optional retain_graph, + std::optional retain_graph, bool create_graph, bool allow_unused) { variable_list gradients = _make_grads(outputs, grad_outputs); diff --git a/torch/csrc/autograd/autograd.h b/torch/csrc/autograd/autograd.h index 3537df9bc4a7d..94ee179225a4c 100644 --- a/torch/csrc/autograd/autograd.h +++ b/torch/csrc/autograd/autograd.h @@ -47,7 +47,7 @@ namespace torch::autograd { TORCH_API void backward( const variable_list& tensors, const variable_list& grad_tensors = {}, - c10::optional retain_graph = c10::nullopt, + std::optional retain_graph = c10::nullopt, bool create_graph = false, const variable_list& inputs = {}); @@ -81,7 +81,7 @@ TORCH_API variable_list grad( const variable_list& outputs, const variable_list& inputs, const variable_list& grad_outputs = {}, - c10::optional retain_graph = c10::nullopt, + std::optional retain_graph = c10::nullopt, bool create_graph = false, bool allow_unused = false); diff --git a/torch/csrc/autograd/autograd_not_implemented_fallback.cpp b/torch/csrc/autograd/autograd_not_implemented_fallback.cpp index 2cfca6817e855..acc8986efa6a2 100644 --- a/torch/csrc/autograd/autograd_not_implemented_fallback.cpp +++ b/torch/csrc/autograd/autograd_not_implemented_fallback.cpp @@ -339,12 +339,12 @@ static void autogradNotImplementedFallbackImpl( std::vector(stack->begin() + stack_start, stack->end()); std::vector> impl_saved; impl_saved.reserve(num_tensor_inputs); - std::vector> storage_saved; + std::vector> storage_saved; storage_saved.reserve(num_tensor_inputs); _foreach_tensor( [&](size_t idx, size_t _, const at::Tensor& t) { storage_saved.push_back( - t.has_storage() ? 
c10::optional(t.storage()) + t.has_storage() ? std::optional(t.storage()) : c10::nullopt); impl_saved.push_back(t.getIntrusivePtr()); }, diff --git a/torch/csrc/autograd/custom_function.cpp b/torch/csrc/autograd/custom_function.cpp index 41e2f1991a52b..1cf94bbe048fe 100644 --- a/torch/csrc/autograd/custom_function.cpp +++ b/torch/csrc/autograd/custom_function.cpp @@ -28,7 +28,7 @@ namespace torch::autograd { static void _process_forward_mode_AD( const variable_list& inputs, std::unordered_map inputs_mapping, - const at::ArrayRef> raw_outputs, + const at::ArrayRef> raw_outputs, const optional_variable_list& outputs, const std::unordered_set& non_differentiable, const std::unordered_set& dirty_inputs, @@ -258,7 +258,7 @@ static optional_variable_list _process_backward_mode_ad( const std::unordered_map& inputs_mapping, const std::unordered_set& non_differentiable, const std::unordered_set& dirty_inputs, - const at::ArrayRef> raw_outputs, + const at::ArrayRef> raw_outputs, const std::shared_ptr& cdata, const std::unordered_set& to_save_if_setup_context, const _view_as_self_fn_t& view_as_self_fn) { @@ -438,7 +438,7 @@ optional_variable_list _wrap_outputs( const variable_list& input_vars, const std::unordered_set& non_differentiable, const std::unordered_set& dirty_inputs, - const at::ArrayRef> raw_outputs, + const at::ArrayRef> raw_outputs, const std::shared_ptr& cdata, const _jvp_fn_t& jvp_user_function, const std::unordered_set& to_save_if_setup_context, diff --git a/torch/csrc/autograd/custom_function.h b/torch/csrc/autograd/custom_function.h index ebabc45334a5d..8c20bd8078207 100644 --- a/torch/csrc/autograd/custom_function.h +++ b/torch/csrc/autograd/custom_function.h @@ -12,15 +12,15 @@ namespace torch::autograd { -using optional_variable_list = std::vector>; +using optional_variable_list = std::vector>; using _jvp_fn_t = std::function; using _view_as_self_fn_t = std::function; -TORCH_API std::vector> _wrap_outputs( +TORCH_API std::vector> _wrap_outputs( const variable_list& input_vars, const std::unordered_set& non_differentiable, const std::unordered_set& dirty_inputs, - const at::ArrayRef> raw_outputs, + const at::ArrayRef> raw_outputs, const std::shared_ptr& cdata, const _jvp_fn_t& jvp_user_function, const std::unordered_set& to_save_if_setup_context, @@ -41,7 +41,7 @@ using forward_t = decltype(X::forward(nullptr, std::declval()...)); /// `forward` can take as many arguments as you want and should return either a /// variable list or a Variable. Use of any direct Variable arguments will be /// registered in the graph but no vectors/sets or any other data structures -/// will be traversed. You can use c10::optional as one of the arguments +/// will be traversed. You can use std::optional as one of the arguments /// and it will be registered as a variable in the graph if the argument has a /// value. It should take a pointer to `torch::autograd::AutogradContext` as the /// first argument. 
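(A concrete, hypothetical sketch of the pattern this comment describes: an optional tensor argument is registered in the graph only when it holds a value. It is not part of this change and assumes `bias`, when present, has the same shape as `input`.)

#include <torch/torch.h>
#include <optional>

struct AddBias : public torch::autograd::Function<AddBias> {
  // The std::optional argument is picked up by the autograd machinery
  // only when it contains a defined tensor.
  static torch::Tensor forward(
      torch::autograd::AutogradContext* ctx,
      const torch::Tensor& input,
      const std::optional<torch::Tensor>& bias) {
    ctx->saved_data["has_bias"] = bias.has_value();
    return bias.has_value() ? input + *bias : input;
  }

  static torch::autograd::variable_list backward(
      torch::autograd::AutogradContext* ctx,
      torch::autograd::variable_list grad_outputs) {
    auto grad = grad_outputs[0];
    // One gradient per forward input; an undefined tensor stands in for
    // "no gradient" when the optional bias was absent.
    auto grad_bias = ctx->saved_data["has_bias"].toBool() ? grad : torch::Tensor();
    return {grad, grad_bias};
  }
};

// Usage sketch:
//   auto y = AddBias::apply(x, std::optional<torch::Tensor>(b));  // bias participates
//   auto z = AddBias::apply(x, std::optional<torch::Tensor>());   // no bias in the graph
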
Variables can be saved in the `ctx` using @@ -247,7 +247,7 @@ struct ExtractVariables : IterArgs { variable_list& list_; ExtractVariables(std::vector& is_var, variable_list& list) : is_var_(is_var), list_(list) {} - void operator()(const c10::optional& x) { + void operator()(const std::optional& x) { // NOLINTNEXTLINE(bugprone-branch-clone) if (x.has_value() && x.value().defined()) { is_var_.push_back(true); @@ -282,30 +282,30 @@ inline void extract_vars( template std::enable_if_t, T> to_output_type( - std::vector>& output_list) { + std::vector>& output_list) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) variable_list result; std::transform( output_list.begin(), output_list.end(), std::back_inserter(result), - [](const c10::optional& var) { return *var; }); + [](const std::optional& var) { return *var; }); return result; } template std::enable_if_t, T> to_output_type( - std::vector>& output_list) { + std::vector>& output_list) { return *output_list[0]; } -inline std::vector> to_optional(Variable& output) { - return std::vector>{output}; +inline std::vector> to_optional(Variable& output) { + return std::vector>{output}; } -inline std::vector> to_optional(variable_list& output) { +inline std::vector> to_optional(variable_list& output) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - std::vector> result; + std::vector> result; std::transform( output.begin(), output.end(), diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h index becc73396e66d..c8c3538a061f1 100644 --- a/torch/csrc/autograd/function.h +++ b/torch/csrc/autograd/function.h @@ -239,7 +239,7 @@ struct TORCH_API Node : std::enable_shared_from_this { * elements are on different devices (across multiple GPUs, for example) * they may have different streams. 
*/ - c10::optional stream() { + std::optional stream() { auto opt_device_type = at::getAccelerator(); if (!opt_device_type.has_value()) { return c10::nullopt; @@ -703,7 +703,7 @@ struct MakeNextFunctionList : IterArgs { void operator()(const Variable* variable) { operator()(*variable); } - void operator()(const c10::optional& variable) { + void operator()(const std::optional& variable) { if (variable.has_value()) { operator()(*variable); } else { diff --git a/torch/csrc/autograd/functions/comm.cpp b/torch/csrc/autograd/functions/comm.cpp index 9bcd511285734..e2f23f363d7a0 100644 --- a/torch/csrc/autograd/functions/comm.cpp +++ b/torch/csrc/autograd/functions/comm.cpp @@ -17,9 +17,9 @@ namespace torch { namespace autograd { Scatter::Scatter( std::vector devices, - c10::optional> chunk_sizes, + std::optional> chunk_sizes, int64_t dim, - c10::optional>> streams, + std::optional>> streams, bool unsqueeze_scalars) : devices_(std::move(devices)), chunk_sizes_(std::move(chunk_sizes)), diff --git a/torch/csrc/autograd/functions/comm.h b/torch/csrc/autograd/functions/comm.h index 9b1f0daf50bce..b0e6900729955 100644 --- a/torch/csrc/autograd/functions/comm.h +++ b/torch/csrc/autograd/functions/comm.h @@ -17,9 +17,9 @@ namespace autograd { struct TORCH_CUDA_CU_API Scatter : public Node { explicit Scatter( std::vector devices, - c10::optional> chunk_sizes = c10::nullopt, + std::optional> chunk_sizes = c10::nullopt, int64_t dim = 0, - c10::optional>> streams = + std::optional>> streams = c10::nullopt, bool unsqueeze_scalars = false); ~Scatter() override; @@ -27,9 +27,9 @@ struct TORCH_CUDA_CU_API Scatter : public Node { variable_list apply(variable_list&& inputs) override; std::vector devices_; - c10::optional> chunk_sizes_; + std::optional> chunk_sizes_; int64_t dim_; - c10::optional>> streams_; + std::optional>> streams_; bool unsqueeze_scalars_; }; diff --git a/torch/csrc/autograd/functions/utils.h b/torch/csrc/autograd/functions/utils.h index 3cc2575da8f5d..db916dc0bbbfa 100644 --- a/torch/csrc/autograd/functions/utils.h +++ b/torch/csrc/autograd/functions/utils.h @@ -46,7 +46,7 @@ struct ComputeRequiresGrad : IterArgs { out = true; } } - void operator()(const c10::optional& tensor) { + void operator()(const std::optional& tensor) { if (tensor.has_value()) { (*this)(*tensor); } @@ -88,7 +88,7 @@ inline void set_history( } } -inline bool isFwGradDefined(const c10::optional& t) { +inline bool isFwGradDefined(const std::optional& t) { return t.has_value() && t->defined() && t->_fw_grad(/*level */ 0).defined(); } @@ -101,7 +101,7 @@ inline bool isFwGradDefinedTensorList(const at::ITensorListRef& variables) { } inline bool isFwGradDefinedTensorList( - const c10::List>& li) { + const c10::List>& li) { bool ret = false; for (auto i : c10::irange(li.size())) { auto t = li.get(i); diff --git a/torch/csrc/autograd/graph_task.h b/torch/csrc/autograd/graph_task.h index 03a9647cad833..e4a7ae4dad18e 100644 --- a/torch/csrc/autograd/graph_task.h +++ b/torch/csrc/autograd/graph_task.h @@ -125,7 +125,7 @@ struct GraphTask : std::enable_shared_from_this { // Per-device current streams of the execute() that called this GraphTask. // These will be synced with leaf_streams in exec_post_processing. - std::vector> caller_current_streams_; + std::vector> caller_current_streams_; // Collects caller_current_streams_ for the accelerator device. 
void stash_current_streams(); diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index e04d853198fbb..9eb1031ff02c0 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -1081,7 +1081,7 @@ static PyObject* push_on_torch_dispatch_stack( using c10::impl::TorchDispatchModeKey; // When we push a mode onto the mode stack, we need to // check if it's an "infra" mode, by checking its _mode_key attribute. - c10::optional mode_key = c10::nullopt; + std::optional mode_key = c10::nullopt; py::object maybe_mode_key_obj = PyObject_FastGetAttrString(arg, "_mode_key"); if (maybe_mode_key_obj) { @@ -1105,7 +1105,7 @@ static PyObject* pop_torch_dispatch_stack( PyObject* _unused, PyObject* maybe_mode_key) { HANDLE_TH_ERRORS - c10::optional mode_key = c10::nullopt; + std::optional mode_key = c10::nullopt; PyObject* r = nullptr; if (maybe_mode_key != Py_None) { mode_key = py::cast(maybe_mode_key); diff --git a/torch/csrc/autograd/input_buffer.cpp b/torch/csrc/autograd/input_buffer.cpp index 2adfc1fc7efae..6c12bbadc5d2d 100644 --- a/torch/csrc/autograd/input_buffer.cpp +++ b/torch/csrc/autograd/input_buffer.cpp @@ -129,8 +129,8 @@ static void accumulate( void InputBuffer::add( size_t pos, Variable&& var, - const c10::optional& opt_producer_stream, - const c10::optional& opt_consumer_stream) { + const std::optional& opt_producer_stream, + const std::optional& opt_consumer_stream) { TORCH_INTERNAL_ASSERT(pos < buffer.size()); if (!var.defined()) { return; @@ -159,7 +159,7 @@ void InputBuffer::add( // Accumulation happens on the var device's default stream. TORCH_INTERNAL_ASSERT(device_of(var)); - c10::optional opt_accumulate_stream = c10::nullopt; + std::optional opt_accumulate_stream = c10::nullopt; const auto device_type = device_of(var).value().type(); // NOLINTNEXTLINE(bugprone-unchecked-optional-access) if (device_of(var)->is_cuda() || device_of(var)->is_privateuseone()) { @@ -179,7 +179,7 @@ void InputBuffer::add( record_stream_any_impl(var, *opt_accumulate_stream); } } else { - c10::optional opt_sync_stream = c10::nullopt; + std::optional opt_sync_stream = c10::nullopt; const auto guard = c10::impl::VirtualGuardImpl{device_type}; if (on_consumer && !on_producer) { // (3a) diff --git a/torch/csrc/autograd/input_buffer.h b/torch/csrc/autograd/input_buffer.h index d8ef3396cb6d8..7e471ef528bb0 100644 --- a/torch/csrc/autograd/input_buffer.h +++ b/torch/csrc/autograd/input_buffer.h @@ -27,8 +27,8 @@ struct InputBuffer { TORCH_API void add( size_t pos, Variable&& var, - const c10::optional& opt_producer_stream, - const c10::optional& opt_consumer_stream); + const std::optional& opt_producer_stream, + const std::optional& opt_consumer_stream); at::Device device() const; diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp index 0c73c8b7a72a1..64b85dd72f592 100644 --- a/torch/csrc/autograd/profiler_kineto.cpp +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -80,16 +81,18 @@ struct OpArgData { std::vector dtypes; std::vector concrete_inputs; std::vector> shapes_for_kineto_event; + std::vector strides; }; auto parseArgData( const std::vector& input_shapes, const std::vector& concrete_inputs) { if (input_shapes.empty()) { - return OpArgData{false, {}, {}, {}, {}}; + return OpArgData{false, {}, {}, {}, {}, {}}; } std::vector shapes(input_shapes.size()); + std::vector strides(input_shapes.size()); std::vector> shapes_for_kineto_event( input_shapes.size()); 
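The profiler hunks here extend OpArgData with a per-input strides record alongside the existing shapes, so "Input Dims" gains a companion "Input Strides" metadata entry. A small sketch, independent of the profiler internals, of the sizes/strides pair being captured per tensor input:

#include <torch/torch.h>
#include <iostream>

int main() {
  auto a = torch::randn({4, 8});
  auto b = a.t();  // transposed view: same storage, permuted strides

  // These are the two per-tensor facts now recorded side by side.
  std::cout << "a: sizes " << a.sizes() << ", strides " << a.strides() << "\n";
  std::cout << "b: sizes " << b.sizes() << ", strides " << b.strides() << "\n";
  return 0;
}
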
@@ -103,14 +106,19 @@ auto parseArgData( shapes[i] = t.sizes_; shapes_for_kineto_event[i] = t.sizes_; dtypes[i] = std::string(scalarTypeToTypeMeta(t.dtype_).name()); + strides[i] = t.strides_; }, [&](const std::vector& l) { std::vector> shape; shape.reserve(l.size()); + std::vector> stride; + stride.reserve(l.size()); for (const auto& t : l) { shape.emplace_back(t.sizes_); + stride.emplace_back(t.strides_); } shapes[i] = shape; + strides[i] = stride; dtypes[i] = "TensorList"; }, [&](const c10::IValue& val) { dtypes[i] = "Scalar"; }, @@ -141,7 +149,12 @@ auto parseArgData( } return OpArgData{ - true, shapes, dtypes, concrete_inputs_list, shapes_for_kineto_event}; + true, + shapes, + dtypes, + concrete_inputs_list, + shapes_for_kineto_event, + strides}; } struct MetadataBase { @@ -194,7 +207,7 @@ struct AddTensorboardFields : public MetadataBase { result->visit_if_base([&, this](const auto& i) -> void { this->addMetadata("Python id", std::to_string(i.id_)); - c10::optional parent_id; + std::optional parent_id; std::shared_ptr parent = result->parent_.lock(); while (parent && !parent_id.has_value()) { parent->visit_if_base( @@ -236,6 +249,7 @@ struct AddGenericMetadata : public MetadataBase { if (arg_data.has_data) { if (get_record_concrete_inputs_enabled()) { addMetadata("Input Dims", variantShapesToStr(arg_data.shapes)); + addMetadata("Input Strides", variantShapesToStr(arg_data.strides)); } else { addMetadata( "Input Dims", shapesToStr(arg_data.shapes_for_kineto_event)); @@ -625,6 +639,9 @@ void enableProfiler( } else if (config.state == ProfilerState::ITT) { torch::profiler::impl::pushITTCallbacks(config, scopes); return; + } else if (config.state == ProfilerState::PRIVATEUSE1) { + torch::profiler::impl::pushPRIVATEUSE1CallbacksStub(config, scopes); + return; } TORCH_CHECK( @@ -660,7 +677,8 @@ std::unique_ptr disableProfiler() { config.state == ProfilerState::KINETO_PRIVATEUSE1_FALLBACK || config.state == ProfilerState::KINETO_ONDEMAND || config.state == ProfilerState::NVTX || - config.state == ProfilerState::ITT), + config.state == ProfilerState::ITT || + config.state == ProfilerState::PRIVATEUSE1), "Can't disable Kineto profiler when it's not running"); state_ptr->removeCallback(); @@ -672,9 +690,11 @@ std::unique_ptr disableProfiler() { return std::make_unique(); } - // Shared among NVTX, KINETO, KINETO_GPU_FALLBACK, KINETO_PRIVATEUSE1_FALLBACK + // Shared among NVTX, PRIVATEUSE1, KINETO, KINETO_GPU_FALLBACK, + // KINETO_PRIVATEUSE1_FALLBACK std::unique_ptr result; - if (state_ptr->config().state == ProfilerState::NVTX) { + if (state_ptr->config().state == ProfilerState::NVTX || + state_ptr->config().state == ProfilerState::PRIVATEUSE1) { result = std::make_unique(); } diff --git a/torch/csrc/autograd/profiler_legacy.cpp b/torch/csrc/autograd/profiler_legacy.cpp index 04c676fc2b497..b9387479667e8 100644 --- a/torch/csrc/autograd/profiler_legacy.cpp +++ b/torch/csrc/autograd/profiler_legacy.cpp @@ -169,7 +169,7 @@ struct ProfilerLegacyThreadLocalState : public ProfilerStateBase { std::unordered_map> event_lists_map_; - c10::optional>> remoteProfiledEvents_; + std::optional>> remoteProfiledEvents_; }; thread_event_lists ProfilerLegacyThreadLocalState::consolidate() { @@ -429,7 +429,7 @@ void enableProfilerLegacy( } thread_event_lists disableProfilerLegacy( - c10::optional profilerDisableOptions) { + std::optional profilerDisableOptions) { auto cleanupTLSState = profilerDisableOptions ? 
profilerDisableOptions->cleanupTLSState : true; auto consolidate = diff --git a/torch/csrc/autograd/profiler_legacy.h b/torch/csrc/autograd/profiler_legacy.h index e74ddd8a2296e..9bd88b0b3dc51 100644 --- a/torch/csrc/autograd/profiler_legacy.h +++ b/torch/csrc/autograd/profiler_legacy.h @@ -335,7 +335,7 @@ TORCH_API void enableProfilerLegacy( const torch::profiler::impl::ProfilerConfig&); using thread_event_lists = std::vector>; TORCH_API thread_event_lists disableProfilerLegacy( - c10::optional profilerDisableOptions = + std::optional profilerDisableOptions = c10::nullopt); // adds profiledEvents to the current thread local recorded events. Each event @@ -376,9 +376,9 @@ struct TORCH_API RecordProfile { struct TORCH_API TLSLegacyProfilerGuard { explicit TLSLegacyProfilerGuard( const torch::profiler::impl::ProfilerConfig& cfg, - c10::optional> + std::optional> resultCallback = c10::nullopt, - c10::optional profilerDisableOptions = + std::optional profilerDisableOptions = c10::nullopt) : cb_(std::move(resultCallback)), profilerDisableOptions_(profilerDisableOptions) { @@ -397,9 +397,9 @@ struct TORCH_API TLSLegacyProfilerGuard { } private: - c10::optional> cb_; + std::optional> cb_; // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) - const c10::optional profilerDisableOptions_; + const std::optional profilerDisableOptions_; }; } // namespace profiler diff --git a/torch/csrc/autograd/profiler_python.cpp b/torch/csrc/autograd/profiler_python.cpp index da1cedfdb5a97..799188be9a686 100644 --- a/torch/csrc/autograd/profiler_python.cpp +++ b/torch/csrc/autograd/profiler_python.cpp @@ -220,7 +220,7 @@ struct ExtendedPyCallConfig { struct Cache { // `nn.Module.forward` or `optim.Optimizer._optimizer_step_code` - c10::optional location_; + std::optional location_; ska::flat_hash_map cls_and_parameters_; ska::flat_hash_map cls_names_; }; @@ -300,7 +300,7 @@ class ValueCache { load(callsite.value_)}; } - c10::optional recordIfTensor(py::handle p); + std::optional recordIfTensor(py::handle p); std::vector> unpackTensorMap( const py::dict& tensor_map); void trimPrefixes(); @@ -348,9 +348,9 @@ TensorMetadata toTensorMetadata(PyObject* self) { m.layout_ == at::kStrided ? t.strides().vec() : std::vector()}; } -c10::optional ValueCache::recordIfTensor(py::handle p) { +std::optional ValueCache::recordIfTensor(py::handle p) { return THPVariable_CheckExact(p.ptr()) - ? c10::optional{toTensorMetadata(p.ptr())} + ? 
std::optional{toTensorMetadata(p.ptr())} : c10::nullopt; } diff --git a/torch/csrc/autograd/python_function.cpp b/torch/csrc/autograd/python_function.cpp index 341d2886699a1..33300b001819b 100644 --- a/torch/csrc/autograd/python_function.cpp +++ b/torch/csrc/autograd/python_function.cpp @@ -619,7 +619,7 @@ static void _wrap_outputs( auto non_differentiable = _parse_non_differentiable(self); auto dirty_inputs = _mark_dirty(self); - std::vector> raw_output_vars; + std::vector> raw_output_vars; raw_output_vars.reserve(num_outputs); for (const auto i : c10::irange(num_outputs)) { PyObject* obj = PyTuple_GET_ITEM(raw_output, i); @@ -746,7 +746,7 @@ static void _wrap_outputs( static void _get_tensors_to_save( THPFunction* self, std::unordered_set& to_save_if_setup_context, - std::vector>& tensors_to_save, + std::vector>& tensors_to_save, bool overridden_setup_context, bool is_executable) { if (self->saved_for_forward && overridden_setup_context) { @@ -804,7 +804,7 @@ static void _get_tensors_to_save( } // Save any variables that requested by to_save static void _save_variables( - const std::vector>& tensors_to_save, + const std::vector>& tensors_to_save, const std::shared_ptr& cdata_ptr, THPFunction* self) { if (!self->to_save) @@ -1106,7 +1106,7 @@ PyObject* process_outputs( } std::unordered_set to_save_if_setup_context{}; - std::vector> tensors_to_save{}; + std::vector> tensors_to_save{}; _get_tensors_to_save( grad_fn, to_save_if_setup_context, diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp index 5161cbc53a8c4..078b0f92124cb 100644 --- a/torch/csrc/autograd/python_variable.cpp +++ b/torch/csrc/autograd/python_variable.cpp @@ -267,7 +267,7 @@ PyObject* THPVariable_Wrap(at::TensorBase var) { c10::impl::PyInterpreterStatus::DEFINITELY_UNINITIALIZED); } - c10::optional mb_obj = + std::optional mb_obj = var.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj( getPyInterpreter(), /*ignore_hermetic_tls=*/false); c10::impl::PyInterpreterStatus status{}; @@ -587,14 +587,14 @@ static PyObject* view_func_impl( auto& view_func = view_info.view_fn(); // Determine new SymInt / tensor state as needed. 
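// --- editor's sketch (illustration only, not part of the patch) -------------
// Standalone illustration of the ProfilerState::PRIVATEUSE1 support added in
// the profiler_kineto.cpp hunks further up: enabling dispatches to a
// backend-registered callback stub, and disabling groups PRIVATEUSE1 with NVTX
// when deciding to hand back an empty result instead of a Kineto trace. The
// names below are simplified stand-ins for the real ProfilerConfig /
// pushPRIVATEUSE1CallbacksStub machinery.
enum class ProfilerStateSketch { KINETO, NVTX, ITT, PRIVATEUSE1 };

void pushPrivateUse1Callbacks() { /* backend-specific hook would run here */ }
void pushKinetoCallbacks() { /* default Kineto path */ }

void enableProfilerSketch(ProfilerStateSketch state) {
  if (state == ProfilerStateSketch::PRIVATEUSE1) {
    pushPrivateUse1Callbacks();  // new branch mirrored from the diff
    return;
  }
  pushKinetoCallbacks();
}

bool returnsEmptyResultOnDisable(ProfilerStateSketch state) {
  // PRIVATEUSE1 joins NVTX in the "no Kineto trace to collect" group.
  return state == ProfilerStateSketch::NVTX ||
      state == ProfilerStateSketch::PRIVATEUSE1;
}
// -----------------------------------------------------------------------------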
- c10::optional> new_symints = c10::nullopt; + std::optional> new_symints = c10::nullopt; if (symint_visitor_fn != Py_None) { new_symints = map_py_func( py::cast(symint_visitor_fn), view_func.get_symints()); } - c10::optional> new_tensors = c10::nullopt; + std::optional> new_tensors = c10::nullopt; if (tensor_visitor_fn != Py_None) { new_tensors = map_py_func( py::cast(tensor_visitor_fn), @@ -815,7 +815,7 @@ static PyObject* THPVariable_make_wrapper_subclass( auto sym_sizes = r.symintlist(1); auto sym_strides_own = r.symintlistOptional(2); auto sym_strides = - static_cast>(sym_strides_own); + static_cast>(sym_strides_own); auto sym_storage_offset = r.toSymIntOptional(3); c10::SymInt size_bytes; @@ -1931,7 +1931,7 @@ void THPVariable_subclass_dealloc(PyObject* self) { if (type->tp_del) { PyObject_GC_Track(self); type->tp_del(self); - if (self->ob_refcnt > 0) { + if (Py_REFCNT(self) > 0) { /* Resurrected */ return; } diff --git a/torch/csrc/autograd/python_variable_indexing.cpp b/torch/csrc/autograd/python_variable_indexing.cpp index e3cdd04f0965a..fdcafd6cd7091 100644 --- a/torch/csrc/autograd/python_variable_indexing.cpp +++ b/torch/csrc/autograd/python_variable_indexing.cpp @@ -178,7 +178,7 @@ static inline Variable applySlicing( variable_list& outIndices, bool is_tracing, const at::Device& self_device, - const c10::optional& self_ndim, + const std::optional& self_ndim, int64_t specified_dims) { int64_t size = PyTuple_GET_SIZE(index); // NOLINT(cppcoreguidelines-pro-type-cstyle-cast) @@ -200,9 +200,9 @@ static inline Variable applySlicing( // nested tensor does not have a size (yet) so for now we represent its size // as null may need to be changed after we reach a better solution for // nested tensor size - c10::optional result_sizes = result.is_nested() - ? c10::optional(c10::nullopt) - : c10::optional(result.sym_sizes()); + std::optional result_sizes = result.is_nested() + ? std::optional(c10::nullopt) + : std::optional(result.sym_sizes()); result = at::indexing::handleDimInMultiDimIndexing( /*prev_dim_result=*/result, /*original_tensor=*/self, diff --git a/torch/csrc/autograd/record_function_ops.cpp b/torch/csrc/autograd/record_function_ops.cpp index e5153ae4028aa..e3a3299dc9c59 100644 --- a/torch/csrc/autograd/record_function_ops.cpp +++ b/torch/csrc/autograd/record_function_ops.cpp @@ -20,7 +20,7 @@ namespace profiler { // callbacks. 
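// --- editor's sketch (illustration only, not part of the patch) -------------
// The python_variable.cpp hunk above swaps a direct read of self->ob_refcnt
// for the Py_REFCNT() accessor. Reading the struct field directly is
// increasingly tied to CPython internals, while Py_REFCNT() is the documented
// accessor. Minimal illustration assuming <Python.h> is available;
// resurrect_check is a hypothetical helper, not a CPython or PyTorch function.
#include <Python.h>

static bool resurrect_check(PyObject* self) {
  // After running a finalizer, the object may have been resurrected:
  // it is still alive if something re-acquired a reference to it.
  return Py_REFCNT(self) > 0;  // portable replacement for self->ob_refcnt > 0
}
// -----------------------------------------------------------------------------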
static void record_function_enter( const std::string& name, - const c10::optional& args, + const std::optional& args, at::RecordFunction& rec) { if (rec.isActive()) { if (rec.needsInputs() && args.has_value()) { @@ -35,7 +35,7 @@ static void record_function_enter( // Legacy signature using cpp_custom_type_hack static at::Tensor record_function_enter_legacy( const std::string& name, - const c10::optional& args) { + const std::optional& args) { auto rec = std::make_unique(at::RecordScope::USER_SCOPE); record_function_enter(name, args, *rec); return at::cpp_custom_type_hack::create(std::move(rec), at::TensorOptions()); @@ -44,7 +44,7 @@ static at::Tensor record_function_enter_legacy( // New signature using custom_class c10::intrusive_ptr record_function_enter_new( const std::string& name, - const c10::optional& args) { + const std::optional& args) { auto rec = c10::make_intrusive(at::RecordScope::USER_SCOPE); record_function_enter(name, args, rec->record); diff --git a/torch/csrc/autograd/record_function_ops.h b/torch/csrc/autograd/record_function_ops.h index d37aba7dfff85..a145523c1bf8a 100644 --- a/torch/csrc/autograd/record_function_ops.h +++ b/torch/csrc/autograd/record_function_ops.h @@ -17,7 +17,7 @@ struct PythonRecordFunction : public torch::CustomClassHolder { // callbacks. TORCH_API c10::intrusive_ptr record_function_enter_new( const std::string& name, - const c10::optional& args = c10::nullopt); + const std::optional& args = c10::nullopt); // Schedules RecordFunction's end callbacks to be run on completion of a future. TORCH_API c10::intrusive_ptr _call_end_callbacks_on_fut_new( diff --git a/torch/csrc/autograd/saved_variable.cpp b/torch/csrc/autograd/saved_variable.cpp index 4bd44339c3b45..c4d4566434325 100644 --- a/torch/csrc/autograd/saved_variable.cpp +++ b/torch/csrc/autograd/saved_variable.cpp @@ -117,7 +117,7 @@ void SavedVariable::reset_data() { } SavedVariable::SavedVariable( - const c10::optional& variable, + const std::optional& variable, bool is_output, bool is_inplace_on_view) : SavedVariable( diff --git a/torch/csrc/autograd/saved_variable.h b/torch/csrc/autograd/saved_variable.h index c9a358ede89e6..e249209f9f63b 100644 --- a/torch/csrc/autograd/saved_variable.h +++ b/torch/csrc/autograd/saved_variable.h @@ -26,7 +26,7 @@ class TORCH_API SavedVariable { bool is_output, bool is_inplace_on_view = false); SavedVariable( - const c10::optional& variable, + const std::optional& variable, bool is_output, bool is_inplace_on_view = false); SavedVariable(SavedVariable&&) = default; diff --git a/torch/csrc/autograd/utils/python_arg_parsing.h b/torch/csrc/autograd/utils/python_arg_parsing.h index 7701e97fe9189..326221e44d147 100644 --- a/torch/csrc/autograd/utils/python_arg_parsing.h +++ b/torch/csrc/autograd/utils/python_arg_parsing.h @@ -12,11 +12,11 @@ namespace utils { // The parameter allow_copy is to accept copy for Tensor.to (and by proxy // PackedSequences.to) but not nn.Module.to. 
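// --- editor's sketch (illustration only, not part of the patch) -------------
// Standalone illustration of the record_function_ops.cpp signatures above: the
// user-supplied "args" string is optional, and it is only forwarded when the
// record is active, wants inputs, and the caller actually provided a value.
// RecordSketch stands in for at::RecordFunction.
#include <optional>
#include <string>
#include <vector>

struct RecordSketch {
  bool active = true;
  bool needs_inputs = false;
  std::vector<std::string> inputs;
};

void record_function_enter_sketch(
    const std::string& name,
    const std::optional<std::string>& args,
    RecordSketch& rec) {
  (void)name;
  if (rec.active) {
    if (rec.needs_inputs && args.has_value()) {
      rec.inputs.push_back(*args);  // only dereference when a value exists
    }
  }
}
// -----------------------------------------------------------------------------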
inline std::tuple< - c10::optional, - c10::optional, + std::optional, + std::optional, bool, bool, - c10::optional> + std::optional> parse_to_conversion(PythonArgs& r, bool allow_copy) { if (r.idx == 0) { if (!allow_copy && !r.isNone(3)) diff --git a/torch/csrc/autograd/variable.cpp b/torch/csrc/autograd/variable.cpp index 07e37463cbd38..da987001e2ecc 100644 --- a/torch/csrc/autograd/variable.cpp +++ b/torch/csrc/autograd/variable.cpp @@ -42,8 +42,8 @@ static std::unique_ptr create_view_func_matching(const Variable& t) { DifferentiableViewMeta::DifferentiableViewMeta( at::TensorImpl* self_impl, - c10::optional backward_info, - c10::optional forward_info, + std::optional backward_info, + std::optional forward_info, bool shared_view_info, CreationMeta creation_meta) : AutogradMeta(self_impl), @@ -581,10 +581,10 @@ bool VariableHooks::retains_grad(const at::TensorBase& self) const { void VariableHooks::_backward( const Tensor& self, at::TensorList inputs, - const c10::optional& gradient, - c10::optional keep_graph, + const std::optional& gradient, + std::optional keep_graph, bool create_graph) const { - // TODO torch::autograd::backward should take the c10::optional + // TODO torch::autograd::backward should take the std::optional // gradient directly instead of us having to unwrap it to Tensor _gradient // here. Tensor _gradient = gradient.has_value() ? *gradient : Tensor(); diff --git a/torch/csrc/autograd/variable.h b/torch/csrc/autograd/variable.h index aa9ee76f3dc95..d60f37085f380 100644 --- a/torch/csrc/autograd/variable.h +++ b/torch/csrc/autograd/variable.h @@ -682,8 +682,8 @@ TORCH_API void handle_view_on_rebase( struct TORCH_API DifferentiableViewMeta : public AutogradMeta { private: /// Information about the views - c10::optional backward_info_; - c10::optional forward_info_; + std::optional backward_info_; + std::optional forward_info_; // Optimization to reduce the number of ViewInfo we create. // In the (very common) case where backward_info_ == forward_info_, we only @@ -766,8 +766,8 @@ struct TORCH_API DifferentiableViewMeta : public AutogradMeta { DifferentiableViewMeta( at::TensorImpl* self_impl, - c10::optional backward_info, - c10::optional forward_info, + std::optional backward_info, + std::optional forward_info, bool shared_view_info, CreationMeta creation_meta = CreationMeta::DEFAULT); }; @@ -796,8 +796,8 @@ struct TORCH_API DifferentiableViewMeta : public AutogradMeta { // Differentiable view. Track history with DifferentiableViewMeta. 
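// --- editor's sketch (illustration only, not part of the patch) -------------
// The python_arg_parsing.h hunk above returns a tuple whose optional element
// types were lost in the flattened diff; for Tensor.to() parsing they would be
// a device, a dtype and a memory format (inferred, so treat the exact types as
// an assumption). Standalone sketch of the "tuple of optionals" return shape.
#include <optional>
#include <string>
#include <tuple>

struct DeviceSketch { std::string str; };
enum class DTypeSketch { Float, Half };
enum class MemoryFormatSketch { Contiguous, ChannelsLast };

std::tuple<
    std::optional<DeviceSketch>,
    std::optional<DTypeSketch>,
    bool,                               // non_blocking
    bool,                               // copy
    std::optional<MemoryFormatSketch>>
parse_to_conversion_sketch(bool has_device, bool has_dtype) {
  std::optional<DeviceSketch> device;
  std::optional<DTypeSketch> dtype;
  std::optional<MemoryFormatSketch> memory_format;
  if (has_device) device = DeviceSketch{"cuda:0"};
  if (has_dtype) dtype = DTypeSketch::Half;
  return {device, dtype, /*non_blocking=*/false, /*copy=*/false, memory_format};
}
// -----------------------------------------------------------------------------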
inline Variable make_variable_differentiable_view( const at::Tensor& data, - c10::optional backward_info, - c10::optional forward_info, + std::optional backward_info, + std::optional forward_info, bool shared_view_info, CreationMeta creation_meta, bool allow_tensor_metadata_change = true) { @@ -927,8 +927,8 @@ struct VariableHooks final : at::impl::VariableHooksInterface { void _backward( const at::Tensor& self, at::TensorList inputs, - const c10::optional& gradient, - c10::optional keep_graph, + const std::optional& gradient, + std::optional keep_graph, bool create_graph) const override; void requires_grad_(const at::TensorBase& self, bool _requires_grad) const override; diff --git a/torch/csrc/cuda/Graph.cpp b/torch/csrc/cuda/Graph.cpp index 83c60d059f8dd..472151fec6097 100644 --- a/torch/csrc/cuda/Graph.cpp +++ b/torch/csrc/cuda/Graph.cpp @@ -30,7 +30,7 @@ void THCPGraph_init(PyObject* module) { .def( "capture_begin", [](::at::cuda::CUDAGraph& self, - c10::optional pool_opt, + std::optional pool_opt, std::string capture_error_mode) { cudaStreamCaptureMode capture_mode; c10::cuda::MempoolId_t pool = pool_opt.has_value() diff --git a/torch/csrc/cuda/Module.cpp b/torch/csrc/cuda/Module.cpp index e622c254a5003..030c5a2b5ccf6 100644 --- a/torch/csrc/cuda/Module.cpp +++ b/torch/csrc/cuda/Module.cpp @@ -956,8 +956,8 @@ static void registerCudaDeviceProperties(PyObject* module) { m.def( "_cuda_record_memory_history", static_cast, - c10::optional, + std::optional, + std::optional, const std::string&, size_t)>(torch::cuda::_record_memory_history)); diff --git a/torch/csrc/cuda/comm.cpp b/torch/csrc/cuda/comm.cpp index c8bbec87caefb..c7c3cb396304c 100644 --- a/torch/csrc/cuda/comm.cpp +++ b/torch/csrc/cuda/comm.cpp @@ -37,7 +37,7 @@ struct unique_type_checker { unique = type_id_.value() == type_id; } - c10::optional type_id_; + std::optional type_id_; bool unique = true; }; @@ -232,7 +232,7 @@ std::vector& scatter_out( const at::Tensor& tensor, std::vector& out_tensors, int64_t dim, - const c10::optional>>& + const std::optional>>& streams) { TORCH_CHECK( !out_tensors.empty(), @@ -313,9 +313,9 @@ std::vector& scatter_out( std::vector scatter( const at::Tensor& tensor, at::IntArrayRef devices, - const c10::optional>& chunk_sizes, + const std::optional>& chunk_sizes, int64_t dim, - const c10::optional>>& + const std::optional>>& streams) { TORCH_CHECK(!devices.empty(), "Expected at least one device to scatter to"); if (chunk_sizes.has_value()) { @@ -446,7 +446,7 @@ at::Tensor& gather_out( at::Tensor gather( at::TensorList tensors, int64_t dim, - c10::optional destination_index) { + std::optional destination_index) { TORCH_CHECK(!tensors.empty(), "Expected at least one tensor to gather from"); int64_t total_size = 0; auto& first = tensors.front(); diff --git a/torch/csrc/cuda/comm.h b/torch/csrc/cuda/comm.h index cf89b365d0ce4..4bc0f60195a26 100644 --- a/torch/csrc/cuda/comm.h +++ b/torch/csrc/cuda/comm.h @@ -28,15 +28,15 @@ TORCH_CUDA_CU_API std::vector& scatter_out( const at::Tensor& tensor, std::vector& out_tensors, int64_t dim = 0, - const c10::optional>>& + const std::optional>>& streams = c10::nullopt); TORCH_CUDA_CU_API std::vector scatter( const at::Tensor& tensor, at::IntArrayRef devices, - const c10::optional>& chunk_sizes = c10::nullopt, + const std::optional>& chunk_sizes = c10::nullopt, int64_t dim = 0, - const c10::optional>>& + const std::optional>>& streams = c10::nullopt); TORCH_CUDA_CU_API at::Tensor& gather_out( @@ -47,6 +47,6 @@ TORCH_CUDA_CU_API at::Tensor& gather_out( 
TORCH_CUDA_CU_API at::Tensor gather( at::TensorList tensors, int64_t dim, - c10::optional destination_index); + std::optional destination_index); } // namespace torch::cuda diff --git a/torch/csrc/cuda/memory_snapshot.cpp b/torch/csrc/cuda/memory_snapshot.cpp index 49fefd97e2da1..82696abaee227 100644 --- a/torch/csrc/cuda/memory_snapshot.cpp +++ b/torch/csrc/cuda/memory_snapshot.cpp @@ -130,8 +130,8 @@ static void checkOptionIn( } void _record_memory_history( - c10::optional enabled, - c10::optional context, + std::optional enabled, + std::optional context, const std::string& stacks, size_t max_entries) { if (enabled) { diff --git a/torch/csrc/cuda/memory_snapshot.h b/torch/csrc/cuda/memory_snapshot.h index f5f9bdbed1620..eb22767a78f90 100644 --- a/torch/csrc/cuda/memory_snapshot.h +++ b/torch/csrc/cuda/memory_snapshot.h @@ -17,8 +17,8 @@ TORCH_CUDA_CU_API void _record_memory_history( bool record_cpp_context = false); TORCH_CUDA_CU_API void _record_memory_history( - c10::optional enabled = "all", - c10::optional context = "all", + std::optional enabled = "all", + std::optional context = "all", const std::string& stacks = "all", size_t max_entries = SIZE_MAX); diff --git a/torch/csrc/cuda/nccl.h b/torch/csrc/cuda/nccl.h index ebf51b7633abb..b118bd4600a56 100644 --- a/torch/csrc/cuda/nccl.h +++ b/torch/csrc/cuda/nccl.h @@ -111,7 +111,7 @@ TORCH_CUDA_CPP_API void check_inputs( } // namespace detail using comm_list = std::vector; -using stream_list = std::vector>; +using stream_list = std::vector>; TORCH_CUDA_CPP_API std::uint64_t version(); TORCH_CUDA_CPP_API const char* version_suffix(); diff --git a/torch/csrc/cuda/python_comm.cpp b/torch/csrc/cuda/python_comm.cpp index e65bb15103aab..ec9da9ac2d679 100644 --- a/torch/csrc/cuda/python_comm.cpp +++ b/torch/csrc/cuda/python_comm.cpp @@ -46,10 +46,10 @@ void initCommMethods(PyObject* module) { "_scatter", [](at::Tensor& tensor, std::vector& devices, - c10::optional> chunk_sizes, + std::optional> chunk_sizes, int64_t dim, - c10::optional py_streams) { - c10::optional>> + std::optional py_streams) { + std::optional>> streams; if (py_streams) { py::handle handle = *py_streams; @@ -69,8 +69,8 @@ void initCommMethods(PyObject* module) { [](at::Tensor& tensor, std::vector& out_tensors, int64_t dim, - c10::optional py_streams) { - c10::optional>> + std::optional py_streams) { + std::optional>> streams; if (py_streams) { py::handle handle = *py_streams; @@ -88,7 +88,7 @@ void initCommMethods(PyObject* module) { "_gather", [](std::vector& tensors, int64_t dim, - c10::optional destination_index) { + std::optional destination_index) { return gather(tensors, dim, destination_index); }, py::arg("tensors"), diff --git a/torch/csrc/cuda/python_nccl.cpp b/torch/csrc/cuda/python_nccl.cpp index db6f6c680701d..5060f9289a9e1 100644 --- a/torch/csrc/cuda/python_nccl.cpp +++ b/torch/csrc/cuda/python_nccl.cpp @@ -56,11 +56,11 @@ static void destroy_nccl_comm(PyObject* capsule) { END_HANDLE_TH_ERRORS_RET() } -static std::vector> unpack_streams( +static std::vector> unpack_streams( PyObject* obj, size_t size) { if (obj == Py_None) { - return std::vector>(size, c10::nullopt); + return std::vector>(size, c10::nullopt); } auto streams = THPUtils_PySequence_to_CUDAStreamList(obj); if (streams.size() != size) { @@ -147,7 +147,7 @@ PyObject* THCPModule_nccl_reduce(PyObject* self, PyObject* args) { std::vector inputs = extract_tensors(_inputs); auto output = extract_tensor(_output); - std::vector> streams = + std::vector> streams = unpack_streams(_streams, inputs.size()); 
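// --- editor's sketch (illustration only, not part of the patch) -------------
// The nccl.h / python_nccl.cpp hunks above keep the same convention after the
// rename: a stream list is a vector of optional streams, and an empty entry
// means "use the current stream of the device the input lives on". Standalone
// sketch with StreamSketch standing in for at::cuda::CUDAStream.
#include <cstddef>
#include <optional>
#include <vector>

struct StreamSketch { int device_index = 0; };

using stream_list_sketch = std::vector<std::optional<StreamSketch>>;

// Mirrors unpack_streams: a missing Python argument expands to "all defaults".
stream_list_sketch unpack_streams_sketch(
    const std::vector<StreamSketch>* maybe_streams,
    size_t size) {
  if (maybe_streams == nullptr) {
    return stream_list_sketch(size, std::nullopt);  // current stream per device
  }
  stream_list_sketch out;
  out.reserve(maybe_streams->size());
  for (const auto& s : *maybe_streams) {
    out.emplace_back(s);
  }
  return out;
}
// -----------------------------------------------------------------------------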
auto user_comms = unpack_comms(_comms, inputs.size()); diff --git a/torch/csrc/cuda/utils.cpp b/torch/csrc/cuda/utils.cpp index e62e176473f2a..e2ad6622e6ffb 100644 --- a/torch/csrc/cuda/utils.cpp +++ b/torch/csrc/cuda/utils.cpp @@ -6,7 +6,7 @@ #ifdef USE_CUDA // NB: It's a list of *optional* CUDAStream; when nullopt, that means to use // whatever the current stream of the device the input is associated with was. -std::vector> +std::vector> THPUtils_PySequence_to_CUDAStreamList(PyObject* obj) { if (!PySequence_Check(obj)) { throw std::runtime_error( @@ -18,7 +18,7 @@ THPUtils_PySequence_to_CUDAStreamList(PyObject* obj) { "expected PySequence, but got " + std::string(THPUtils_typename(obj))); } - std::vector> streams; + std::vector> streams; Py_ssize_t length = PySequence_Fast_GET_SIZE(seq.get()); for (Py_ssize_t i = 0; i < length; i++) { PyObject* stream = PySequence_Fast_GET_ITEM(seq.get(), i); diff --git a/torch/csrc/distributed/c10d/Backend.hpp b/torch/csrc/distributed/c10d/Backend.hpp index 05a39ddc905aa..501cf59d86bad 100644 --- a/torch/csrc/distributed/c10d/Backend.hpp +++ b/torch/csrc/distributed/c10d/Backend.hpp @@ -375,7 +375,7 @@ class TORCH_API Backend : public torch::CustomClassHolder { } // See similar functions in ProcessGroup.hpp for context. - c10::optional getBoundDeviceId() const { + std::optional getBoundDeviceId() const { return bound_device_id_; } @@ -386,7 +386,7 @@ class TORCH_API Backend : public torch::CustomClassHolder { // backends may perform } - void setBoundDeviceId(c10::optional device) { + void setBoundDeviceId(std::optional device) { if (device) { TORCH_CHECK(device->has_index(), "setBoundDeviceId must have an index"); } @@ -410,7 +410,7 @@ class TORCH_API Backend : public torch::CustomClassHolder { std::function)> onCompletionHook_; - c10::optional bound_device_id_; + std::optional bound_device_id_; }; } // namespace c10d diff --git a/torch/csrc/distributed/c10d/NCCLUtils.cpp b/torch/csrc/distributed/c10d/NCCLUtils.cpp index 9a0c77a8623c3..e26ab22f1a9f3 100644 --- a/torch/csrc/distributed/c10d/NCCLUtils.cpp +++ b/torch/csrc/distributed/c10d/NCCLUtils.cpp @@ -159,7 +159,7 @@ std::string ncclGetErrorWithVersion(ncclResult_t error) { // thrown in the NCCL codebase. std::string getNcclErrorDetailStr( ncclResult_t error, - c10::optional processGroupFailureReason /* = c10::nullopt */ + std::optional processGroupFailureReason /* = c10::nullopt */ ) { // Prioritize failure reason provided by PG NCCL first, as it can abort // communicators when it encounters collective timeouts, etc. diff --git a/torch/csrc/distributed/c10d/NCCLUtils.hpp b/torch/csrc/distributed/c10d/NCCLUtils.hpp index a4b96a2a40762..5690c0591a7af 100644 --- a/torch/csrc/distributed/c10d/NCCLUtils.hpp +++ b/torch/csrc/distributed/c10d/NCCLUtils.hpp @@ -182,7 +182,7 @@ int nccl_nonblocking_timeout(); // thrown in the NCCL codebase. std::string getNcclErrorDetailStr( ncclResult_t error, - c10::optional processGroupFailureReason = c10::nullopt); + std::optional processGroupFailureReason = c10::nullopt); // Write NCCL debug info to local disk or any storage users define. 
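// --- editor's sketch (illustration only, not part of the patch) -------------
// The Backend.hpp hunk above keeps the "bound device" as an optional member:
// unset until init_process_group supplies a device, and validated to carry an
// explicit index when it is set. DeviceSketch stands in for at::Device and the
// thrown exception stands in for TORCH_CHECK.
#include <optional>
#include <stdexcept>

struct DeviceSketch {
  int index = -1;
  bool has_index() const { return index >= 0; }
};

class BackendSketch {
 public:
  std::optional<DeviceSketch> getBoundDeviceId() const {
    return bound_device_id_;
  }
  void setBoundDeviceId(std::optional<DeviceSketch> device) {
    if (device && !device->has_index()) {
      throw std::invalid_argument("setBoundDeviceId must have an index");
    }
    bound_device_id_ = device;
  }

 private:
  std::optional<DeviceSketch> bound_device_id_;  // empty until bound
};
// -----------------------------------------------------------------------------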
// There are some constrains we set for the debug info writer: @@ -339,13 +339,13 @@ class NCCLComm { ncclComm_t getNcclComm(); - c10::optional getNcclCommFailureReason() const { + std::optional getNcclCommFailureReason() const { std::unique_lock lock(mutex_); return commFailureReason_; } void ncclCommAbort( - c10::optional commFailureReason = c10::nullopt) { + std::optional commFailureReason = c10::nullopt) { std::unique_lock lock(mutex_); #ifdef ENABLE_NCCL_ERROR_CHECKING if (aborted_) { @@ -491,7 +491,7 @@ class NCCLComm { int rank_; // Optional reason for communicator failure, provided by ProcessGroupNCCL for // better error messaging. - c10::optional commFailureReason_; + std::optional commFailureReason_; bool initialized_{false}; #ifdef NCCL_HAS_COMM_REGISTER // Stores handlers for tensors registered by NCCL diff --git a/torch/csrc/distributed/c10d/Ops.cpp b/torch/csrc/distributed/c10d/Ops.cpp index cf8b7cd966ef5..ae822ad397504 100644 --- a/torch/csrc/distributed/c10d/Ops.cpp +++ b/torch/csrc/distributed/c10d/Ops.cpp @@ -168,7 +168,7 @@ IMPL_BROADCAST(PrivateUse1) at::TensorList tensors, \ const c10::intrusive_ptr& process_group, \ const c10::intrusive_ptr& reduce_op, \ - const c10::optional& sparse_indices, \ + const std::optional& sparse_indices, \ int64_t timeout) { \ auto tensor_vec = tensors.vec(); \ auto work = process_group->getBackend(c10::DeviceType::DEV) -> allreduce( \ @@ -460,7 +460,7 @@ allreduce_sparse_cuda_( at::TensorList tensors, const c10::intrusive_ptr& process_group, const c10::intrusive_ptr& reduce_op, - const c10::optional& sparse_indices, + const std::optional& sparse_indices, int64_t timeout) { auto tensor_vec = tensors.vec(); auto work = process_group->getBackend(c10::DeviceType::CUDA) diff --git a/torch/csrc/distributed/c10d/ProcessGroup.hpp b/torch/csrc/distributed/c10d/ProcessGroup.hpp index 8c805020e8cf6..acf8c9c354a76 100644 --- a/torch/csrc/distributed/c10d/ProcessGroup.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroup.hpp @@ -162,7 +162,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { at::TensorList, const c10::intrusive_ptr<::c10d::ProcessGroup>&, const c10::intrusive_ptr<::c10d::ReduceOp>&, - const c10::optional& sparse_indices, + const std::optional& sparse_indices, int64_t)>(); return std::get<1>(op.call( @@ -620,7 +620,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { void setBackend( c10::DeviceType deviceType, BackendType backendType, - const c10::optional>& backend) { + const std::optional>& backend) { // TODO: should we add these entries after the backend setting succeeds? deviceTypeToBackendType_[deviceType] = backendType; deviceTypes_.insert(deviceType); @@ -703,11 +703,11 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { // optimizations such as automatic use of ncclCommSplit. The device // is specified in `init_process_group` and eventually makes it // here and then down into the actual backend instances. 
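// --- editor's sketch (illustration only, not part of the patch) -------------
// The NCCLUtils hunks above thread an optional, human-readable failure reason
// through error reporting: when ProcessGroupNCCL aborted a communicator (for
// example on a collective timeout), that reason takes priority over the raw
// NCCL error string. Standalone sketch; the error-code-to-string mapping below
// is a placeholder, not the real ncclGetErrorWithVersion.
#include <optional>
#include <string>

std::string nccl_error_detail_sketch(
    int error_code,
    const std::optional<std::string>& process_group_failure_reason) {
  if (process_group_failure_reason.has_value()) {
    return *process_group_failure_reason;  // provided reason wins
  }
  return "NCCL error code " + std::to_string(error_code);
}
// -----------------------------------------------------------------------------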
- c10::optional getBoundDeviceId() const { + std::optional getBoundDeviceId() const { return bound_device_id_; } - void setBoundDeviceId(c10::optional device) { + void setBoundDeviceId(std::optional device) { if (device) { TORCH_CHECK(device->has_index(), "setBoundDeviceId must have an index"); } @@ -742,7 +742,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { std::unordered_map> backendTypeToBackend_; - c10::optional bound_device_id_; + std::optional bound_device_id_; }; } // namespace c10d diff --git a/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp b/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp index ada56cbee1990..cba0249829e68 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp @@ -479,7 +479,7 @@ void returnFutureWithOutput( inline void ProcessGroupGloo::AsyncWork::recordAsyncWorkProfilingInfo( const char* profilingTitle, - const c10::optional>& inputTensors) { + const std::optional>& inputTensors) { auto recordingFunction = std::make_shared(at::RecordScope::USER_SCOPE); if (recordingFunction->isActive()) { @@ -511,7 +511,7 @@ ProcessGroupGloo::AsyncWork::AsyncWork( OpType opType, uint64_t seq, const char* profilingTitle, - const c10::optional>& inputTensors) + const std::optional>& inputTensors) // Profiler: Pass nullptr as profilingTitle to parent constructor to // replace default profiler implementation with async version that reports // correct timestamps for work that is asynchronously executed. @@ -547,7 +547,7 @@ ProcessGroupGloo::SendWork::SendWork( -1, OpType::SEND, "gloo:send", - c10::optional>({tensor})), + std::optional>({tensor})), tensor_(tensor), buffer_(std::move(buffer)), seq_(seq) {} @@ -588,7 +588,7 @@ ProcessGroupGloo::RecvWork::RecvWork( -1, opType, profilingTitle, - c10::optional>({tensor})), + std::optional>({tensor})), tensor_(tensor), buffer_(std::move(buffer)), srcRank_(-1), @@ -2424,7 +2424,7 @@ class AsyncScatterWork : public ProcessGroupGloo::AsyncWork { OpType::SCATTER, seq, "gloo:scatter", - !inputs.empty() ? c10::optional>(inputs[0]) + !inputs.empty() ? 
std::optional>(inputs[0]) : c10::nullopt), context(context), outputs(outputs), @@ -2620,7 +2620,7 @@ class AsyncAlltoallWork : public ProcessGroupGloo::AsyncWork { OpType::ALLTOALL, seq, "gloo:all_to_all", - c10::optional>({inputTensor})), + std::optional>({inputTensor})), context(context), outputTensor(outputTensor), inputTensor(inputTensor), diff --git a/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp b/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp index d40b205c25601..87c87b8f1ae9b 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp @@ -73,7 +73,7 @@ class TORCH_API ProcessGroupGloo : public Backend { OpType opType, uint64_t seq, const char* profilingTitle = nullptr, - const c10::optional>& inputTensors = + const std::optional>& inputTensors = c10::nullopt); ~AsyncWork() override = default; @@ -95,7 +95,7 @@ class TORCH_API ProcessGroupGloo : public Backend { void finishWorkGlooError(const std::exception_ptr& eptr); inline void recordAsyncWorkProfilingInfo( const char* profilingTitle, - const c10::optional>& inputTensors); + const std::optional>& inputTensors); const std::vector> outputTensors_; c10::intrusive_ptr future_; diff --git a/torch/csrc/distributed/c10d/ProcessGroupMPI.cpp b/torch/csrc/distributed/c10d/ProcessGroupMPI.cpp index 29d05a9693b14..6d02f89f6005b 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupMPI.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupMPI.cpp @@ -121,7 +121,7 @@ ProcessGroupMPI::AsyncWork::AsyncWork( MPI_Request request, std::vector outputTensors, const char* profilingTitle, - const c10::optional>& inputTensors) + const std::optional>& inputTensors) : Work(-1, OpType::UNKNOWN, profilingTitle, inputTensors), outputTensors_(std::move(outputTensors)), request_(request) { @@ -379,7 +379,7 @@ void ProcessGroupMPI::runLoop() { c10::intrusive_ptr ProcessGroupMPI::enqueue( std::unique_ptr entry, const char* profilingTitle, - const c10::optional>& inputTensors) { + const std::optional>& inputTensors) { auto work = c10::make_intrusive(entry->dst, profilingTitle, inputTensors); std::unique_lock lock(pgMutex_); @@ -410,7 +410,7 @@ c10::intrusive_ptr ProcessGroupMPI::broadcast( return enqueue( std::move(entry), "mpi:broadcast", - c10::optional>(tensors)); + std::optional>(tensors)); } c10::intrusive_ptr ProcessGroupMPI::allreduce( @@ -436,7 +436,7 @@ c10::intrusive_ptr ProcessGroupMPI::allreduce( return enqueue( std::move(entry), "mpi:all_reduce", - c10::optional>(tensors)); + std::optional>(tensors)); } c10::intrusive_ptr ProcessGroupMPI::allreduce_coalesced( @@ -473,7 +473,7 @@ c10::intrusive_ptr ProcessGroupMPI::reduce( return enqueue( std::move(entry), "mpi:reduce", - c10::optional>(tensors)); + std::optional>(tensors)); } c10::intrusive_ptr ProcessGroupMPI::allgather( @@ -522,7 +522,7 @@ c10::intrusive_ptr ProcessGroupMPI::allgather( return enqueue( std::move(entry), "mpi:all_gather", - c10::optional>(inputTensors)); + std::optional>(inputTensors)); } c10::intrusive_ptr ProcessGroupMPI::allgather_coalesced( @@ -598,14 +598,14 @@ c10::intrusive_ptr ProcessGroupMPI::gather( return enqueue( std::move(entry), "mpi:gather", - c10::optional>(inputTensors)); + std::optional>(inputTensors)); } else { auto entry = std::make_unique(&inputTensors, nullptr, std::move(runFunc)); return enqueue( std::move(entry), "mpi:gather", - c10::optional>(inputTensors)); + std::optional>(inputTensors)); } } @@ -672,7 +672,7 @@ c10::intrusive_ptr ProcessGroupMPI::scatter( std::move(entry), "mpi:scatter", 
!inputTensors.empty() - ? c10::optional>(inputTensors[0]) + ? std::optional>(inputTensors[0]) : c10::nullopt); } else { auto entry = std::make_unique( @@ -681,7 +681,7 @@ c10::intrusive_ptr ProcessGroupMPI::scatter( std::move(entry), "mpi:scatter", !inputTensors.empty() - ? c10::optional>(inputTensors[0]) + ? std::optional>(inputTensors[0]) : c10::nullopt); } } @@ -734,7 +734,7 @@ c10::intrusive_ptr ProcessGroupMPI::alltoall_base( return enqueue( std::move(entry), "mpi:all_to_all", - c10::optional>(inputTensors)); + std::optional>(inputTensors)); } else { // Need alltoallv c10d::checkSplitSizes(inputSplitSizes, inputTensor, size_); @@ -772,7 +772,7 @@ c10::intrusive_ptr ProcessGroupMPI::alltoall_base( return enqueue( std::move(entry), "mpi:all_to_all", - c10::optional>(inputTensors)); + std::optional>(inputTensors)); } } @@ -835,7 +835,7 @@ c10::intrusive_ptr ProcessGroupMPI::alltoall( return enqueue( std::move(entry), "mpi:all_to_all", - c10::optional>(inputTensors)); + std::optional>(inputTensors)); } c10::intrusive_ptr ProcessGroupMPI::send( @@ -864,7 +864,7 @@ c10::intrusive_ptr ProcessGroupMPI::send( request, std::vector(), "mpi:send", - c10::optional>(tensors)); + std::optional>(tensors)); } c10::intrusive_ptr ProcessGroupMPI::recv( @@ -893,7 +893,7 @@ c10::intrusive_ptr ProcessGroupMPI::recv( request, tensors, "mpi:recv", - c10::optional>(tensors)); + std::optional>(tensors)); } c10::intrusive_ptr ProcessGroupMPI::recvAnysource( @@ -921,7 +921,7 @@ c10::intrusive_ptr ProcessGroupMPI::recvAnysource( request, tensors, "mpi:recvAnySource", - c10::optional>(tensors)); + std::optional>(tensors)); } c10::intrusive_ptr ProcessGroupMPI::barrier(const BarrierOptions& opts) { diff --git a/torch/csrc/distributed/c10d/ProcessGroupMPI.hpp b/torch/csrc/distributed/c10d/ProcessGroupMPI.hpp index dd586dda7024b..6e52e680e5c20 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupMPI.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupMPI.hpp @@ -86,7 +86,7 @@ class TORCH_API ProcessGroupMPI : public Backend { explicit WorkMPI( std::vector outputTensors, const char* profilingTitle = nullptr, - const c10::optional>& inputTensors = + const std::optional>& inputTensors = c10::nullopt) : Work(-1, OpType::UNKNOWN, profilingTitle, inputTensors), outputTensors_(std::move(outputTensors)), @@ -114,7 +114,7 @@ class TORCH_API ProcessGroupMPI : public Backend { MPI_Request request, std::vector outputTensors, const char* profilingTitle = nullptr, - const c10::optional>& inputTensors = + const std::optional>& inputTensors = c10::nullopt); ~AsyncWork() override; @@ -243,7 +243,7 @@ class TORCH_API ProcessGroupMPI : public Backend { c10::intrusive_ptr enqueue( std::unique_ptr entry, const char* profilingTitle = nullptr, - const c10::optional>& inputTensors = + const std::optional>& inputTensors = c10::nullopt); bool stop_; diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp index 6cca50daff6c4..7437a4ef1846a 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp @@ -352,9 +352,9 @@ std::string dump_nccl_trace() { } #endif -c10::optional)>>& +std::optional)>>& get_cpp_trace_dumper() { - static c10::optional< + static std::optional< std::function)>> dumper(c10::nullopt); return dumper; @@ -431,7 +431,7 @@ ProcessGroupNCCL::WorkNCCL::WorkNCCL( OpType opType, uint64_t seq, const char* profilingTitle, - const c10::optional>& inputs, + const std::optional>& inputs, bool desyncDebug, bool enableTiming, 
DebugLevel distDebugLevel) @@ -546,7 +546,7 @@ bool ProcessGroupNCCL::WorkNCCL::finishedGPUExecutionInternal() const { } bool ProcessGroupNCCL::WorkNCCL::checkTimeout( - c10::optional timeout) { + std::optional timeout) { auto currentTimepoint = std::chrono::steady_clock::now(); auto timeElapsed = std::chrono::duration_cast( currentTimepoint - workStartTime_); @@ -1036,7 +1036,7 @@ void ProcessGroupNCCL::waitForFutureOrTimeout( void ProcessGroupNCCL::abortCommsFromMap( std::unordered_map>& ncclCommsMap, - c10::optional abortReason) { + std::optional abortReason) { // The process may control multiple devices, loop through the communicators on // each device for (auto& it : ncclCommsMap) { @@ -1069,7 +1069,7 @@ void ProcessGroupNCCL::abortCommsFromMap( } // Abort all communicators on this rank -bool ProcessGroupNCCL::abort(c10::optional abortReason) { +bool ProcessGroupNCCL::abort(std::optional abortReason) { // Remove record from global ncclCommDevIdxMapMutex before aboarting, // so that a new cache segment would not register to already aborded // communicators. Note that ncclCommDevIdxMap is a global container which may @@ -1088,7 +1088,7 @@ bool ProcessGroupNCCL::abort(c10::optional abortReason) { return true; } -void ProcessGroupNCCL::shutdown(c10::optional reason) { +void ProcessGroupNCCL::shutdown(std::optional reason) { // Don't join threads here since the purpose of this method is to abort all // communicators and signal the threads to exit. Joining on the threads could // potentially block and hence avoid it in this method. @@ -1188,7 +1188,7 @@ void ProcessGroupNCCL::heartbeatMonitor() { : heartbeatTimeoutInSec_ * 1000; auto lastTimePollStore = std::chrono::steady_clock::now(); auto lastTimeHeartBeatCheck = std::chrono::steady_clock::now(); - c10::optional dumpPipe = c10::nullopt; + std::optional dumpPipe = c10::nullopt; if (uid_ == 0) { // DumpPipe is one per-trainer process, and its convenient to name them // after 'global' ranks in the system, So we assume processgroup (uid)==0 is @@ -2241,7 +2241,7 @@ c10::intrusive_ptr ProcessGroupNCCL::initWork( opType, seq_, profilingTitle, - profilingTitle != nullptr ? c10::optional>(inputs) + profilingTitle != nullptr ? std::optional>(inputs) : c10::nullopt, desyncDebug_, enableTiming_.load(), diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp index fac9b6f38204e..4217d2fa4cea5 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp @@ -248,7 +248,7 @@ class TORCH_API ProcessGroupNCCL : public Backend { OpType opType, uint64_t seq, const char* profilingTitle = nullptr, - const c10::optional>& inputs = c10::nullopt, + const std::optional>& inputs = c10::nullopt, bool desyncDebug = false, bool enableTiming = false, DebugLevel distDebugLevel = DebugLevel::Off); @@ -305,7 +305,7 @@ class TORCH_API ProcessGroupNCCL : public Backend { // and False otherwise. // In case of timeout, set exception on the WorkNCCL object. 
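// --- editor's sketch (illustration only, not part of the patch) -------------
// The ProcessGroupNCCL hunks above keep an optional-override pattern for
// timeouts: checkTimeout() can be handed an explicit deadline, otherwise the
// work falls back to its own configured timeout. Standalone sketch of that
// value_or-style fallback; the real method also records an exception on the
// work object, which is omitted here.
#include <chrono>
#include <optional>

struct WorkSketch {
  std::chrono::milliseconds op_timeout{60000};
  std::chrono::steady_clock::time_point start =
      std::chrono::steady_clock::now();

  bool checkTimeout(
      std::optional<std::chrono::milliseconds> timeout = std::nullopt) const {
    auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(
        std::chrono::steady_clock::now() - start);
    auto limit = timeout.value_or(op_timeout);  // explicit override or default
    return elapsed >= limit;
  }
};
// -----------------------------------------------------------------------------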
bool checkTimeout( - c10::optional timeout = c10::nullopt); + std::optional timeout = c10::nullopt); std::vector result() override; @@ -399,7 +399,7 @@ class TORCH_API ProcessGroupNCCL : public Backend { bool timingEnabled_; // unique id used to tell the trace buffer that this // work has completed - c10::optional trace_id_; + std::optional trace_id_; DebugLevel distDebugLevel_; friend class ProcessGroupNCCL; }; @@ -621,16 +621,16 @@ class TORCH_API ProcessGroupNCCL : public Backend { // Helper function for iteratively aborting communicators in the provided map void abortCommsFromMap( std::unordered_map>& ncclCommsMap, - c10::optional abortReason); + std::optional abortReason); c10::intrusive_ptr initIntraNodeComm(); // Provides an API to abort the ProcessGroup (similar to ncclCommAbort) // instead of relying on ProcessGroupNCCL destructor. // return true if abort is successful, otherwise false - bool abort(c10::optional abortReason = c10::nullopt); + bool abort(std::optional abortReason = c10::nullopt); - void shutdown(c10::optional reason = c10::nullopt); + void shutdown(std::optional reason = c10::nullopt); void eagerConnectSingleDevice(at::Device device) override; @@ -1092,7 +1092,7 @@ TORCH_API std::string dump_nccl_trace(); // Gets a mutable reference to a global optional function. Heartbeat Monitor // will use this function to dump traces, if available. Inside fbcode, we store // a function here that uses an internal tool for process tracing -TORCH_API c10::optional< +TORCH_API std::optional< std::function)>>& get_cpp_trace_dumper(); diff --git a/torch/csrc/distributed/c10d/ProcessGroupUCC.hpp b/torch/csrc/distributed/c10d/ProcessGroupUCC.hpp index 22fc58134566c..ab1e1e4c4899e 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupUCC.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupUCC.hpp @@ -119,7 +119,7 @@ class TORCH_API ProcessGroupUCC : public Backend { OpType opType, uint64_t seq, const char* prof_title, - const c10::optional>& inputs, + const std::optional>& inputs, const c10::intrusive_ptr& logger) : Work(-1, opType, prof_title, inputs), logger_(logger), seq_(seq) {} ~WorkUCC(); diff --git a/torch/csrc/distributed/c10d/Store.hpp b/torch/csrc/distributed/c10d/Store.hpp index 525440e767b47..af715ba98a794 100644 --- a/torch/csrc/distributed/c10d/Store.hpp +++ b/torch/csrc/distributed/c10d/Store.hpp @@ -13,7 +13,7 @@ namespace c10d { // callback function will be given arguments (optional oldValue, // optional newValue) using WatchKeyCallback = - std::function, c10::optional)>; + std::function, c10::optional)>; class TORCH_API Store : public torch::CustomClassHolder { public: diff --git a/torch/csrc/distributed/c10d/TCPStore.cpp b/torch/csrc/distributed/c10d/TCPStore.cpp index a95f0ebdb1e26..aee1d7677dc4d 100644 --- a/torch/csrc/distributed/c10d/TCPStore.cpp +++ b/torch/csrc/distributed/c10d/TCPStore.cpp @@ -268,7 +268,7 @@ using detail::Socket; TCPStore::TCPStore( const std::string& masterAddr, std::uint16_t masterPort, - c10::optional numWorkers, + std::optional numWorkers, bool isServer, const std::chrono::milliseconds& timeout, bool waitWorkers) @@ -277,7 +277,7 @@ TCPStore::TCPStore( TCPStoreOptions{ masterPort, isServer, - numWorkers ? c10::optional(*numWorkers) + numWorkers ? 
std::optional(*numWorkers) : c10::nullopt, waitWorkers, timeout}} {} diff --git a/torch/csrc/distributed/c10d/TCPStore.hpp b/torch/csrc/distributed/c10d/TCPStore.hpp index 03a7f124ca710..7080d50136e96 100644 --- a/torch/csrc/distributed/c10d/TCPStore.hpp +++ b/torch/csrc/distributed/c10d/TCPStore.hpp @@ -49,7 +49,7 @@ struct TCPStoreOptions { std::uint16_t port = kDefaultPort; bool isServer = false; - c10::optional numWorkers = c10::nullopt; + std::optional numWorkers = c10::nullopt; bool waitWorkers = true; std::chrono::milliseconds timeout = Store::kDefaultTimeout; @@ -60,7 +60,7 @@ struct TCPStoreOptions { // If specified, and if isServer is true, the underlying TCPServer will take // over the bound socket associated to this fd. This option is useful to avoid // port assignment races in certain scenarios. - c10::optional masterListenFd = c10::nullopt; + std::optional masterListenFd = c10::nullopt; // A boolean value indicating whether to use the experimental libUV backend. bool useLibUV = false; @@ -73,7 +73,7 @@ class TORCH_API TCPStore : public Store { [[deprecated("Use TCPStore(host, opts) instead.")]] explicit TCPStore( const std::string& masterAddr, std::uint16_t masterPort, - c10::optional numWorkers = c10::nullopt, + std::optional numWorkers = c10::nullopt, bool isServer = false, const std::chrono::milliseconds& timeout = kDefaultTimeout, bool waitWorkers = true); @@ -152,7 +152,7 @@ class TORCH_API TCPStore : public Store { detail::SocketAddress addr_; std::shared_ptr server_; std::unique_ptr client_; - c10::optional numWorkers_; + std::optional numWorkers_; const std::string initKey_ = "init/"; const std::string keyPrefix_ = "/"; diff --git a/torch/csrc/distributed/c10d/TraceUtils.h b/torch/csrc/distributed/c10d/TraceUtils.h index 32f0e1f41df01..181f2208160b7 100644 --- a/torch/csrc/distributed/c10d/TraceUtils.h +++ b/torch/csrc/distributed/c10d/TraceUtils.h @@ -15,6 +15,34 @@ #include namespace c10d { +static c10::IValue entries_key = "entries"; +static c10::IValue nccl_comm_key = "nccl_comm_state"; +static c10::IValue version_key = "version"; +// Update whenever changing contents or formatting of the dump +// (minor when adding fields, major when changing existing fields) +static c10::IValue version_val = "1.5"; +static c10::IValue pg_config_key = "pg_config"; +static c10::IValue record_id_key = "record_id"; +static c10::IValue pg_id_key = "pg_id"; +static c10::IValue pg_name_key = "process_group"; +static c10::IValue seq_id_key = "seq_id"; +static c10::IValue op_id_key = "op_id"; +static c10::IValue profiling_name_key = "profiling_name"; +static c10::IValue input_sizes_key = "input_sizes"; +static c10::IValue output_sizes_key = "output_sizes"; +static c10::IValue time_created_key = "time_created_ns"; +static c10::IValue duration_key = "duration_ms"; + +static c10::IValue frames_key = "frames"; +static c10::IValue state_key = "state"; +static c10::IValue line_key = "line"; +static c10::IValue name_key = "name"; +static c10::IValue filename_key = "filename"; +static c10::IValue retired_key = "retired"; +static c10::IValue time_discovered_started_key = "time_discovered_started_ns"; +static c10::IValue time_discovered_completed_key = + "time_discovered_completed_ns"; + /* Trace Utils Related to TORCH_NCCL_DESYNC_DEBUG */ inline std::string getTraceStartKey(const std::string& pgName, int rank) { @@ -417,18 +445,18 @@ struct NCCLTraceBuffer { // timestamp when the entry was created, likely close to the time the work // was 'enqueued'- not necessarily started c10::time_t 
time_created_; - c10::optional duration_; + std::optional duration_; // timestamp when our CPU threads discovered that the kernel started. // will always be _after_ it actually started, and can be very late // if the watchdog thread got stuck on CUDA APIs. - c10::optional time_discovered_started_; + std::optional time_discovered_started_; // timestamp when our CPU threads discovered that the kernel completed. // will always be _after_ it actually complated, and can be the same time // as the discovery of the start if the watchdog thread is stuck on CUDA // APIs - c10::optional time_discovered_completed_; + std::optional time_discovered_completed_; // size information for input/output tensors c10::SmallVector input_dims_; @@ -448,7 +476,7 @@ struct NCCLTraceBuffer { std::map, std::vector> pg_name_to_ranks_ = {}; - c10::optional record( + std::optional record( size_t pg_id, const std::tuple& pg_name, size_t seq_id, @@ -551,7 +579,7 @@ struct NCCLTraceBuffer { never hang. (timing must also be enabled for compute_duration - see TORCH_NCCL_ENABLE_TIMING). */ - void retire_id(c10::optional id, bool compute_duration = true) { + void retire_id(std::optional id, bool compute_duration = true) { if (!enabled_ || !id) { return; } @@ -559,7 +587,7 @@ struct NCCLTraceBuffer { bool can_compute_duration = false; Event* startEvent = nullptr; Event* endEvent = nullptr; - c10::optional duration = c10::nullopt; + std::optional duration = c10::nullopt; std::unique_lock guard(mutex_); @@ -601,37 +629,11 @@ struct NCCLTraceBuffer { } std::string dump( - const c10::optional>>& ncclDumpMap) { auto result = dump_entries(); auto entries = new_list(); - c10::IValue entries_key = "entries"; - c10::IValue nccl_comm_key = "nccl_comm_state"; - c10::IValue version_key = "version"; - // Update whenever changing contents or formatting of the dump - // (minor when adding fields, major when changing existing fields) - c10::IValue version_val = "1.5"; - c10::IValue pg_config_key = "pg_config"; - c10::IValue record_id_key = "record_id"; - c10::IValue pg_id_key = "pg_id"; - c10::IValue pg_name_key = "process_group"; - c10::IValue seq_id_key = "seq_id"; - c10::IValue op_id_key = "op_id"; - c10::IValue profiling_name_key = "profiling_name"; - c10::IValue input_sizes_key = "input_sizes"; - c10::IValue output_sizes_key = "output_sizes"; - c10::IValue time_created_key = "time_created_ns"; - c10::IValue duration_key = "duration_ms"; - - c10::IValue frames_key = "frames"; - c10::IValue state_key = "state"; - c10::IValue line_key = "line"; - c10::IValue name_key = "name"; - c10::IValue filename_key = "filename"; - c10::IValue retired_key = "retired"; - c10::IValue time_discovered_started_key = "time_discovered_started_ns"; - c10::IValue time_discovered_completed_key = "time_discovered_completed_ns"; std::vector tracebacks; for (auto& e : result) { diff --git a/torch/csrc/distributed/c10d/Types.hpp b/torch/csrc/distributed/c10d/Types.hpp index fab819798e555..669957a726735 100644 --- a/torch/csrc/distributed/c10d/Types.hpp +++ b/torch/csrc/distributed/c10d/Types.hpp @@ -121,7 +121,7 @@ struct BroadcastOptions { struct AllreduceOptions { ReduceOp reduceOp = ReduceOp::SUM; std::chrono::milliseconds timeout = kUnsetTimeout; - c10::optional sparseIndices = c10::nullopt; + std::optional sparseIndices = c10::nullopt; }; struct AllreduceCoalescedOptions : AllreduceOptions {}; @@ -162,7 +162,7 @@ struct AllToAllOptions { struct BarrierOptions { std::vector device_ids; std::chrono::milliseconds timeout = kUnsetTimeout; - c10::optional device; + 
std::optional device; }; struct DistributedBackendOptions { diff --git a/torch/csrc/distributed/c10d/Work.cpp b/torch/csrc/distributed/c10d/Work.cpp index 66c35b11e6c0f..8beb8f2936208 100644 --- a/torch/csrc/distributed/c10d/Work.cpp +++ b/torch/csrc/distributed/c10d/Work.cpp @@ -9,7 +9,7 @@ Work::Work( int rank, OpType opType, const char* profilingTitle, - const c10::optional>& inputTensors) + const std::optional>& inputTensors) : rank_(rank), opType_(opType) { if (profilingTitle != nullptr) { auto recordingFunction = diff --git a/torch/csrc/distributed/c10d/Work.hpp b/torch/csrc/distributed/c10d/Work.hpp index d106183231706..d29b838321176 100644 --- a/torch/csrc/distributed/c10d/Work.hpp +++ b/torch/csrc/distributed/c10d/Work.hpp @@ -50,7 +50,7 @@ class TORCH_API Work : public torch::CustomClassHolder { int rank = -1, OpType opType = OpType::UNKNOWN, const char* profilingTitle = nullptr, - const c10::optional>& inputTensors = + const std::optional>& inputTensors = c10::nullopt); ~Work() override; diff --git a/torch/csrc/distributed/c10d/comm.hpp b/torch/csrc/distributed/c10d/comm.hpp index d2c608532ba53..6f9203e214348 100644 --- a/torch/csrc/distributed/c10d/comm.hpp +++ b/torch/csrc/distributed/c10d/comm.hpp @@ -26,7 +26,7 @@ class TORCH_API GradBucket { std::vector lengths, std::vector sizes_vec, std::vector parameters, - c10::optional sparse_grad_indices) + std::optional sparse_grad_indices) : index_(index), bucket_count_(bucket_count), buffer_(std::move(tensor)), @@ -72,7 +72,7 @@ class TORCH_API GradBucket { return index_ == bucket_count_ - 1; } - c10::optional& getSparseGradIndices() { + std::optional& getSparseGradIndices() { return sparse_grad_indices_; } @@ -92,7 +92,7 @@ class TORCH_API GradBucket { // Predefined sparse indices for this bucket (only used for sparse tensors). // The gradients will be updated to have indices with these tensor values - c10::optional sparse_grad_indices_; + std::optional sparse_grad_indices_; }; // Base class of both `PythonCommHook` and `CppCommHook`. diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index 7cbd898499c38..483becbce0094 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -1409,14 +1409,14 @@ Example:: .def( py::init([](const std::string& host, uint16_t port, - c10::optional worldSize, + std::optional worldSize, bool isServer, std::chrono::milliseconds timeout, bool waitWorkers, bool multiTenant, - c10::optional masterListenFd, + std::optional masterListenFd, bool useLibUV) { - c10::optional numWorkers = c10::nullopt; + std::optional numWorkers = c10::nullopt; if (worldSize.has_value() && worldSize.value() > -1) { numWorkers = static_cast(worldSize.value()); } @@ -1801,14 +1801,14 @@ that adds a prefix to each key inserted to the store. [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self, const c10::Device& device, const ::c10d::ProcessGroup::BackendType& backendType, - const c10::optional>& + const std::optional>& backend) { self->setBackend(device.type(), backendType, backend); }, py::arg("device"), py::arg("backend_type"), py::arg("backend") = - c10::optional>(), + std::optional>(), py::call_guard()) .def( "_get_backend", @@ -2432,7 +2432,7 @@ options :class:`~torch.distributed.ProcessGroupNCCL.Options`). 
py::init([](const c10::intrusive_ptr<::c10d::Store>& store, size_t rank, size_t world_size, - c10::optional buffer_size) { + std::optional buffer_size) { auto comm = c10::make_intrusive( store, rank, world_size, buffer_size); if (!comm->rendezvous()) { @@ -2744,7 +2744,7 @@ such as `dist.all_reduce(tensor, async_op=True)`. const std::vector& bucket_size_limits, const std::vector& expect_sparse_gradient, const std::vector& tensor_indices, - const c10::optional>& logger) { + const std::optional>& logger) { if (logger.has_value()) { std::weak_ptr<::c10d::Logger> logger_weakref = logger.value(); return ::c10d::compute_bucket_assignment_by_size( @@ -2766,14 +2766,14 @@ such as `dist.all_reduce(tensor, async_op=True)`. py::arg("bucket_size"), py::arg("expect_sparse_gradient") = std::vector(), py::arg("tensor_indices") = std::vector(), - py::arg("logger") = c10::optional>{}, + py::arg("logger") = std::optional>{}, py::call_guard()); module.def( "_verify_params_across_processes", [](const c10::intrusive_ptr<::c10d::ProcessGroup>& process_group, const std::vector& params, - const c10::optional>& logger) { + const std::optional>& logger) { if (logger.has_value()) { std::weak_ptr<::c10d::Logger> logger_weakref = logger.value(); verify_params_across_processes( @@ -2784,7 +2784,7 @@ such as `dist.all_reduce(tensor, async_op=True)`. }, py::arg("process_group"), py::arg("params"), - py::arg("logger") = c10::optional>{}, + py::arg("logger") = std::optional>{}, py::call_guard()); module.def( diff --git a/torch/csrc/distributed/c10d/intra_node_comm.cpp b/torch/csrc/distributed/c10d/intra_node_comm.cpp index d18262ecfa3f5..ceec7bbd0f9ce 100644 --- a/torch/csrc/distributed/c10d/intra_node_comm.cpp +++ b/torch/csrc/distributed/c10d/intra_node_comm.cpp @@ -207,7 +207,7 @@ IntraNodeComm::IntraNodeComm( c10::intrusive_ptr store, size_t rank, size_t worldSize, - c10::optional bufferSize) + std::optional bufferSize) : store_(std::move(store)), rank_(rank), worldSize_(worldSize), diff --git a/torch/csrc/distributed/c10d/intra_node_comm.cu b/torch/csrc/distributed/c10d/intra_node_comm.cu index 6d72bde221253..ce479cd187bc4 100644 --- a/torch/csrc/distributed/c10d/intra_node_comm.cu +++ b/torch/csrc/distributed/c10d/intra_node_comm.cu @@ -732,7 +732,7 @@ static __global__ void barrierKernel( } } -void IntraNodeComm::barrier(c10::optional> ranks) { +void IntraNodeComm::barrier(std::optional> ranks) { if (!ranks.has_value()) { ranks = std::vector(worldSize_); std::iota(ranks->begin(), ranks->end(), 0); diff --git a/torch/csrc/distributed/c10d/intra_node_comm.hpp b/torch/csrc/distributed/c10d/intra_node_comm.hpp index ab27ecef97338..fe591978c5332 100644 --- a/torch/csrc/distributed/c10d/intra_node_comm.hpp +++ b/torch/csrc/distributed/c10d/intra_node_comm.hpp @@ -33,7 +33,7 @@ class TORCH_API IntraNodeComm : public c10::intrusive_ptr_target { c10::intrusive_ptr store, size_t rank, size_t worldSize, - c10::optional bufferSize = c10::nullopt); + std::optional bufferSize = c10::nullopt); ~IntraNodeComm() override; @@ -61,7 +61,7 @@ class TORCH_API IntraNodeComm : public c10::intrusive_ptr_target { /** * Perform a barrier among the specified ranks. 
*/ - void barrier(c10::optional> ranks = c10::nullopt); + void barrier(std::optional> ranks = c10::nullopt); /** * Puts the given tensor into the p2p buffer of the current rank at the diff --git a/torch/csrc/distributed/c10d/logger.cpp b/torch/csrc/distributed/c10d/logger.cpp index 3ce4880930cb2..711039bf48595 100644 --- a/torch/csrc/distributed/c10d/logger.cpp +++ b/torch/csrc/distributed/c10d/logger.cpp @@ -247,7 +247,7 @@ void Logger::calculate_avg_time( Timer::Event start_event, Timer::Event end_event) { TORCH_CHECK(num_iterations_stats_recorded_ > 0); - c10::optional maybe_time_duration = + std::optional maybe_time_duration = timer.measureDifference(start_event, end_event); if (!maybe_time_duration.has_value()) { return; diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp index a885bd2e9e7cb..d600426192cef 100644 --- a/torch/csrc/distributed/c10d/reducer.cpp +++ b/torch/csrc/distributed/c10d/reducer.cpp @@ -51,7 +51,7 @@ class CpuTimer : public Timer { public: explicit CpuTimer(c10::Device /* unused */) {} - c10::optional measureDifference(Event start, Event end) override { + std::optional measureDifference(Event start, Event end) override { int64_t start_time = getTimeRef(start); int64_t end_time = getTimeRef(end); // If cpu_end_time is not recorded in this iteration, @@ -2096,7 +2096,7 @@ compute_bucket_assignment_by_size( const std::vector& bucket_size_limits, const std::vector& expect_sparse_gradient, const std::vector& tensor_indices, - const c10::optional>& logger) { + const std::optional>& logger) { // Either expect_sparse_gradient is not specified or it has as many elements // as the vector with tensors. TORCH_INTERNAL_ASSERT( @@ -2221,7 +2221,7 @@ compute_bucket_assignment_by_size( void verify_params_across_processes( const c10::intrusive_ptr& process_group, const std::vector& params, - const c10::optional>& logger) { + const std::optional>& logger) { // First verify number of parameters to avoid inconsistent inputs into // broadcast which can cause a crash. // See https://github.com/pytorch/pytorch/issues/73547 diff --git a/torch/csrc/distributed/c10d/reducer.hpp b/torch/csrc/distributed/c10d/reducer.hpp index e940a56bd650a..1f72b0eb37b9f 100644 --- a/torch/csrc/distributed/c10d/reducer.hpp +++ b/torch/csrc/distributed/c10d/reducer.hpp @@ -261,10 +261,10 @@ class TORCH_API Reducer { std::weak_ptr logger_; // List of futures installed by Reducer::install_futures that should be // awaited at the end of backwards pass. - c10::optional>> + std::optional>> installed_futures_{c10::nullopt}; // Mixed precision parameter dtype for bucket type checking. - c10::optional mixed_precision_param_dtype_{c10::nullopt}; + std::optional mixed_precision_param_dtype_{c10::nullopt}; // Work handle for allreduce on local_used_map_ c10::intrusive_ptr local_used_work_; @@ -389,7 +389,7 @@ class TORCH_API Reducer { bool expect_sparse_gradient = false; // Sparse indices tensor - c10::optional sparse_tensor_indices = c10::nullopt; + std::optional sparse_tensor_indices = c10::nullopt; // TODO(@pietern) // Memory copies from gradient tensors into the bucket are potentially @@ -576,12 +576,12 @@ compute_bucket_assignment_by_size( const std::vector& bucket_size, const std::vector& expect_sparse_gradient = {}, const std::vector& tensor_indices = {}, - const c10::optional>& logger = {}); + const std::optional>& logger = {}); // Verify models across all processes are the same as model on rank 0 with // respect to no. of params and matching dtype/size/layout. 
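// [Editor's illustrative sketch, not part of the diff.] A minimal standalone
// example (invented names, not the reducer Timer API) of the std::optional
// return pattern these hunks migrate to: measureDifference() yields
// std::nullopt when either endpoint was never recorded, instead of a sentinel.
#include <chrono>
#include <cstdint>
#include <iostream>
#include <optional>

struct SketchTimer {
  static constexpr int64_t kUnset = -1;
  int64_t start_ns = kUnset;
  int64_t end_ns = kUnset;

  static int64_t nowNs() {
    return std::chrono::duration_cast<std::chrono::nanoseconds>(
               std::chrono::steady_clock::now().time_since_epoch())
        .count();
  }

  void recordStart() { start_ns = nowNs(); }
  void recordEnd() { end_ns = nowNs(); }

  // Returns nullopt if either event is missing, otherwise the elapsed time.
  std::optional<int64_t> measureDifference() const {
    if (start_ns == kUnset || end_ns == kUnset) {
      return std::nullopt;
    }
    return end_ns - start_ns;
  }
};

int main() {
  SketchTimer t;
  t.recordStart();
  t.recordEnd();
  if (auto ns = t.measureDifference()) {
    std::cout << "elapsed: " << *ns << " ns\n";
  }
}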
TORCH_API void verify_params_across_processes( const c10::intrusive_ptr& process_group, const std::vector& params, - const c10::optional>& logger); + const std::optional>& logger); } // namespace c10d diff --git a/torch/csrc/distributed/c10d/reducer_cuda.cpp b/torch/csrc/distributed/c10d/reducer_cuda.cpp index b63e9d3d6f3c8..84bff02072b60 100644 --- a/torch/csrc/distributed/c10d/reducer_cuda.cpp +++ b/torch/csrc/distributed/c10d/reducer_cuda.cpp @@ -48,7 +48,7 @@ class CudaTimer : public Timer { getEvent(event).record(); } - c10::optional measureDifference(Event start, Event end) override { + std::optional measureDifference(Event start, Event end) override { c10::DeviceGuard g(device); at::cuda::CUDAEvent& start_event = getEvent(start); at::cuda::CUDAEvent& end_event = getEvent(end); diff --git a/torch/csrc/distributed/c10d/reducer_timer.hpp b/torch/csrc/distributed/c10d/reducer_timer.hpp index 5f57051455f62..f9b9f11c8c963 100644 --- a/torch/csrc/distributed/c10d/reducer_timer.hpp +++ b/torch/csrc/distributed/c10d/reducer_timer.hpp @@ -39,12 +39,12 @@ class TORCH_API Timer { // Return the difference between when two events occurred, in nanoseconds. // Or nullopt if one of them hasn't been recorded. - virtual c10::optional measureDifference(Event start, Event end) = 0; + virtual std::optional measureDifference(Event start, Event end) = 0; virtual ~Timer() = default; // Return host-side timestamp, or nullopt if it has not yet been recorded. - c10::optional getTimestamp(Event event) { + std::optional getTimestamp(Event event) { auto time = getTimeRef(event); if (time == kUnsetTime) { return c10::nullopt; diff --git a/torch/csrc/distributed/c10d/sequence_num.hpp b/torch/csrc/distributed/c10d/sequence_num.hpp index 8c80642f42784..ce31f4b552728 100644 --- a/torch/csrc/distributed/c10d/sequence_num.hpp +++ b/torch/csrc/distributed/c10d/sequence_num.hpp @@ -59,7 +59,7 @@ class TORCH_API SequenceNum { SequenceNum(const SequenceNum& other); private: - c10::optional num_; + std::optional num_; mutable std::mutex lock_; }; diff --git a/torch/csrc/distributed/rpc/profiler/remote_profiler_manager.cpp b/torch/csrc/distributed/rpc/profiler/remote_profiler_manager.cpp index 255a16af6bb0d..3a37e7b02a5f0 100644 --- a/torch/csrc/distributed/rpc/profiler/remote_profiler_manager.cpp +++ b/torch/csrc/distributed/rpc/profiler/remote_profiler_manager.cpp @@ -9,7 +9,7 @@ namespace distributed { namespace rpc { const std::string REMOTE_PROFILING_KEY_PREFIX = "#remote_op: "; constexpr int kAutoIncrementBits = 48; -/*static */ thread_local c10::optional +/*static */ thread_local std::optional RemoteProfilerManager::currentThreadLocalKey_ = c10::nullopt; /*static */ RemoteProfilerManager& RemoteProfilerManager::getInstance() { static RemoteProfilerManager* handler = new RemoteProfilerManager(); diff --git a/torch/csrc/distributed/rpc/profiler/remote_profiler_manager.h b/torch/csrc/distributed/rpc/profiler/remote_profiler_manager.h index d85ee5a393078..c6f8b353806b5 100644 --- a/torch/csrc/distributed/rpc/profiler/remote_profiler_manager.h +++ b/torch/csrc/distributed/rpc/profiler/remote_profiler_manager.h @@ -50,7 +50,7 @@ class TORCH_API RemoteProfilerManager { local_id_t getNextLocalId(); std::unordered_map profiledRpcKeys_; - static thread_local c10::optional currentThreadLocalKey_; + static thread_local std::optional currentThreadLocalKey_; std::mutex mutex_; local_id_t currentLocalId_; }; diff --git a/torch/csrc/distributed/rpc/py_rref.h b/torch/csrc/distributed/rpc/py_rref.h index 432141a97cf5c..2c9fd3433d045 
100644 --- a/torch/csrc/distributed/rpc/py_rref.h +++ b/torch/csrc/distributed/rpc/py_rref.h @@ -75,8 +75,8 @@ class PYBIND11_EXPORT PyRRef { private: c10::intrusive_ptr rref_; - c10::optional> profilingFuture_; - c10::optional type_; + std::optional> profilingFuture_; + std::optional type_; }; } // namespace rpc diff --git a/torch/csrc/distributed/rpc/rpc_agent.h b/torch/csrc/distributed/rpc/rpc_agent.h index 0b04c08287087..8f9222a2e8647 100644 --- a/torch/csrc/distributed/rpc/rpc_agent.h +++ b/torch/csrc/distributed/rpc/rpc_agent.h @@ -170,7 +170,7 @@ class TORCH_API RpcAgent { RpcRetryOptions retryOptions = RpcRetryOptions()); // Return a reference to the ``WorkerInfo`` of this RpcAgent. - // NB: not using ``c10::optional`` here because we might + // NB: not using ``std::optional`` here because we might // need to create a separate RPC API lib and avoid forcing all ``RpcAgent`` // implementations to depend on libtorch. const WorkerInfo& getWorkerInfo() const; diff --git a/torch/csrc/distributed/rpc/rref_impl.cpp b/torch/csrc/distributed/rpc/rref_impl.cpp index a770379438901..98d8f1afcb86b 100644 --- a/torch/csrc/distributed/rpc/rref_impl.cpp +++ b/torch/csrc/distributed/rpc/rref_impl.cpp @@ -248,7 +248,7 @@ OwnerRRef::OwnerRRef( worker_id_t ownerId, const RRefId& rrefId, TypePtr type, - c10::optional value, + std::optional value, std::vector devices) : RRef(ownerId, rrefId, type) { future_ = c10::make_intrusive(type_, std::move(devices)); diff --git a/torch/csrc/distributed/rpc/rref_impl.h b/torch/csrc/distributed/rpc/rref_impl.h index ccb00b45e1d5e..d6da3f2ea455f 100644 --- a/torch/csrc/distributed/rpc/rref_impl.h +++ b/torch/csrc/distributed/rpc/rref_impl.h @@ -366,7 +366,7 @@ class TORCH_API OwnerRRef final : public RRef { worker_id_t ownerId, const RRefId& rrefId, TypePtr type, - c10::optional value, + std::optional value, std::vector devices); inline bool isOwner() const override { diff --git a/torch/csrc/distributed/rpc/script_call.h b/torch/csrc/distributed/rpc/script_call.h index 2fc0efb8cdc71..dacded5cc1e62 100644 --- a/torch/csrc/distributed/rpc/script_call.h +++ b/torch/csrc/distributed/rpc/script_call.h @@ -58,10 +58,10 @@ class TORCH_API ScriptCall : public RpcCommandBase { // This field has value if this ScriptCall represents invocation of a builtin // operator. - c10::optional> op_; + std::optional> op_; // This field has non empty string if this ScriptCall represents invocation of // an annotated torchscript function defined by users. 
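// [Editor's illustrative sketch, not part of the diff.] A standalone example
// (hypothetical names, not the ScriptCall API) of the pattern used by the two
// optional members above: exactly one of the optionals is engaged, depending
// on whether the call targets a builtin operator or a user-defined function.
#include <cassert>
#include <iostream>
#include <optional>
#include <string>

struct CallSketch {
  std::optional<std::string> builtin_op_name;  // set for builtin operators
  std::optional<std::string> qualified_name;   // set for user-defined functions

  std::string describe() const {
    // Exactly one of the two fields should hold a value.
    assert(builtin_op_name.has_value() != qualified_name.has_value());
    return builtin_op_name ? "builtin: " + *builtin_op_name
                           : "function: " + *qualified_name;
  }
};

int main() {
  CallSketch c{std::nullopt, std::string("my_module.forward")};
  std::cout << c.describe() << "\n";
}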
- c10::optional qualifiedName_; + std::optional qualifiedName_; std::vector stack_; const bool isAsyncExecution_; }; diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp index 0f0cf00201612..8af4336c07467 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp +++ b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp @@ -111,7 +111,7 @@ std::vector getCurrentStreamsForDevices( std::vector getDevicesOfTensors( const std::vector& tensors) { - c10::optional impl; + std::optional impl; size_t deviceCount = 0; std::vector indexBitset; for (const torch::Tensor& tensor : tensors) { diff --git a/torch/csrc/distributed/rpc/tensorpipe_cuda.cpp b/torch/csrc/distributed/rpc/tensorpipe_cuda.cpp index 968f599752d64..50cc97785f61d 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_cuda.cpp +++ b/torch/csrc/distributed/rpc/tensorpipe_cuda.cpp @@ -74,7 +74,7 @@ C10_REGISTER_CREATOR( class TensorpipeCudaConverter : public TensorpipeDeviceTypeConverter { public: - c10::optional> prepareTensorForSending( + std::optional> prepareTensorForSending( const c10::Storage& storage, const std::vector& streams, tensorpipe::Message& message) const override { diff --git a/torch/csrc/distributed/rpc/tensorpipe_utils.cpp b/torch/csrc/distributed/rpc/tensorpipe_utils.cpp index 0b3715f44f86d..929ae30f8a6d4 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_utils.cpp +++ b/torch/csrc/distributed/rpc/tensorpipe_utils.cpp @@ -38,7 +38,7 @@ inline c10::Device indexToDevice(c10::DeviceIndex index) { class TensorpipeCpuConverter : public TensorpipeDeviceTypeConverter { public: - c10::optional> prepareTensorForSending( + std::optional> prepareTensorForSending( const c10::Storage& storage, const std::vector& /* streams */, tensorpipe::Message& message) const override { @@ -192,7 +192,7 @@ std::tuple tensorpipeSerialize( tensor.device()); TORCH_INTERNAL_ASSERT(tpMessage.tensors.size() == i); - c10::optional> maybeCopiedTensor = + std::optional> maybeCopiedTensor = converter->prepareTensorForSending( tensor.storage(), streams, tpMessage); TORCH_INTERNAL_ASSERT(tpMessage.tensors.size() == i + 1); diff --git a/torch/csrc/distributed/rpc/tensorpipe_utils.h b/torch/csrc/distributed/rpc/tensorpipe_utils.h index 1011a9c34c3d8..d269a5bfbf565 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_utils.h +++ b/torch/csrc/distributed/rpc/tensorpipe_utils.h @@ -27,7 +27,7 @@ class TensorpipeDeviceTypeConverter { // cannot include the TensorPipe headers because it's a private dependency. // Thus we bend over backwards and entrust this method with appending that // object to the `tensors` field of the tensorpipe::Message object we pass. 
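// [Editor's illustrative sketch, not part of the diff.] A standalone example
// (assumed names, not the TensorPipe converter API) of the "maybe copy before
// sending" pattern: the function returns std::nullopt when the buffer can be
// sent in place, or an owned staging copy that must stay alive until the send
// completes.
#include <cstring>
#include <iostream>
#include <optional>
#include <vector>

std::optional<std::vector<char>> prepareForSending(const char* data,
                                                   size_t size,
                                                   bool needs_staging) {
  if (!needs_staging) {
    return std::nullopt;  // caller sends `data` directly
  }
  std::vector<char> staged(size);
  std::memcpy(staged.data(), data, size);
  return staged;  // caller sends the staged copy instead
}

int main() {
  const char payload[] = "hello";
  auto maybe_copy = prepareForSending(payload, sizeof(payload), true);
  std::cout << (maybe_copy ? "staged copy of " : "in-place send of ")
            << sizeof(payload) << " bytes\n";
}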
- virtual c10::optional> prepareTensorForSending( + virtual std::optional> prepareTensorForSending( const c10::Storage& storage, const std::vector& streams, tensorpipe::Message& message) const = 0; diff --git a/torch/csrc/dynamo/compiled_autograd.h b/torch/csrc/dynamo/compiled_autograd.h index a92d6ade0c002..ca2fd412cf8d4 100644 --- a/torch/csrc/dynamo/compiled_autograd.h +++ b/torch/csrc/dynamo/compiled_autograd.h @@ -232,7 +232,7 @@ class CompiledNodeArgs { collect(t.list); } template - void collect(const c10::optional& t) { + void collect(const std::optional& t) { if (cond(t.has_value())) { collect(*t); } @@ -520,20 +520,20 @@ class CompiledNodeArgs { struct TraceState { TraceState( - const std::vector>& ss, + const std::vector>& ss, size_t num_outputs) : sym_sizes(ss), outputs(num_outputs) {} void debug_asserts() { TORCH_INTERNAL_ASSERT(sym_sizes_index == sym_sizes.size()); } - c10::optional next_sym_size() { + std::optional next_sym_size() { TORCH_INTERNAL_ASSERT(sym_sizes_index < sym_sizes.size()); return sym_sizes[sym_sizes_index++]; } size_t sym_sizes_index{0}; - std::vector> sym_sizes; + std::vector> sym_sizes; variable_list outputs; }; @@ -664,13 +664,13 @@ class SwapSavedVariables { } template - void before(c10::optional& t) { + void before(std::optional& t) { if (t.has_value()) { before(*t); } } template - void after(c10::optional& t) { + void after(std::optional& t) { if (t.has_value()) { after(*t); } diff --git a/torch/csrc/dynamo/cpython_defs.c b/torch/csrc/dynamo/cpython_defs.c index bf710b9ff7e9f..c301da9829f50 100644 --- a/torch/csrc/dynamo/cpython_defs.c +++ b/torch/csrc/dynamo/cpython_defs.c @@ -13,6 +13,17 @@ } else { \ } +#if IS_PYTHON_3_13_PLUS +// Gave up after fixing a few of these +// pycore_opcode.h is gone (new is pycore_opcode_metadata.h ?) +// f_code is gone (new is f_executable?) 
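// [Editor's illustrative sketch, not part of the diff.] A standalone example
// (illustrative macro and symbol names, not the real IS_PYTHON_3_13_PLUS
// plumbing) of the version-gating approach taken above: on an unsupported
// interpreter version the real implementation is compiled out and replaced by
// inert stub definitions so the rest of the code still links.
#include <cstdint>
#include <cstdio>

#define SKETCH_PY_3_13_PLUS 1  // assume a 3.13+ build for this example

#if SKETCH_PY_3_13_PLUS
// Stubs: no opcode cache table is available on this version.
const uint8_t* kOpcodeCaches = nullptr;
const int kOpcodeCachesSize = 0;
#else
// The real table would be defined here for older versions.
const uint8_t kRealCaches[] = {1, 2, 3};
const uint8_t* kOpcodeCaches = kRealCaches;
const int kOpcodeCachesSize = sizeof(kRealCaches);
#endif

int main() {
  std::printf("opcode cache entries: %d (table %s)\n",
              kOpcodeCachesSize,
              kOpcodeCaches ? "present" : "absent");
}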
+ +// Fake definitions for what we removed +const uint8_t* THP_PyOpcode_Caches = NULL; +const int THP_PyOpcode_Caches_size = 0; + +#else + // NOTE: all `assert`s below are converted to `CHECK`s #if IS_PYTHON_3_11_PLUS @@ -29,8 +40,8 @@ #define NEED_OPCODE_TABLES // To get _PyOpcode_Deopt, _PyOpcode_Caches #include #undef NEED_OPCODE_TABLES -#undef Py_BUILD_CORE #include +#undef Py_BUILD_CORE // As a simple way to reduce the impact of ABI changes on the CPython side, this check forces // us to manually re-check that the function didn't change on the next major version @@ -677,3 +688,5 @@ const uint8_t* THP_PyOpcode_Caches = NULL; const int THP_PyOpcode_Caches_size = 0; #endif + +#endif // CPython 3.13 \ No newline at end of file diff --git a/torch/csrc/dynamo/cpython_defs.h b/torch/csrc/dynamo/cpython_defs.h index b762f87d69df3..d4432b8bb43d4 100644 --- a/torch/csrc/dynamo/cpython_defs.h +++ b/torch/csrc/dynamo/cpython_defs.h @@ -8,7 +8,9 @@ #if IS_PYTHON_3_11_PLUS +#define Py_BUILD_CORE #include +#undef Py_BUILD_CORE int THP_PyFrame_FastToLocalsWithError( _PyInterpreterFrame* frame, diff --git a/torch/csrc/dynamo/eval_frame.c b/torch/csrc/dynamo/eval_frame.c index b6a26f635ec4c..cbe9ab37a5dd6 100644 --- a/torch/csrc/dynamo/eval_frame.c +++ b/torch/csrc/dynamo/eval_frame.c @@ -8,6 +8,31 @@ #include #include + + +PyObject* guard_error_hook = NULL; +const char* cache_lookup_profiler_str = "TorchDynamo Cache Lookup"; + +static int active_dynamo_threads = 0; + +static Py_tss_t eval_frame_callback_key = Py_tss_NEEDS_INIT; + +inline static PyObject* eval_frame_callback_get(void) { + void* result = PyThread_tss_get(&eval_frame_callback_key); + if (unlikely(result == NULL)) { + return (PyObject*)Py_None; + } else { + return (PyObject*)result; + } +} + +inline static void eval_frame_callback_set(PyObject* obj) { + PyThread_tss_set(&eval_frame_callback_key, obj); +} + +// 3.13 Not supported at all. 
See cpython_defs.c for hints +#if !(IS_PYTHON_3_13_PLUS) + // Problem in CPython includes when mixing core and non-core build // The fix was not backported to 3.12 so this is needed here // https://github.com/python/cpython/issues/105268 @@ -138,24 +163,6 @@ THP_PyFrame_FastToLocalsWithError(THP_EVAL_API_FRAME_OBJECT *frame, int *free_va } #endif -PyObject* guard_error_hook = NULL; -const char* cache_lookup_profiler_str = "TorchDynamo Cache Lookup"; - -static Py_tss_t eval_frame_callback_key = Py_tss_NEEDS_INIT; - -inline static PyObject* eval_frame_callback_get(void) { - void* result = PyThread_tss_get(&eval_frame_callback_key); - if (unlikely(result == NULL)) { - return (PyObject*)Py_None; - } else { - return (PyObject*)result; - } -} - -inline static void eval_frame_callback_set(PyObject* obj) { - PyThread_tss_set(&eval_frame_callback_key, obj); -} - static PyObject* _custom_eval_frame_shim( PyThreadState* tstate, THP_EVAL_API_FRAME_OBJECT* frame, @@ -674,7 +681,29 @@ static PyObject* _custom_eval_frame( } } -static int active_dynamo_threads = 0; +#else // IS_PYTHON_3_13_PLUS + +// Fake definitions for everything we removed + +typedef struct THPPyInterpreterFrame { + PyObject_HEAD + _PyInterpreterFrame* frame; // Borrowed reference +} THPPyInterpreterFrame; + +inline static void enable_eval_frame_shim(PyThreadState* tstate) {} +inline static void enable_eval_frame_default(PyThreadState* tstate) {} + +static struct PyGetSetDef THPPyInterpreterFrame_properties[] = {NULL}; + +static PyTypeObject THPPyInterpreterFrameType = { + PyVarObject_HEAD_INIT(NULL, 0) + .tp_name = "torch._C.dynamo.eval_frame._PyInterpreterFrame", + .tp_basicsize = sizeof(THPPyInterpreterFrame), + .tp_flags = Py_TPFLAGS_DEFAULT, + .tp_getset = THPPyInterpreterFrame_properties, +}; + +#endif // CPython 3.13 static PyObject* increment_working_threads(PyThreadState* tstate) { active_dynamo_threads = active_dynamo_threads + 1; diff --git a/torch/csrc/dynamo/python_compiled_autograd.cpp b/torch/csrc/dynamo/python_compiled_autograd.cpp index dd5ea7cbd094f..fb27b39b28e6a 100644 --- a/torch/csrc/dynamo/python_compiled_autograd.cpp +++ b/torch/csrc/dynamo/python_compiled_autograd.cpp @@ -203,12 +203,12 @@ struct CacheNode { return pyinput; } - std::vector> unwrap_dynamic_inputs( + std::vector> unwrap_dynamic_inputs( PyObject* pyresult) const { TORCH_INTERNAL_ASSERT(PyList_CheckExact(pyresult)); size_t idx = 0; size_t result_len = PyList_GET_SIZE(pyresult); - std::vector> result; + std::vector> result; result.reserve(expected_sizes.size()); for (const auto& i : expected_sizes) { if (i.dyn_type == SizeInput::DYNAMIC) { diff --git a/torch/csrc/functorch/init.cpp b/torch/csrc/functorch/init.cpp index c2996fe5278a7..6bce80ad27766 100644 --- a/torch/csrc/functorch/init.cpp +++ b/torch/csrc/functorch/init.cpp @@ -375,7 +375,7 @@ static int64_t currentLevel() { return current_level; } -static c10::optional maybe_current_level() { +static std::optional maybe_current_level() { auto maybe_layer = maybeCurrentDynamicLayer(); if (maybe_layer.has_value()) { int current_level = maybe_layer->layerId(); @@ -438,7 +438,7 @@ struct PreserveDynamicLayerStack { } // anonymous namespace -static std::tuple> unwrapBatched( +static std::tuple> unwrapBatched( const Tensor& tensor, int64_t level) { auto* batched = maybeGetBatchedImpl(tensor); @@ -534,7 +534,7 @@ void initFuncTorchBindings(PyObject* module) { return maybe_get_level(tensor) != -1; }); m.def( - "get_interpreter_stack", []() -> c10::optional> { + "get_interpreter_stack", []() -> 
std::optional> { const auto& stack = getDynamicLayerStack(); if (stack.empty()) { return c10::nullopt; @@ -545,7 +545,7 @@ void initFuncTorchBindings(PyObject* module) { } return result; }); - m.def("peek_interpreter_stack", []() -> c10::optional { + m.def("peek_interpreter_stack", []() -> std::optional { const auto& stack = getDynamicLayerStack(); if (stack.empty()) { return c10::nullopt; diff --git a/torch/csrc/inductor/aoti_eager/kernel_holder.cpp b/torch/csrc/inductor/aoti_eager/kernel_holder.cpp index 55c0d71c55f4b..238050f501223 100644 --- a/torch/csrc/inductor/aoti_eager/kernel_holder.cpp +++ b/torch/csrc/inductor/aoti_eager/kernel_holder.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -12,6 +13,11 @@ #endif #include +#include +#include +#include +#include + namespace torch::inductor { namespace { @@ -75,8 +81,8 @@ bool unpack_ivalue( // ivalue is scalar unpack_scalar_ivalue(ivalue, device, inputs); } else if ( - *argument.real_type() == *c10::getTypePtr>()) { - // ivalue is c10::optional + *argument.real_type() == *c10::getTypePtr>()) { + // ivalue is std::optional unpack_optional_tensor_ivalue(ivalue, device, inputs); } else { // Unsupport IValue type. @@ -115,14 +121,16 @@ AOTIPythonKernelHolder::AOTIPythonKernelHolder( (device_.type() == c10::DeviceType::CPU) || (device_.type() == c10::DeviceType::CUDA), "Unsupported device type"); + init_aoti_kernel_cache(); } void AOTIPythonKernelHolder::operator()( const c10::OperatorHandle& op, c10::DispatchKeySet keyset, torch::jit::Stack* stack) { - if (cache_lookup(op, keyset, stack)) { - cache_hit(op, keyset, stack); + AOTIKernelState kernel_state; + if (cache_lookup(op, keyset, stack, kernel_state)) { + cache_hit(kernel_state, op, keyset, stack); } else { cache_miss(op, keyset, stack); } @@ -130,23 +138,190 @@ void AOTIPythonKernelHolder::operator()( bool AOTIPythonKernelHolder::cache_lookup( const c10::OperatorHandle& op, - c10::DispatchKeySet keyset, - torch::jit::Stack* stack) { - // TODO: Always return false now to implement cache_miss. Later, we will add - // cache lookup and implement cache hit. 
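// [Editor's illustrative sketch, not part of the diff.] A standalone example
// (hypothetical names, not the AOTIPythonKernelHolder API) of the dispatch
// shape introduced above: cache_lookup() fills an out-parameter state and
// returns true on a hit, so the call operator either runs the cached kernel
// or falls back to a compile-and-cache path.
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>

struct KernelState {
  std::function<int(int)> run;  // stands in for the compiled kernel runner
};

class KernelCacheSketch {
 public:
  bool cache_lookup(const std::string& key, KernelState& state) const {
    auto it = cache_.find(key);
    if (it == cache_.end()) {
      return false;
    }
    state = it->second;
    return true;
  }

  int operator()(const std::string& key, int input) {
    KernelState state;
    if (cache_lookup(key, state)) {
      return cache_hit(state, input);
    }
    return cache_miss(key, input);
  }

 private:
  int cache_hit(const KernelState& state, int input) { return state.run(input); }

  int cache_miss(const std::string& key, int input) {
    // "Compile" a kernel (here: a trivial lambda), store it, then run it.
    KernelState state{[](int x) { return x * 2; }};
    cache_[key] = state;
    return state.run(input);
  }

  std::unordered_map<std::string, KernelState> cache_;
};

int main() {
  KernelCacheSketch holder;
  std::cout << holder("mul2", 21) << "\n";  // miss: compiles, prints 42
  std::cout << holder("mul2", 10) << "\n";  // hit: prints 20
}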
- return false; + const c10::DispatchKeySet& keyset, + const torch::jit::Stack* stack, + AOTIKernelState& kernel_state) { + TORCH_CHECK_NOT_IMPLEMENTED( + op.schema().returns().size() == 1, + "Not implemented for operations that return either multiple values or no value."); + TORCH_CHECK_NOT_IMPLEMENTED( + op.schema().returns()[0].type()->isSubtypeOf(c10::TensorType::get()), + "Not implemented for operations that return a non-Tensor value."); + + std::vector inputs; + auto res = unpack_tensors(op.schema().arguments(), *stack, device_, inputs); + TORCH_CHECK_NOT_IMPLEMENTED( + res && inputs.size() > 0, + "Not implemented for operations that contain a parameter which is ", + "not one of the following types: at::Tensor, at::TensorList, ", + "std::optional, std::vector>."); + + auto inputs_metadata = get_inputs_metadata(inputs); + auto aoti_kernel_state = aoti_kernel_cache_.find(inputs_metadata); + if (aoti_kernel_state == aoti_kernel_cache_.end()) { + return false; + } + + if (aoti_kernel_state->second.tensor_checks_.size() != inputs.size()) { + return false; + } + + torch::dynamo::LocalState local_state; + local_state.overrideDispatchKeySet(c10::DispatchKeySet(dispatch_key_)); + + for (size_t i = 0; i < inputs.size(); ++i) { + bool pass = aoti_kernel_state->second.tensor_checks_[i].check( + local_state, inputs[i]); + if (!pass) { + return false; + } + } + + kernel_state = aoti_kernel_state->second; + return true; } void AOTIPythonKernelHolder::cache_hit( + const AOTIKernelState& kernel_state, const c10::OperatorHandle& op, - c10::DispatchKeySet keyset, + const c10::DispatchKeySet& keyset, torch::jit::Stack* stack) { - TORCH_INTERNAL_ASSERT(false); + std::vector inputs; + unpack_tensors(op.schema().arguments(), *stack, device_, inputs); + torch::jit::drop(*stack, op.schema().arguments().size()); + + auto outputs = kernel_state.kernel_runner_->run(inputs); + for (auto& output : outputs) { + stack->push_back(output); + } +} + +AOTIKernelMetadata AOTIPythonKernelHolder::get_inputs_metadata( + const std::vector& inputs) { + AOTIKernelMetadata inputs_metadata; + for (const auto& input : inputs) { + auto device = input.device(); + if (device.is_cpu()) { + // If the device is CPU, set the device index to -1. + device = c10::Device(device.type(), -1); + } + + inputs_metadata.emplace_back( + false, // is symbloic + input.scalar_type(), + device, + input.sizes().vec(), + input.strides().vec()); + } + return inputs_metadata; +} + +void AOTIPythonKernelHolder::init_aoti_kernel_cache() { + if (device_.type() == c10::DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES) { + return; + } + + py::gil_scoped_acquire gil; + + py::handle load_aoti_eager_cache_function = + py::module::import("torch._inductor.utils").attr("load_aoti_eager_cache"); + TORCH_INTERNAL_ASSERT( + load_aoti_eager_cache_function.ptr() != nullptr, + "Failed to import - torch._inductor.utils.load_aoti_eager_cache"); + + auto result = py::reinterpret_steal(PyObject_CallFunctionObjArgs( + load_aoti_eager_cache_function.ptr(), + py::str(ns_).ptr(), + py::str(op_name_with_overload_).ptr(), + py::str(c10::DeviceTypeName(device_.type(), true)).ptr(), + nullptr)); + TORCH_INTERNAL_ASSERT( + result.ptr() != nullptr && result.ptr() != Py_None, + "Failed to load AOTI kernel. 
Operator Name is ", + op_name_with_overload_); + + auto kernel_info_list = result.cast(); + for (auto kernel_info : kernel_info_list) { + auto item_dict = kernel_info.cast(); + + // Access the kernel_path field + auto kernel_path = item_dict["kernel_path"].cast(); + + // Access the meta_info list + auto inputs_metadata = item_dict["meta_info"].cast(); + + std::vector tensor_checks; + std::vector tensor_metadata_list; + + torch::dynamo::LocalState state; + // Loop over the meta_info list + for (auto item : inputs_metadata) { + // Convert the handle to a dict + auto metadata = item.cast(); + + // Access the fields of each metadata dict + auto is_dynamic = metadata["is_dynamic"].cast(); + auto device_type = metadata["device_type"].cast(); + auto device_index = metadata["device_index"].cast(); + auto data_type_obj = metadata["dtype"].cast(); + TORCH_INTERNAL_ASSERT(THPDtype_Check(data_type_obj.ptr())); + auto data_type = + reinterpret_cast(data_type_obj.ptr())->scalar_type; + auto sizes = metadata["sizes"].cast>(); + auto strides = metadata["strides"].cast>(); + + std::vector> sym_optional_sizes; + std::vector> sym_optional_strides; + for (int64_t size : sizes) { + sym_optional_sizes.push_back(std::optional(size)); + } + for (int64_t stride : strides) { + sym_optional_strides.push_back(std::optional(stride)); + } + + // Now you can use these variables in your code + tensor_metadata_list.emplace_back( + is_dynamic, + data_type, + c10::Device(c10::Device(device_type).type(), device_index), + sizes, + strides); + tensor_checks.emplace_back( + state, + nullptr, + uint64_t(c10::DispatchKeySet(dispatch_key_).raw_repr()), + data_type, + c10::DeviceIndex(device_index), + sym_optional_sizes, + sym_optional_strides); + } + + AOTIKernelState aoti_kernel_state; + aoti_kernel_state.kernel_runner_ = load_aoti_model_runner(kernel_path); + aoti_kernel_state.tensor_checks_ = tensor_checks; + aoti_kernel_cache_[tensor_metadata_list] = aoti_kernel_state; + } +} + +std::shared_ptr AOTIPythonKernelHolder:: + load_aoti_model_runner(const std::string& so_path) { + if (device_.type() == c10::DeviceType::CUDA) { +#ifdef USE_CUDA + return std::make_shared(so_path); +#else + return nullptr; +#endif + } else if (device_.type() == c10::DeviceType::CPU) { + return std::make_shared(so_path); + } else { + TORCH_WARN("Unsupported device type"); + return nullptr; + } } void AOTIPythonKernelHolder::cache_miss( const c10::OperatorHandle& op, - c10::DispatchKeySet keyset, + const c10::DispatchKeySet& keyset, torch::jit::Stack* stack) { auto kernel_lib_path = produce_aoti_kernel_lib(op, keyset, stack); std::shared_ptr kernel = nullptr; @@ -167,41 +342,41 @@ void AOTIPythonKernelHolder::cache_miss( unpack_tensors(op.schema().arguments(), *stack, device_, inputs), "Failed to unpack tensors for the stack to run the AOTI kernel."); auto outputs = kernel->run(inputs); - if (outputs.size() > 0) { - torch::jit::drop(*stack, op.schema().arguments().size()); - // TODO: Get the output type of this operation and then convert to the - // output type. - for (auto& output : outputs) { - torch::jit::push(*stack, std::move(output)); - } + torch::jit::drop(*stack, op.schema().arguments().size()); + // TODO: Get the output type of this operation and then convert to the + // output type. 
+ for (auto& output : outputs) { + torch::jit::push(*stack, std::move(output)); } } std::string AOTIPythonKernelHolder::produce_aoti_kernel_lib( const c10::OperatorHandle& op, - c10::DispatchKeySet keyset, - torch::jit::Stack* stack) { + const c10::DispatchKeySet& keyset, + const torch::jit::Stack* stack) { auto arguments = torch::jit::last(*stack, op.schema().arguments().size()); - py::gil_scoped_acquire gil; - - // Get the corresponding python operation for the current operator and the - // python operation will pass to the AOT Inductor to generate the kernel - // library. const auto& schema = op.schema(); const auto& qualified_name = op.operator_name().name; const auto& overload_name = schema.overload_name().empty() ? "default" : schema.overload_name(); auto pos = qualified_name.find("::"); TORCH_INTERNAL_ASSERT(pos != std::string::npos, qualified_name); - // Make me some null terminated strings - std::string ns_str = qualified_name.substr(0, pos); - const char* ns = ns_str.c_str(); - const char* func_name = qualified_name.c_str() + pos + strlen("::"); + std::string ns_str(qualified_name.begin(), qualified_name.begin() + pos); + std::string func_name( + qualified_name.begin() + pos + strlen("::"), qualified_name.end()); + + py::gil_scoped_acquire gil; py::handle op_py_func = op.getPythonOp(pyinterpreter_, [&]() -> PyObject* { - py::handle torch_api_function = - py::module::import("torch").attr("ops").attr(ns).attr(func_name); - return torch_api_function.attr(overload_name.c_str()).ptr(); + py::handle torch_api_function = py::module::import("torch") + .attr("ops") + .attr(ns_str.c_str()) + .attr(func_name.c_str()); + if (overload_name.empty()) { + return torch_api_function.attr("default").ptr(); + } else { + return torch_api_function.attr(overload_name.c_str()).ptr(); + } }); TORCH_INTERNAL_ASSERT( @@ -212,17 +387,22 @@ std::string AOTIPythonKernelHolder::produce_aoti_kernel_lib( overload_name); py::handle aot_compile_function = - py::module::import("torch._export").attr("aot_compile"); + py::module::import("torch._inductor.utils") + .attr("aoti_compile_with_persistent_cache"); TORCH_INTERNAL_ASSERT( aot_compile_function.ptr() != nullptr && aot_compile_function.ptr() != Py_None, - "Failed to import - torch._export.aot_compile"); + "Failed to import - torch._inductor.utils.aoti_compile_with_persistent_cache"); // Pass the python operation to the AOT Inductor to generate the kernel // library. auto args_kwargs = parseIValuesToPyArgsKwargs(op, arguments.vec()); auto result = py::reinterpret_steal(PyObject_CallFunctionObjArgs( aot_compile_function.ptr(), + py::str(ns_str).ptr(), + py::str(op_name_with_overload_).ptr(), + py::str(c10::DeviceTypeName(device_.type(), true)).ptr(), + py::bool_(false).ptr(), op_py_func.ptr(), args_kwargs.first.ptr(), args_kwargs.second.ptr(), diff --git a/torch/csrc/inductor/aoti_eager/kernel_holder.h b/torch/csrc/inductor/aoti_eager/kernel_holder.h index f7a886eb266bd..9cbcc217d7c30 100644 --- a/torch/csrc/inductor/aoti_eager/kernel_holder.h +++ b/torch/csrc/inductor/aoti_eager/kernel_holder.h @@ -4,6 +4,8 @@ #include #include +#include +#include #include #include @@ -11,6 +13,11 @@ namespace torch::inductor { +struct AOTIKernelState { + std::shared_ptr kernel_runner_; + std::vector tensor_checks_; +}; + // The AOTIPythonKernelHolder class uses the AOT Inductor to generate a kernel // for a specified operation. To speed up this process, the generated kernel // library is cached on disk. 
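// [Editor's illustrative sketch, not part of the diff.] A standalone example
// (hypothetical types) of the cache layout added above: the key is a vector of
// per-tensor metadata and the unordered_map is declared with an explicit hash
// functor, mirroring the AOTIKernelMetadata/AOTIKernelMetadataHash pairing.
#include <cstddef>
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

struct MetaSketch {
  int dtype;
  std::vector<long long> sizes;
  bool operator==(const MetaSketch& o) const {
    return dtype == o.dtype && sizes == o.sizes;
  }
};

struct MetaVecHash {
  size_t operator()(const std::vector<MetaSketch>& key) const {
    size_t seed = 0;
    for (const auto& m : key) {
      // boost-style hash_combine; the diff itself uses c10::hash_combine.
      seed ^= std::hash<int>()(m.dtype) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
      for (long long s : m.sizes) {
        seed ^= std::hash<long long>()(s) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
      }
    }
    return seed;
  }
};

int main() {
  std::unordered_map<std::vector<MetaSketch>, std::string, MetaVecHash> cache;
  cache[{MetaSketch{6, {3, 4}}}] = "kernel_a.so";
  std::cout << cache.at({MetaSketch{6, {3, 4}}}) << "\n";
}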
Detailed information from the input tensors is @@ -31,6 +38,10 @@ class AOTIPythonKernelHolder : public c10::OperatorKernel { // op_overload_name. c10::impl::PyInterpreter* pyinterpreter_; + std:: + unordered_map + aoti_kernel_cache_; + public: AOTIPythonKernelHolder( c10::DispatchKey dispatch_key, @@ -45,20 +56,36 @@ class AOTIPythonKernelHolder : public c10::OperatorKernel { private: bool cache_lookup( const c10::OperatorHandle& op, - c10::DispatchKeySet keyset, - torch::jit::Stack* stack); + const c10::DispatchKeySet& keyset, + const torch::jit::Stack* stack, + AOTIKernelState& kernel_state); void cache_miss( const c10::OperatorHandle& op, - c10::DispatchKeySet keyset, + const c10::DispatchKeySet& keyset, torch::jit::Stack* stack); void cache_hit( + const AOTIKernelState& kernel_state, const c10::OperatorHandle& op, - c10::DispatchKeySet keyset, + const c10::DispatchKeySet& keyset, torch::jit::Stack* stack); + // Invoke python utility function on the Inductor side to produce AOTI kernel + // for the given operation. + // Inductor utility function - + // torch._inductor.utils.aoti_compile_with_persistent_cache std::string produce_aoti_kernel_lib( const c10::OperatorHandle& op, - c10::DispatchKeySet keyset, - torch::jit::Stack* stack); + const c10::DispatchKeySet& keyset, + const torch::jit::Stack* stack); + // Invoke python utility function on the Inductor side to load AOTI kernel for + // the given operation. + // Inductor utility function - torch._inductor.utils.load_aoti_eager_cache + void init_aoti_kernel_cache(); + // Abstract the meta information of each tensor for the given operation. The + // meta infomation will be used for cache lookup as the key. + AOTIKernelMetadata get_inputs_metadata(const std::vector&); + // Load the AOTIModelContainerRunner object from the given file path. 
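// [Editor's illustrative sketch, not part of the diff.] A standalone example
// (invented class names) of the runner-loading pattern sketched by
// load_aoti_model_runner above: the implementation is chosen per device at
// runtime, and the CUDA branch exists only when a build flag is defined.
#include <iostream>
#include <memory>
#include <string>

struct RunnerSketch {
  virtual ~RunnerSketch() = default;
  virtual std::string describe() const = 0;
};

struct CpuRunnerSketch : RunnerSketch {
  explicit CpuRunnerSketch(std::string path) : path_(std::move(path)) {}
  std::string describe() const override { return "cpu:" + path_; }
  std::string path_;
};

#ifdef SKETCH_USE_CUDA
struct CudaRunnerSketch : RunnerSketch {
  explicit CudaRunnerSketch(std::string path) : path_(std::move(path)) {}
  std::string describe() const override { return "cuda:" + path_; }
  std::string path_;
};
#endif

std::shared_ptr<RunnerSketch> load_runner(const std::string& so_path,
                                          bool is_cuda) {
  if (is_cuda) {
#ifdef SKETCH_USE_CUDA
    return std::make_shared<CudaRunnerSketch>(so_path);
#else
    return nullptr;  // CUDA support not compiled in
#endif
  }
  return std::make_shared<CpuRunnerSketch>(so_path);
}

int main() {
  auto runner = load_runner("/tmp/kernel.so", /*is_cuda=*/false);
  std::cout << (runner ? runner->describe() : "unavailable") << "\n";
}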
+ std::shared_ptr load_aoti_model_runner( + const std::string&); }; } // namespace torch::inductor diff --git a/torch/csrc/inductor/aoti_eager/kernel_meta_info.cpp b/torch/csrc/inductor/aoti_eager/kernel_meta_info.cpp new file mode 100644 index 0000000000000..e89c59142328f --- /dev/null +++ b/torch/csrc/inductor/aoti_eager/kernel_meta_info.cpp @@ -0,0 +1,64 @@ +#if !defined(C10_MOBILE) && !defined(ANDROID) +#include + +namespace torch::inductor { + +TensorMetadata::TensorMetadata(const at::Tensor& src_tensor) + : is_symbolic_(false), + device_(src_tensor.device()), + sizes_(src_tensor.sizes().vec()), + strides_(src_tensor.sizes().vec()) {} + +TensorMetadata::TensorMetadata( + bool is_symbolic, + c10::ScalarType dtype, + c10::Device device, + std::vector sizes, + std::vector strides) + : is_symbolic_(is_symbolic), + dtype_(dtype), + device_(device), + sizes_(sizes), + strides_(strides) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + !is_symbolic_, "Not support symbolic shape now"); +} + +bool TensorMetadata::operator==(const TensorMetadata& other) const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + !is_symbolic_, "Not support symbolic shape now"); + return this->is_symbolic_ == other.is_symbolic_ && + this->dtype_ == other.dtype_ && + this->device_.type() == other.device_.type() && + this->sizes_ == other.sizes_ && this->strides_ == other.strides_; +} + +size_t TensorMetadataHash::operator()( + const TensorMetadata& tensor_metadata) const { + auto hash = std::hash()(tensor_metadata.is_symbolic_); + hash = c10::hash_combine( + hash, std::hash()(tensor_metadata.dtype_)); + hash = c10::hash_combine( + hash, std::hash()(tensor_metadata.device_.type())); + + for (auto& e : tensor_metadata.sizes_) { + hash = c10::hash_combine(hash, std::hash()(e)); + } + + for (auto& e : tensor_metadata.strides_) { + hash = c10::hash_combine(hash, std::hash()(e)); + } + return hash; +} + +size_t AOTIKernelMetadataHash::operator()( + const AOTIKernelMetadata& aoti_kernel_metadata) const { + size_t hash = 0; + for (auto& e : aoti_kernel_metadata) { + hash = c10::hash_combine(hash, TensorMetadataHash()(e)); + } + return hash; +} + +} // namespace torch::inductor +#endif diff --git a/torch/csrc/inductor/aoti_eager/kernel_meta_info.h b/torch/csrc/inductor/aoti_eager/kernel_meta_info.h new file mode 100644 index 0000000000000..c7f8315d2707a --- /dev/null +++ b/torch/csrc/inductor/aoti_eager/kernel_meta_info.h @@ -0,0 +1,67 @@ +#if !defined(C10_MOBILE) && !defined(ANDROID) +#pragma once + +#include +#include + +#include + +namespace torch::inductor { + +// Regarding a aten operation implemented by AOTI, the metadata of the input +// tensors will be cached on the disk to acclerate next run. TensorMetada +// structure is to represent the metadata of each input tensor. it includes +// whether the tensor is symbolic, the dtype, the device, the sizes and the +// strides of the tensor. When the metadata of the input tensors is the same as +// the cached metadata, the cached kernel library will be loaded and executed. +// Otherwise, the AOT Inductor will be called again to generate the kernel +// library. +// Beyond the TensorMetadata, we build guard/TensorCheck for each input tensor +// as well to support symbolic shape. We intend to utilize TensorCheck to find +// out the proper kernel rather than TensorMetada comparison. 
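// [Editor's illustrative sketch, not part of the diff.] A standalone example
// (hypothetical helper, not the dynamo TensorCheck API) of the size-guard idea
// described in the comment above: cached sizes are held as
// std::optional<int64_t>, where std::nullopt marks a dynamic dimension that
// matches any concrete size.
#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

bool sizes_match(const std::vector<std::optional<int64_t>>& cached,
                 const std::vector<int64_t>& actual) {
  if (cached.size() != actual.size()) {
    return false;
  }
  for (size_t i = 0; i < cached.size(); ++i) {
    if (cached[i].has_value() && *cached[i] != actual[i]) {
      return false;  // static dimension mismatch
    }
  }
  return true;  // every dimension is either dynamic or equal
}

int main() {
  std::vector<std::optional<int64_t>> cached = {3, std::nullopt, 5};
  std::cout << sizes_match(cached, {3, 4, 5}) << " "
            << sizes_match(cached, {2, 4, 5}) << "\n";  // prints 1 0
}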
Suppose an +// operation with a single input tensor and two kernels: +// kernel1: TensorMetadata(is_symbolic=false, dtype=Float, device=CPU, +// sizes=[s0, s1, s2], strides=[s1 * s2, s2, 1]) kernel2: +// TensorMetadata(is_symbolic=false, dtype=Float, device=CPU, sizes=[3, s1, +// s2], strides=[s1 * s2, s2, 1]) +// If a tensor with sizes=[3, 4, 5] is passed to the operation, both kernel1 and +// kernel2 support the tensor shape. In this case, we need to use TensorCheck +// plus some heruistic rules to find out the proper kernel. +struct TensorMetadata { + // Indicate whether the tensor is symbolic and it may be concluded by sizes_ + // and strides_ in the future. + bool is_symbolic_; + // Dtype of a tensor(For scalar, we will wrap it as a scalar tensor) + c10::ScalarType dtype_; + // Device of a tensor. + c10::Device device_; + // Sizes of a tensor. Currently, we only support static shape and use int64_t + // to represent the sizes. In the future, we will create symbolic size and use + // SymInt to represent it to support symbolic shape. + std::vector sizes_; + // Strides of a tensor. For symbolic shape support, it is the same as sizes_ + std::vector strides_; + + TensorMetadata(const at::Tensor& src_tensor); + TensorMetadata( + bool is_symbolic, + c10::ScalarType dtype, + c10::Device device, + std::vector sizes, + std::vector strides); + + bool operator==(const TensorMetadata& other) const; +}; + +struct TensorMetadataHash { + size_t operator()(const TensorMetadata&) const; +}; + +using AOTIKernelMetadata = std::vector; + +struct AOTIKernelMetadataHash { + size_t operator()(const AOTIKernelMetadata&) const; +}; + +} // namespace torch::inductor +#endif diff --git a/torch/csrc/inductor/aoti_torch/generated/c_shim_cpu.h b/torch/csrc/inductor/aoti_torch/generated/c_shim_cpu.h index f31c52408aa77..8058618f97486 100644 --- a/torch/csrc/inductor/aoti_torch/generated/c_shim_cpu.h +++ b/torch/csrc/inductor/aoti_torch/generated/c_shim_cpu.h @@ -67,6 +67,8 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_geqrf(AtenTensorHandle self, Ate AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_grid_sampler_2d_backward(AtenTensorHandle grad_output, AtenTensorHandle input, AtenTensorHandle grid, int64_t interpolation_mode, int64_t padding_mode, int32_t align_corners, const int32_t* output_mask, int64_t output_mask_len_, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_histc(AtenTensorHandle self, int64_t bins, double min, double max, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_histogram_bin_ct(AtenTensorHandle self, int64_t bins, const double** range, int64_t range_len_, AtenTensorHandle* weight, int32_t density, AtenTensorHandle* ret0, AtenTensorHandle* ret1); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_index_Tensor(AtenTensorHandle self, const AtenTensorHandle** indices, int64_t indices_len_, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_index_put(AtenTensorHandle self, const AtenTensorHandle** indices, int64_t indices_len_, AtenTensorHandle values, int32_t accumulate, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_index_reduce(AtenTensorHandle self, int64_t dim, AtenTensorHandle index, AtenTensorHandle source, const char* reduce, int32_t include_self, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_kthvalue(AtenTensorHandle self, int64_t k, int64_t dim, int32_t keepdim, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT 
AOTITorchError aoti_torch_cpu_logcumsumexp(AtenTensorHandle self, int64_t dim, AtenTensorHandle* ret0); @@ -83,11 +85,17 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_median(AtenTensorHandle self, At AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mm_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle mat2); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mode(AtenTensorHandle self, int64_t dim, int32_t keepdim, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mul_Scalar(AtenTensorHandle self, double other, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mul_Tensor(AtenTensorHandle self, AtenTensorHandle other, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_nanmedian(AtenTensorHandle self, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_native_dropout(AtenTensorHandle input, double p, int32_t* train, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_nonzero(AtenTensorHandle self, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_ormqr(AtenTensorHandle self, AtenTensorHandle input2, AtenTensorHandle input3, int32_t left, int32_t transpose, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_pow_Scalar(double self, AtenTensorHandle exponent, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_pow_Tensor_Scalar(AtenTensorHandle self, double exponent, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_pow_Tensor_Tensor(AtenTensorHandle self, AtenTensorHandle exponent, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_rand(const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_randint(int64_t high, const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_randint_low_out(AtenTensorHandle out, int64_t low, int64_t high, const int64_t* size, int64_t size_len_); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_randn(const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_randperm(int64_t n, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_repeat_interleave_Tensor(AtenTensorHandle repeats, int64_t* output_size, AtenTensorHandle* ret0); @@ -96,10 +104,13 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_replication_pad2d_backward(AtenT AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_resize_(AtenTensorHandle self, const int64_t* size, int64_t size_len_, int32_t* memory_format, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_resize_as_(AtenTensorHandle self, AtenTensorHandle the_template, int32_t* memory_format, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_scatter_src_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, AtenTensorHandle index, AtenTensorHandle src); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_scatter_value_out(AtenTensorHandle out, AtenTensorHandle self, 
int64_t dim, AtenTensorHandle index, double value); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_scatter_reduce_two_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, AtenTensorHandle index, AtenTensorHandle src, const char* reduce, int32_t include_self); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_segment_reduce(AtenTensorHandle data, const char* reduce, AtenTensorHandle* lengths, AtenTensorHandle* indices, AtenTensorHandle* offsets, int64_t axis, int32_t unsafe, double* initial, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_slice_Tensor(AtenTensorHandle self, int64_t dim, int64_t* start, int64_t* end, int64_t step, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_soft_margin_loss_backward(AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle target, int64_t reduction, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_sort(AtenTensorHandle self, int64_t dim, int32_t descending, AtenTensorHandle* ret0, AtenTensorHandle* ret1); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_sort_stable(AtenTensorHandle self, int32_t* stable, int64_t dim, int32_t descending, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_topk(AtenTensorHandle self, int64_t k, int64_t dim, int32_t largest, int32_t sorted, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_triangular_solve(AtenTensorHandle self, AtenTensorHandle A, int32_t upper, int32_t transpose, int32_t unitriangular, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_upsample_bicubic2d_backward(AtenTensorHandle grad_output, const int64_t* output_size, int64_t output_size_len_, const int64_t* input_size, int64_t input_size_len_, int32_t align_corners, double* scales_h, double* scales_w, AtenTensorHandle* ret0); diff --git a/torch/csrc/inductor/aoti_torch/generated/c_shim_cuda.h b/torch/csrc/inductor/aoti_torch/generated/c_shim_cuda.h index 37e45a7030a56..1382be18573f0 100644 --- a/torch/csrc/inductor/aoti_torch/generated/c_shim_cuda.h +++ b/torch/csrc/inductor/aoti_torch/generated/c_shim_cuda.h @@ -74,6 +74,8 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_gcd(AtenTensorHandle self, Aten AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_geqrf(AtenTensorHandle self, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_grid_sampler_2d_backward(AtenTensorHandle grad_output, AtenTensorHandle input, AtenTensorHandle grid, int64_t interpolation_mode, int64_t padding_mode, int32_t align_corners, const int32_t* output_mask, int64_t output_mask_len_, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_histc(AtenTensorHandle self, int64_t bins, double min, double max, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_index_Tensor(AtenTensorHandle self, const AtenTensorHandle** indices, int64_t indices_len_, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_index_put(AtenTensorHandle self, const AtenTensorHandle** indices, int64_t indices_len_, AtenTensorHandle values, int32_t accumulate, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_index_reduce(AtenTensorHandle self, int64_t dim, AtenTensorHandle index, AtenTensorHandle source, const char* reduce, int32_t include_self, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError 
aoti_torch_cuda_kthvalue(AtenTensorHandle self, int64_t k, int64_t dim, int32_t keepdim, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_logcumsumexp(AtenTensorHandle self, int64_t dim, AtenTensorHandle* ret0); @@ -90,11 +92,17 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_median(AtenTensorHandle self, A AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_mm_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle mat2); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_mode(AtenTensorHandle self, int64_t dim, int32_t keepdim, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_mul_Scalar(AtenTensorHandle self, double other, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_mul_Tensor(AtenTensorHandle self, AtenTensorHandle other, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_nanmedian(AtenTensorHandle self, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_native_dropout(AtenTensorHandle input, double p, int32_t* train, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_nonzero(AtenTensorHandle self, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_ormqr(AtenTensorHandle self, AtenTensorHandle input2, AtenTensorHandle input3, int32_t left, int32_t transpose, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_pow_Scalar(double self, AtenTensorHandle exponent, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_pow_Tensor_Scalar(AtenTensorHandle self, double exponent, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_pow_Tensor_Tensor(AtenTensorHandle self, AtenTensorHandle exponent, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_rand(const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_randint(int64_t high, const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_randint_low_out(AtenTensorHandle out, int64_t low, int64_t high, const int64_t* size, int64_t size_len_); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_randn(const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_randperm(int64_t n, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_repeat_interleave_Tensor(AtenTensorHandle repeats, int64_t* output_size, AtenTensorHandle* ret0); @@ -103,10 +111,13 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_replication_pad2d_backward(Aten AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_resize_(AtenTensorHandle self, const int64_t* size, int64_t size_len_, int32_t* memory_format, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_resize_as_(AtenTensorHandle self, AtenTensorHandle the_template, int32_t* memory_format, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_scatter_src_out(AtenTensorHandle out, AtenTensorHandle self, 
int64_t dim, AtenTensorHandle index, AtenTensorHandle src); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_scatter_value_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, AtenTensorHandle index, double value); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_scatter_reduce_two_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, AtenTensorHandle index, AtenTensorHandle src, const char* reduce, int32_t include_self); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_segment_reduce(AtenTensorHandle data, const char* reduce, AtenTensorHandle* lengths, AtenTensorHandle* indices, AtenTensorHandle* offsets, int64_t axis, int32_t unsafe, double* initial, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_slice_Tensor(AtenTensorHandle self, int64_t dim, int64_t* start, int64_t* end, int64_t step, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_soft_margin_loss_backward(AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle target, int64_t reduction, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_sort(AtenTensorHandle self, int64_t dim, int32_t descending, AtenTensorHandle* ret0, AtenTensorHandle* ret1); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_sort_stable(AtenTensorHandle self, int32_t* stable, int64_t dim, int32_t descending, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_topk(AtenTensorHandle self, int64_t k, int64_t dim, int32_t largest, int32_t sorted, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_triangular_solve(AtenTensorHandle self, AtenTensorHandle A, int32_t upper, int32_t transpose, int32_t unitriangular, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_upsample_bicubic2d_backward(AtenTensorHandle grad_output, const int64_t* output_size, int64_t output_size_len_, const int64_t* input_size, int64_t input_size_len_, int32_t align_corners, double* scales_h, double* scales_w, AtenTensorHandle* ret0); diff --git a/torch/csrc/inductor/aoti_torch/shim_common.cpp b/torch/csrc/inductor/aoti_torch/shim_common.cpp index bd45a4a9f0f87..79cea0cb45ec8 100644 --- a/torch/csrc/inductor/aoti_torch/shim_common.cpp +++ b/torch/csrc/inductor/aoti_torch/shim_common.cpp @@ -775,7 +775,7 @@ AOTITorchError aoti_torch_index_put_out( const AtenTensorHandle values, bool accumulate) { AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({ - c10::List> indices_; + c10::List> indices_; indices_.reserve(num_indices); for (size_t i = 0; i < num_indices; i++) { indices_.emplace_back( diff --git a/torch/csrc/inductor/aoti_torch/utils.h b/torch/csrc/inductor/aoti_torch/utils.h index a0739afabd5ee..0964479caabd8 100644 --- a/torch/csrc/inductor/aoti_torch/utils.h +++ b/torch/csrc/inductor/aoti_torch/utils.h @@ -39,29 +39,29 @@ inline AtenTensorHandle new_tensor_handle(at::Tensor&& tensor) { // utility functions to convert a pointer to an optional value template -inline c10::optional pointer_to_optional(T* ptr) { +inline std::optional pointer_to_optional(T* ptr) { return ptr ? c10::make_optional(*ptr) : c10::nullopt; } template >> -inline c10::optional pointer_to_optional(U* ptr) { +inline std::optional pointer_to_optional(U* ptr) { return ptr ? c10::make_optional(T(*ptr)) : c10::nullopt; } template <> -inline c10::optional pointer_to_optional(AtenTensorHandle* ptr) { +inline std::optional pointer_to_optional(AtenTensorHandle* ptr) { return ptr ? 
c10::make_optional(*tensor_handle_to_tensor_pointer(*ptr)) : c10::nullopt; } template <> -inline c10::optional pointer_to_optional( +inline std::optional pointer_to_optional( const AtenTensorHandle* ptr) { return ptr ? c10::make_optional(*tensor_handle_to_tensor_pointer(*ptr)) : c10::nullopt; } -inline c10::optional pointer_to_optional_device( +inline std::optional pointer_to_optional_device( int32_t* device_type, int32_t device_index) { return device_type ? c10::make_optional(c10::Device( @@ -74,7 +74,7 @@ inline c10::optional pointer_to_optional_device( template struct is_optional : std::false_type {}; template -struct is_optional> : std::true_type {}; +struct is_optional> : std::true_type {}; template inline c10::ArrayRef pointer_to_list(T* ptr, int64_t len) { @@ -123,10 +123,10 @@ inline std::vector pointer_to_list( } template <> -inline std::vector> pointer_to_list( +inline std::vector> pointer_to_list( const AtenTensorHandle** ptr, int64_t len) { - std::vector> result; + std::vector> result; result.reserve(len); for (int64_t i = 0; i < len; i++) { result.emplace_back(pointer_to_optional(ptr[i])); @@ -143,7 +143,7 @@ inline std::array pointer_to_list(const int32_t* ptr) { // Utility function to convert a pointer to an optional list of values template -inline c10::optional> pointer_to_optional_list( +inline std::optional> pointer_to_optional_list( U** ptr, int64_t len) { return ptr diff --git a/torch/csrc/jit/api/compilation_unit.h b/torch/csrc/jit/api/compilation_unit.h index 6203905732667..8e28ef4717b93 100644 --- a/torch/csrc/jit/api/compilation_unit.h +++ b/torch/csrc/jit/api/compilation_unit.h @@ -86,7 +86,7 @@ struct TORCH_API CompilationUnit { // for historic reasons, these are defined in ir_emitter.cpp // Returns the list of Functions just defined. std::vector define( - const c10::optional& prefix, + const std::optional& prefix, const std::vector& properties, const std::vector& propResolvers, const std::vector& definitions, @@ -97,10 +97,10 @@ struct TORCH_API CompilationUnit { const Self* self, // see [name mangling] bool shouldMangle = false, - c10::optional operator_set_version = c10::nullopt); + std::optional operator_set_version = c10::nullopt); void define_hooks( - const c10::optional& prefix, + const std::optional& prefix, const std::vector& hookDefs, const std::vector& hookResolvers, const std::vector& preHookDefs, @@ -112,7 +112,7 @@ struct TORCH_API CompilationUnit { // Returns the list of Functions just defined. std::vector define( // prefix namespace to put all the defined functions into - const c10::optional& prefix, + const std::optional& prefix, const std::string& source, const ResolverPtr& resolver, const Self* self); @@ -286,19 +286,19 @@ struct TORCH_API CompilationUnit { private: std::unique_ptr define( - const c10::optional& prefix, + const std::optional& prefix, const Def& def, const ResolverPtr& resolver, const Self* self, const std::unordered_map& function_table, bool shouldMangle = false, FunctionType type = FunctionType::Method, - c10::optional version = c10::nullopt) const; + std::optional version = c10::nullopt) const; // Define a property on \p self. 
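// [Editor's illustrative sketch, not part of the diff.] A standalone example
// (invented helper name and separator) of the optional-prefix pattern in
// CompilationUnit::define above: when a namespace prefix is supplied the name
// is qualified with it, and when it is std::nullopt the name is used as-is.
#include <iostream>
#include <optional>
#include <string>

std::string qualify(const std::optional<std::string>& prefix,
                    const std::string& name) {
  return prefix ? *prefix + "." + name : name;
}

int main() {
  std::cout << qualify(std::string("mymodule"), "forward") << "\n";  // mymodule.forward
  std::cout << qualify(std::nullopt, "forward") << "\n";             // forward
}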
struct PropertyPair; PropertyPair define_property( - const c10::optional& prefix, + const std::optional& prefix, const Property& prop, const ResolverPtr& resolver, const Self* self, diff --git a/torch/csrc/jit/api/function_impl.h b/torch/csrc/jit/api/function_impl.h index 74663cfb41ce7..6ed8cb36199ef 100644 --- a/torch/csrc/jit/api/function_impl.h +++ b/torch/csrc/jit/api/function_impl.h @@ -12,7 +12,7 @@ struct TORCH_API GraphFunction : public Function { c10::QualifiedName name, std::shared_ptr graph, std::function function_creator, - c10::optional executor_execution_mode = + std::optional executor_execution_mode = c10::nullopt) : name_(std::move(name)), graph_(std::move(graph)), @@ -108,7 +108,7 @@ struct TORCH_API GraphFunction : public Function { using Function::call; bool call( Stack& stack, - c10::optional bailOut, + std::optional bailOut, c10::function_ref f) override { f(get_executor().getPlanFor(stack, bailOut).code); return true; @@ -139,7 +139,7 @@ struct TORCH_API GraphFunction : public Function { // allows users to specify Simple/Profiling Executor for function // TODO: add more executors - mutable c10::optional executor_execution_mode_; + mutable std::optional executor_execution_mode_; // if invoked on a graph that has already traced through amp // don't invoke amp pass @@ -159,7 +159,7 @@ struct TORCH_API GraphFunction : public Function { // executor_[1] - autocast cpu on // executor_[2] - autocast gpu on // executor_[3] - autocast cpu & gpu on - std::array, SpecializationKey::TotalCount> + std::array, SpecializationKey::TotalCount> executors_; // an optional function that actually creates the method when diff --git a/torch/csrc/jit/api/module.cpp b/torch/csrc/jit/api/module.cpp index e32d2bba34501..1b9932ed34d4d 100644 --- a/torch/csrc/jit/api/module.cpp +++ b/torch/csrc/jit/api/module.cpp @@ -167,8 +167,8 @@ void Module::to(at::Device device, bool non_blocking) { static void module_state_to( const autograd::Variable& variable, - const c10::optional& device, - const c10::optional& dtype, + const std::optional& device, + const std::optional& dtype, bool non_blocking) { // Need to access the `at::Tensor` as a `Variable` here. // Use the data's original device or dtype if not supplied here. 
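The hunks above swap the c10::optional spelling for std::optional in signatures while leaving c10::nullopt and c10::make_optional call sites in the bodies untouched. That only works if c10::optional is an alias of std::optional, which appears to be the premise of this migration. A minimal, self-contained sketch of that arrangement, with the c10 namespace below as an illustrative stand-in rather than the real header:

#include <iostream>
#include <optional>
#include <string>

// Illustrative stand-in for the real c10 headers: optional, nullopt and
// make_optional are plain aliases of the std facilities. This mirrors the
// arrangement the patch relies on; it is not the actual c10 implementation.
namespace c10 {
using std::make_optional;
using std::nullopt;
template <typename T>
using optional = std::optional<T>;
} // namespace c10

// Signature already migrated to std::optional<std::string>...
std::optional<std::string> find_name(bool present) {
  // ...while the body still uses the c10 spellings, just as the patched
  // files do; under the alias both resolve to the same type.
  return present ? c10::make_optional<std::string>("forward") : c10::nullopt;
}

int main() {
  std::cout << find_name(true).value_or("<none>") << '\n';   // forward
  std::cout << find_name(false).value_or("<none>") << '\n';  // <none>
}

Because the alias makes both spellings the same type, headers can be migrated file by file without breaking callers that still use the c10 names.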
@@ -180,8 +180,8 @@ static void module_state_to( } void Module::to_impl( - const c10::optional& device, - const c10::optional& dtype, + const std::optional& device, + const std::optional& dtype, bool non_blocking) { for (at::Tensor e : parameters()) { module_state_to(e, device, dtype, non_blocking); @@ -317,7 +317,7 @@ Module Module::copy() const { return Module(_ivalue()->copy()); } -Module Module::deepcopy(c10::optional device) const { +Module Module::deepcopy(std::optional device) const { return Module(_ivalue()->deepcopy(device)); } @@ -476,7 +476,7 @@ IValue Module::create_class(const c10::QualifiedName& name, Stack stack) const { Module freeze( const Module& module, - const c10::optional>& preserved_attrs, + const std::optional>& preserved_attrs, bool optimize_numerics) { TORCH_CHECK( !module.hasattr("training") || !module.is_training(), diff --git a/torch/csrc/jit/api/module.h b/torch/csrc/jit/api/module.h index 6c49b695cb6b5..0787210a4aefe 100644 --- a/torch/csrc/jit/api/module.h +++ b/torch/csrc/jit/api/module.h @@ -238,7 +238,7 @@ struct TORCH_API Module : public Object { Module copy() const; - Module deepcopy(c10::optional device = c10::nullopt) const; + Module deepcopy(std::optional device = c10::nullopt) const; // Clones both the underlying `ClassType` and the module instance(data), this // function creates a new `ClassType` and returns a new instance that has the @@ -315,8 +315,8 @@ struct TORCH_API Module : public Object { } void to_impl( - const c10::optional& device, - const c10::optional& dtype, + const std::optional& device, + const std::optional& dtype, bool non_blocking); // Extra handle for the module to delete when itself is deleted @@ -333,7 +333,7 @@ struct TORCH_API Module : public Object { // details. TORCH_API Module freeze( const Module& module, - const c10::optional>& preserved_attrs = + const std::optional>& preserved_attrs = c10::nullopt, bool optimize_numerics = true); @@ -566,7 +566,7 @@ struct slot_list_impl { bool return_module_; // size of this list, cached on first request // when we need to filter the slot list - mutable c10::optional size_; + mutable std::optional size_; friend struct Module; }; diff --git a/torch/csrc/jit/api/object.cpp b/torch/csrc/jit/api/object.cpp index 0593916dbbaea..b707e76772765 100644 --- a/torch/csrc/jit/api/object.cpp +++ b/torch/csrc/jit/api/object.cpp @@ -14,7 +14,7 @@ Object::Object( c10::StrongTypePtr(std::move(cu), type), type->numAttributes())) {} -c10::optional Object::find_method(const std::string& basename) const { +std::optional Object::find_method(const std::string& basename) const { for (Function* fn : type()->methods()) { if (fn->name() == basename) { return Method(_ivalue(), fn); diff --git a/torch/csrc/jit/api/object.h b/torch/csrc/jit/api/object.h index 7ccacf385be53..164f6e2ac073a 100644 --- a/torch/csrc/jit/api/object.h +++ b/torch/csrc/jit/api/object.h @@ -46,7 +46,7 @@ struct TORCH_API Object { struct Property { std::string name; Method getter_func; - c10::optional setter_func; + std::optional setter_func; }; void setattr(const std::string& name, c10::IValue v) { @@ -129,7 +129,7 @@ struct TORCH_API Object { const Property get_property(const std::string& name) const { for (const auto& prop : type()->properties()) { if (prop.name == name) { - c10::optional setter = c10::nullopt; + std::optional setter = c10::nullopt; if (prop.setter) { setter = Method(_ivalue(), prop.setter); } @@ -142,7 +142,7 @@ struct TORCH_API Object { const std::vector get_properties() const { return c10::fmap(type()->properties(), 
[&](ClassType::Property prop) { - c10::optional setter = c10::nullopt; + std::optional setter = c10::nullopt; if (prop.setter) { setter = Method(_ivalue(), prop.setter); } @@ -153,7 +153,7 @@ struct TORCH_API Object { }); } - c10::optional find_method(const std::string& basename) const; + std::optional find_method(const std::string& basename) const; /// Run a method from this module. /// diff --git a/torch/csrc/jit/backends/backend_debug_info.h b/torch/csrc/jit/backends/backend_debug_info.h index 1d07beb6bdb3c..291eb48132e8e 100644 --- a/torch/csrc/jit/backends/backend_debug_info.h +++ b/torch/csrc/jit/backends/backend_debug_info.h @@ -27,7 +27,7 @@ class TORCH_API PyTorchBackendDebugInfo : public torch::CustomClassHolder { public: PyTorchBackendDebugInfo() = default; - c10::optional& getDebugInfoMap() { + std::optional& getDebugInfoMap() { return debug_info_map_; } @@ -36,7 +36,7 @@ class TORCH_API PyTorchBackendDebugInfo : public torch::CustomClassHolder { } private: - c10::optional debug_info_map_; + std::optional debug_info_map_; }; #else diff --git a/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp b/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp index 5bdcbe63797c4..a0b59a73f46f9 100644 --- a/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp +++ b/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp @@ -233,7 +233,7 @@ void XNNGraph::defineAllTensorValues() { size_t buffer_idx = 0; size_t num_bytes = 0; if (val->node()->kind() == prim::Constant) { - c10::optional constant = val->node()->t(attr::value); + std::optional constant = val->node()->t(attr::value); auto const_val = constant->toIValue().toTensor(); // Need tensor data to be contiguous for serialization auto cont_const_val = const_val.contiguous(); diff --git a/torch/csrc/jit/codegen/fuser/codegen.cpp b/torch/csrc/jit/codegen/fuser/codegen.cpp index 10ddf2267b21d..2f9217e133697 100644 --- a/torch/csrc/jit/codegen/fuser/codegen.cpp +++ b/torch/csrc/jit/codegen/fuser/codegen.cpp @@ -364,7 +364,7 @@ static void emitCheckFor( std::string generateKernel( const std::string& name, const Graph& graph, - const std::vector>>& + const std::vector>>& inputs, const std::vector>& outputs, const bool use_cuda) { diff --git a/torch/csrc/jit/codegen/fuser/codegen.h b/torch/csrc/jit/codegen/fuser/codegen.h index fc0b34e55fe7e..e42adc1314320 100644 --- a/torch/csrc/jit/codegen/fuser/codegen.h +++ b/torch/csrc/jit/codegen/fuser/codegen.h @@ -18,7 +18,7 @@ namespace fuser { TORCH_API std::string generateKernel( const std::string& name, const Graph& graph, - const std::vector>>& + const std::vector>>& inputs, const std::vector>& outputs, const bool use_cuda); diff --git a/torch/csrc/jit/codegen/fuser/compiler.cpp b/torch/csrc/jit/codegen/fuser/compiler.cpp index 52dc3a07fe765..3c05b70e8341a 100644 --- a/torch/csrc/jit/codegen/fuser/compiler.cpp +++ b/torch/csrc/jit/codegen/fuser/compiler.cpp @@ -225,7 +225,7 @@ std::shared_ptr compileKernel( // Creates chunk and flattened input descriptions std::vector chunk_desc; - std::vector>> + std::vector>> flat_inputs; { size_t input_index = 0; diff --git a/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp b/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp index c930f3293aa56..5f692d50e6b54 100644 --- a/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp +++ b/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp @@ -59,7 +59,7 @@ static bool programExists(const std::string& program) { } #ifdef _MSC_VER -c10::optional exec(const std::wstring& cmd) { +std::optional exec(const 
std::wstring& cmd) { std::array buffer; std::wstring result; std::unique_ptr pipe( @@ -82,7 +82,7 @@ inline std::wstring& rtrim(std::wstring& s, const wchar_t* t = L" \t\n\r\f\v") { void activate() { wchar_t* root = nullptr; std::wstring cmd; - c10::optional exec_out; + std::optional exec_out; std::wstring path; std::wstring vcruntime_plat; std::wstring envvars; diff --git a/torch/csrc/jit/codegen/fuser/executor.cpp b/torch/csrc/jit/codegen/fuser/executor.cpp index fad7cfcd630da..8abb99283ffc7 100644 --- a/torch/csrc/jit/codegen/fuser/executor.cpp +++ b/torch/csrc/jit/codegen/fuser/executor.cpp @@ -26,7 +26,7 @@ namespace fuser { // Returns the "map size" for this run, which is the common size for all // intermediate tensors. -static c10::optional> getMapSize( +static std::optional> getMapSize( const KernelSpec& spec, at::TensorList args, at::IntArrayRef arg_subset) { @@ -67,7 +67,7 @@ static c10::optional> getMapSize( } // Tries to determine a map size for the instantiated kernel (see above) -static c10::optional> canRunKernel( +static std::optional> canRunKernel( const KernelSpec& spec, at::TensorList args) { // Short-circuits on size mismatch @@ -78,7 +78,7 @@ static c10::optional> canRunKernel( " arguments, but got ", args.size()); - c10::optional> map_size; + std::optional> map_size; for (const auto& broadcast_group : spec.inputBroadcastGroups()) { if (!map_size) { map_size = getMapSize(spec, args, broadcast_group); diff --git a/torch/csrc/jit/codegen/fuser/kernel_spec.h b/torch/csrc/jit/codegen/fuser/kernel_spec.h index 57806ed436311..2fc52f2d76f0f 100644 --- a/torch/csrc/jit/codegen/fuser/kernel_spec.h +++ b/torch/csrc/jit/codegen/fuser/kernel_spec.h @@ -117,7 +117,7 @@ struct TORCH_API KernelSpec { } // Cache functions - c10::optional> findKernel( + std::optional> findKernel( const ArgSpec& arg_spec) const { std::lock_guard guard{mutex_}; const auto it = kernels_.find(arg_spec); diff --git a/torch/csrc/jit/codegen/onednn/defer_size_check.cpp b/torch/csrc/jit/codegen/onednn/defer_size_check.cpp index 1dbef6643dba8..4d0f12564bd9c 100644 --- a/torch/csrc/jit/codegen/onednn/defer_size_check.cpp +++ b/torch/csrc/jit/codegen/onednn/defer_size_check.cpp @@ -41,7 +41,7 @@ class SizeCheckMover { // tensorexpr_elementwise_set that's defined in // torch/csrc/jit/runtime/symbolic_shape_registry_util.cpp OperatorMap schemaMap = get_tensorexpr_elementwise_set(); - c10::optional mapping = + std::optional mapping = schemaMap.find(u.user->getOperator()); return mapping == "unary"; }); diff --git a/torch/csrc/jit/codegen/onednn/graph_fuser.h b/torch/csrc/jit/codegen/onednn/graph_fuser.h index ee83edc68fc41..ab37ad0211b7a 100644 --- a/torch/csrc/jit/codegen/onednn/graph_fuser.h +++ b/torch/csrc/jit/codegen/onednn/graph_fuser.h @@ -39,7 +39,7 @@ class GraphRewriter { std::pair scanNode( Node* consumer, graph_node_list::iterator workblock_begin); - c10::optional tryMerge(Node* consumer, Node* producer); + std::optional tryMerge(Node* consumer, Node* producer); }; // This pass creates the subgraphs for oneDNN Graph Fusion Nodes. 
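In the defer_size_check.cpp hunk above, the OperatorMap lookup yields an optional string that is compared directly against "unary". That idiom is unaffected by the rename: std::optional defines mixed comparisons with the underlying value type, and a disengaged optional never compares equal, so the lookup-miss case needs no separate check. A small sketch of just that comparison, with lookup() as a made-up stand-in for the OperatorMap query:

#include <cassert>
#include <optional>
#include <string>

// Hypothetical stand-in for an OperatorMap-style query: returns the mapped
// category when present, an empty optional otherwise.
std::optional<std::string> lookup(bool found) {
  return found ? std::optional<std::string>{"unary"} : std::nullopt;
}

int main() {
  // Engaged optional: compares its contained string against the literal.
  assert(lookup(true) == "unary");
  // Disengaged optional: never equal to any value, nothing is dereferenced.
  assert(!(lookup(false) == "unary"));
  return 0;
}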
diff --git a/torch/csrc/jit/codegen/onednn/graph_helper.cpp b/torch/csrc/jit/codegen/onednn/graph_helper.cpp index fdd69f85c5d52..f8e54c8743216 100644 --- a/torch/csrc/jit/codegen/onednn/graph_helper.cpp +++ b/torch/csrc/jit/codegen/onednn/graph_helper.cpp @@ -22,7 +22,7 @@ static void fixConvOptionalBias(Node* node) { } } -static c10::optional getDimensions(Value* v) { +static std::optional getDimensions(Value* v) { if (v->type()->isSubtypeOf(TensorType::get())) { return v->type()->cast()->sizes().size(); } else { diff --git a/torch/csrc/jit/codegen/onednn/graph_rewriter.cpp b/torch/csrc/jit/codegen/onednn/graph_rewriter.cpp index c91ff9b3917a4..dfbfe467e9765 100644 --- a/torch/csrc/jit/codegen/onednn/graph_rewriter.cpp +++ b/torch/csrc/jit/codegen/onednn/graph_rewriter.cpp @@ -127,7 +127,7 @@ std::pair GraphRewriter::scanNode( // Try to merge `producer` into `consumer`. If successful, this destroys // `producer` and returns the `consumer` group. -c10::optional GraphRewriter::tryMerge(Node* consumer, Node* producer) { +std::optional GraphRewriter::tryMerge(Node* consumer, Node* producer) { AT_ASSERT(llgaHelper_.isLlgaSubgraph(consumer)); bool canMerge = llgaHelper_.shouldMerge(producer, consumer) && aliasDb_.moveBeforeTopologicallyValid(producer, consumer); diff --git a/torch/csrc/jit/codegen/onednn/prepare_binary.cpp b/torch/csrc/jit/codegen/onednn/prepare_binary.cpp index 795fce27e0083..a4f6d268694e3 100644 --- a/torch/csrc/jit/codegen/onednn/prepare_binary.cpp +++ b/torch/csrc/jit/codegen/onednn/prepare_binary.cpp @@ -47,7 +47,7 @@ static void handleBinaryOpInputs(Node* node) { // 42 : Scalar --> tensor(42.0) : Float([]) auto t = g->insert(aten::as_tensor, {scalar}, {{"dtype", promotedDtype}}); // add dim & stride info to IR - c10::optional t_dim = 1; + std::optional t_dim = 1; auto target_type = TensorTypePtr( TensorType::create(promotedDtype, at::kCPU, t_dim, false)); target_type = target_type->withSizes({1}); @@ -67,7 +67,7 @@ static void handleBinaryOpInputs(Node* node) { // are the same dtype, as oneDNN Graph requires both inputs to have the // same dtype. We'll follow PyTorch's type-promotion rules here. auto second_input_typeptr = node->input(1)->type()->expect(); - c10::optional second_input_type = + std::optional second_input_type = second_input_typeptr->scalarType(); if (second_input_type != c10::nullopt) { // dtype of the second tensor might not be available in the IR diff --git a/torch/csrc/jit/cuda/cuda.h b/torch/csrc/jit/cuda/cuda.h index e8a0d04aa935e..80b2e2a82f788 100644 --- a/torch/csrc/jit/cuda/cuda.h +++ b/torch/csrc/jit/cuda/cuda.h @@ -15,7 +15,7 @@ class CUDAStream final : public CustomClassHolder { public: // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) CUDAStream( - c10::optional device = c10::nullopt, + std::optional device = c10::nullopt, int64_t priority = 0) { c10::DeviceIndex device_index = device.has_value() ? 
device->index() : c10::cuda::current_device(); @@ -155,7 +155,7 @@ void CUDAEvent::wait(c10::intrusive_ptr stream) { TORCH_LIBRARY(cuda, m) { auto stream_class = m.class_("Stream").def( - torch::init, int64_t>(), + torch::init, int64_t>(), "", {torch::arg("device") = c10::nullopt, torch::arg("priority") = 0}); auto event_class = m.class_("Event").def( diff --git a/torch/csrc/jit/frontend/concrete_module_type.cpp b/torch/csrc/jit/frontend/concrete_module_type.cpp index b18917d0dc01f..c15116ac3e244 100644 --- a/torch/csrc/jit/frontend/concrete_module_type.cpp +++ b/torch/csrc/jit/frontend/concrete_module_type.cpp @@ -149,14 +149,14 @@ TypePtr ConcreteModuleType::getJitType() const { return jitType_; } -c10::optional ConcreteModuleType::getPyClass() const { +std::optional ConcreteModuleType::getPyClass() const { if (!data_.pyClass_) { return c10::nullopt; } return data_.pyClass_; } -c10::optional> ConcreteModuleType::findOverloads( +std::optional> ConcreteModuleType::findOverloads( const std::string& name) const { const auto it = data_.overloads_.find(name); if (it != data_.overloads_.end()) { @@ -165,7 +165,7 @@ c10::optional> ConcreteModuleType::findOverloads( return c10::nullopt; } -c10::optional ConcreteModuleType::findFunctionAttribute( +std::optional ConcreteModuleType::findFunctionAttribute( const std::string& name) const { const auto it = data_.functionAttributes_.find(name); if (it != data_.functionAttributes_.end()) { @@ -174,7 +174,7 @@ c10::optional ConcreteModuleType::findFunctionAttribute( return c10::nullopt; } -c10::optional ConcreteModuleType::findBuiltinFunction( +std::optional ConcreteModuleType::findBuiltinFunction( const std::string& name) const { const auto it = data_.builtinFunctions_.find(name); if (it != data_.builtinFunctions_.end()) { @@ -183,7 +183,7 @@ c10::optional ConcreteModuleType::findBuiltinFunction( return c10::nullopt; } -c10::optional ConcreteModuleType::findFailedAttribute( +std::optional ConcreteModuleType::findFailedAttribute( const std::string& name) const { const auto it = data_.failedAttributes_.find(name); if (it != data_.failedAttributes_.end()) { diff --git a/torch/csrc/jit/frontend/concrete_module_type.h b/torch/csrc/jit/frontend/concrete_module_type.h index 22349936687ce..b3c3221253563 100644 --- a/torch/csrc/jit/frontend/concrete_module_type.h +++ b/torch/csrc/jit/frontend/concrete_module_type.h @@ -195,15 +195,15 @@ class VISIBILITY_HIDDEN ConcreteModuleType { static std::shared_ptr fromJitType(TypePtr type); TypePtr getJitType() const; - c10::optional getPyClass() const; + std::optional getPyClass() const; IterableModuleKind getIterableModuleKind() const; - c10::optional> findOverloads( + std::optional> findOverloads( const std::string& name) const; - c10::optional findFunctionAttribute(const std::string& name) const; - c10::optional findBuiltinFunction(const std::string& name) const; + std::optional findFunctionAttribute(const std::string& name) const; + std::optional findBuiltinFunction(const std::string& name) const; std::shared_ptr findSubmoduleConcreteType( const std::string& name) const; - c10::optional findFailedAttribute(const std::string& name) const; + std::optional findFailedAttribute(const std::string& name) const; bool isIgnoredAttribute(const std::string& name) const; // These getters are only here to return things as types that can be diff --git a/torch/csrc/jit/frontend/function_schema_parser.cpp b/torch/csrc/jit/frontend/function_schema_parser.cpp index 4b681055bd075..a651b35786cea 100644 --- 
a/torch/csrc/jit/frontend/function_schema_parser.cpp +++ b/torch/csrc/jit/frontend/function_schema_parser.cpp @@ -149,9 +149,9 @@ struct SchemaParser { auto fake_type = std::move(std::get<0>(p)); auto real_type = std::move(std::get<1>(p)); auto alias_info = std::move(std::get<2>(p)); - c10::optional N; - c10::optional default_value; - c10::optional alias_set; + std::optional N; + std::optional default_value; + std::optional alias_set; std::string name; if (L.nextIf('[')) { // note: an array with a size hint can only occur at the Argument level @@ -162,7 +162,7 @@ struct SchemaParser { auto container = type_parser.parseAliasAnnotation(); if (alias_info) { if (!container) { - container = c10::optional(at::AliasInfo()); + container = std::optional(at::AliasInfo()); container->setIsWrite(alias_info->isWrite()); } container->addContainedType(std::move(*alias_info)); @@ -297,7 +297,7 @@ struct SchemaParser { IValue parseDefaultValue( const c10::Type& arg_type, TypeKind kind, - c10::optional arg_N) { + std::optional arg_N) { auto range = L.cur().range; switch (kind) { case TypeKind::TensorType: diff --git a/torch/csrc/jit/frontend/ir_emitter.cpp b/torch/csrc/jit/frontend/ir_emitter.cpp index 989a6eaf2dfe0..0aca3ea800623 100644 --- a/torch/csrc/jit/frontend/ir_emitter.cpp +++ b/torch/csrc/jit/frontend/ir_emitter.cpp @@ -168,7 +168,7 @@ struct CondValue { CondValue( Value* value, RefinementSet refinements, - c10::optional static_if) + std::optional static_if) : value_(value), refinements_(std::move(refinements)), static_if_(static_if) {} @@ -186,14 +186,14 @@ struct CondValue { const RefinementSet& refinements() const { return refinements_; } - c10::optional staticIf() const { + std::optional staticIf() const { return static_if_; } private: Value* value_; RefinementSet refinements_; - c10::optional + std::optional static_if_; // certain expression cause us to emit a static if statement // this value is present if this is the case. // this is not equivalent to value_ being a constant @@ -283,7 +283,7 @@ struct Environment { } // see if type error has been set for a variable - c10::optional findVariableTypeError(const std::string& name) { + std::optional findVariableTypeError(const std::string& name) { auto runner = this; while (runner->next) { runner = runner->next.get(); @@ -1200,7 +1200,7 @@ struct to_ir { } if (const auto union_type = lhs_value->type()->cast()) { std::vector to_subtract{NoneType::get()}; - c10::optional remaining = + std::optional remaining = union_type->subtractTypeSet(to_subtract); std::vector all_present; if (remaining) { @@ -1228,7 +1228,7 @@ struct to_ir { CondValue v = emitCondExpr(Expr(expr.tree()->trees()[0])); Value* result = emitBuiltinCall( expr.range(), *graph, aten::__not__, {v.value()}, {}); - c10::optional static_if; + std::optional static_if; if (v.staticIf()) { static_if = !*v.staticIf(); } @@ -1294,7 +1294,7 @@ struct to_ir { } } auto expr_out = emitToBool(expr.range(), emitExpr(expr)); - c10::optional static_if = c10::nullopt; + std::optional static_if = c10::nullopt; auto kind = expr_out->node()->kind(); if (kind == aten::is_scripting) { static_if = true; @@ -1559,7 +1559,7 @@ struct to_ir { ? refined_type_hint->cast()->getElementType() : nullptr; - c10::optional unified_elem_type = unifyTypes( + std::optional unified_elem_type = unifyTypes( list_value->type()->expect()->getElementType(), out->type(), /*default_to_union=*/true, @@ -1740,7 +1740,7 @@ struct to_ir { ? 
refined_type_hint->expect()->getValueType() : nullptr; - c10::optional unified_value_type = unifyTypes( + std::optional unified_value_type = unifyTypes( first_generated_value_type, v->type(), /*default_to_union=*/true, @@ -1832,7 +1832,7 @@ struct to_ir { // and the second expr in the false branch, if it's an AND the opposite auto get_const_expr = [&] { return graph->insertConstant(is_or, loc); }; - c10::optional rhs; + std::optional rhs; auto get_continue_expr = [&] { rhs = emitCondExpr(second_expr); return rhs->value(); @@ -1842,8 +1842,8 @@ struct to_ir { // If this is an AND, eval second expression if first expr is True // NOLINTNEXTLINE(cppcoreguidelines-init-variables) Value* new_result; - c10::optional refinements; - c10::optional static_if; + std::optional refinements; + std::optional static_if; if (is_or) { new_result = emitIfExpr(loc, lhs, get_const_expr, get_continue_expr); refinements = lhs.refinements().Or(rhs->refinements()); @@ -2320,8 +2320,8 @@ struct to_ir { const SourceRange& range, const std::function& emit_body, const SugaredValuePtr& iter_val, - c10::optional> targets, - c10::optional cond) { + std::optional> targets, + std::optional cond) { Value* max_trip_count_val = nullptr; if (iter_val != nullptr) { max_trip_count_val = iter_val->len(range, method); @@ -2968,7 +2968,7 @@ struct to_ir { auto outputs = rhs_output->asTuple( rhs_loc, method, - starred_unpack ? c10::nullopt : c10::optional{n_binders}); + starred_unpack ? c10::nullopt : std::optional{n_binders}); if (outputs.size() < n_binders) { throw ErrorReport(tl) << "need " << (starred_unpack ? "at least " : "") << n_binders @@ -3655,7 +3655,7 @@ struct to_ir { auto iterable_value = expr_sv->iter(loc, method); // range should have the same static length as the other iterable - c10::optional iter_static_len = iterable_value->staticLen(); + std::optional iter_static_len = iterable_value->staticLen(); SugaredValuePtr range_sv = std::make_shared( loc, method, range_inputs, iter_static_len); @@ -4454,7 +4454,7 @@ struct to_ir { ? 
refined_type_hint->cast()->getElementType() : nullptr; - c10::optional unified_elem_type = unifyTypeList( + std::optional unified_elem_type = unifyTypeList( types, nowhere, /*default_to_union=*/true, elem_type_hint); if (!refined_type_hint && @@ -4885,7 +4885,7 @@ struct to_ir { return graph->insertConstant(dim, loc); }; std::vector dims(subscript_exprs.size()); - std::vector> exprs( + std::vector> exprs( subscript_exprs.size(), c10::nullopt); auto handle_indexing = [&](const Expr& subscript_expr, @@ -5352,7 +5352,7 @@ struct CompilationUnit::PropertyPair }; CompilationUnit::PropertyPair CompilationUnit::define_property( - const c10::optional& prefix, + const std::optional& prefix, const Property& prop, const ResolverPtr& resolver, const Self* self, @@ -5386,14 +5386,14 @@ CompilationUnit::PropertyPair CompilationUnit::define_property( } std::unique_ptr CompilationUnit::define( - const c10::optional& prefix, + const std::optional& prefix, const Def& def, const ResolverPtr& resolver, const Self* self, const std::unordered_map& function_table, bool shouldMangle, CompilationUnit::FunctionType type, - c10::optional operator_set_version) const { + std::optional operator_set_version) const { TORCH_INTERNAL_ASSERT(resolver); auto _resolver = resolver; if (!self) { @@ -5444,14 +5444,14 @@ std::unique_ptr CompilationUnit::define( } std::vector CompilationUnit::define( - const c10::optional& prefix, + const std::optional& prefix, const std::vector& properties, const std::vector& propResolvers, const std::vector& definitions, const std::vector& defResolvers, const Self* self, bool shouldMangle, - c10::optional operator_set_version) { + std::optional operator_set_version) { TORCH_INTERNAL_ASSERT(definitions.size() == defResolvers.size()); TORCH_INTERNAL_ASSERT(properties.size() == propResolvers.size()); std::vector functions; @@ -5515,7 +5515,7 @@ std::vector CompilationUnit::define( } void CompilationUnit::define_hooks( - const c10::optional& prefix, + const std::optional& prefix, const std::vector& hookDefs, const std::vector& hookResolvers, const std::vector& preHookDefs, @@ -5620,7 +5620,7 @@ void CompilationUnit::define_hooks( } std::vector CompilationUnit::define( - const c10::optional& prefix, + const std::optional& prefix, const std::string& source, const ResolverPtr& resolver, const Self* self) { diff --git a/torch/csrc/jit/frontend/parse_string_literal.h b/torch/csrc/jit/frontend/parse_string_literal.h index 2ca1f150aacdd..5b924864bebd8 100644 --- a/torch/csrc/jit/frontend/parse_string_literal.h +++ b/torch/csrc/jit/frontend/parse_string_literal.h @@ -12,7 +12,7 @@ inline bool isCharCount(char c, const std::string& str, size_t start, int len) { std::count(str.begin() + start, str.begin() + start + len, c) == len; } -inline c10::optional parseOctal(const std::string& str, size_t pos) { +inline std::optional parseOctal(const std::string& str, size_t pos) { //\xxx where x are 0-7 if (pos + 3 >= str.size()) return c10::nullopt; diff --git a/torch/csrc/jit/frontend/parser.cpp b/torch/csrc/jit/frontend/parser.cpp index 02e22547edd44..ae2c98028e071 100644 --- a/torch/csrc/jit/frontend/parser.cpp +++ b/torch/csrc/jit/frontend/parser.cpp @@ -210,7 +210,7 @@ struct ParserImpl { } return prefix; } - c10::optional maybeParseAssignmentOp() { + std::optional maybeParseAssignmentOp() { auto r = L.cur().range; switch (L.cur().kind) { case TK_PLUS_EQ: diff --git a/torch/csrc/jit/frontend/schema_matching.cpp b/torch/csrc/jit/frontend/schema_matching.cpp index 0b4fa8ef65b2e..87ec9992141d8 100644 --- 
a/torch/csrc/jit/frontend/schema_matching.cpp +++ b/torch/csrc/jit/frontend/schema_matching.cpp @@ -247,7 +247,7 @@ static Value* tryMatchArgument( return value; } -c10::optional findInputWithName( +std::optional findInputWithName( const std::string& name, at::ArrayRef kwargs, bool is_aten) { @@ -354,13 +354,13 @@ bool isBlockListedSchema(const FunctionSchema& schema) { return false; } -static c10::optional tryMatchSchema( +static std::optional tryMatchSchema( const FunctionSchema& schema, const SourceRange& loc, Graph& graph, at::ArrayRef args, at::ArrayRef kwargs, - c10::optional self, + std::optional self, std::ostream* failure_messages, bool allow_conversions) { if (isBlockListedSchema(schema)) { @@ -389,7 +389,7 @@ static c10::optional tryMatchSchema( size_t used_args = 0; for (const auto schema_i : c10::irange(schema.arguments().size())) { const auto& arg = schema.arguments()[schema_i]; - c10::optional actual_named_value; + std::optional actual_named_value; if (arg.name() == "self" && self) { actual_named_value = self; self = c10::nullopt; @@ -540,7 +540,7 @@ MatchedSchema matchSchema( Graph& graph, at::ArrayRef args, at::ArrayRef kwargs, - const c10::optional& self) { + const std::optional& self) { std::stringstream failure_messages; if (auto result = tryMatchSchema( schema, @@ -576,7 +576,7 @@ std::pair matchSchemas( Graph& graph, at::ArrayRef args, at::ArrayRef kwargs, - const c10::optional& self, + const std::optional& self, bool render_errors) { TORCH_INTERNAL_ASSERT(!schemas.empty()); // if there is only one schema, we do not need to try without conversions @@ -645,7 +645,7 @@ static Value* emitBuiltinNode( const SourceRange& loc, Graph& graph, Symbol name, - c10::optional version) { + std::optional version) { auto n = graph.insertNode(graph.create(name, matched_schema.inputs, 0)) ->setSourceRange(loc); @@ -681,7 +681,7 @@ Value* emitBuiltinCall( Symbol name, at::ArrayRef args, at::ArrayRef kwargs, - const c10::optional& self) { + const std::optional& self) { const auto& variants = getAllOperatorsFor(name); const auto& builtin_functions = getAllBuiltinFunctionsFor(name); diff --git a/torch/csrc/jit/frontend/schema_matching.h b/torch/csrc/jit/frontend/schema_matching.h index 754ede24597e5..0c69df521df6a 100644 --- a/torch/csrc/jit/frontend/schema_matching.h +++ b/torch/csrc/jit/frontend/schema_matching.h @@ -28,7 +28,7 @@ TORCH_API MatchedSchema matchSchema( Graph& graph, at::ArrayRef args, at::ArrayRef kwargs, - const c10::optional& self = c10::nullopt); + const std::optional& self = c10::nullopt); TORCH_API std::pair matchSchemas( const std::vector& schemas, @@ -36,7 +36,7 @@ TORCH_API std::pair matchSchemas( Graph& graph, at::ArrayRef args, at::ArrayRef kwargs, - const c10::optional& self = c10::nullopt, + const std::optional& self = c10::nullopt, bool render_errors = false); TORCH_API bool convertibleToList( @@ -51,9 +51,9 @@ TORCH_API Value* emitBuiltinCall( Symbol name, at::ArrayRef args, at::ArrayRef kwargs, - const c10::optional& self = c10::nullopt); + const std::optional& self = c10::nullopt); -TORCH_API c10::optional findInputWithName( +TORCH_API std::optional findInputWithName( const std::string& name, at::ArrayRef kwargs, bool is_aten = false); diff --git a/torch/csrc/jit/frontend/schema_type_parser.cpp b/torch/csrc/jit/frontend/schema_type_parser.cpp index 7c4b8ba0cac26..89465bca3f7a3 100644 --- a/torch/csrc/jit/frontend/schema_type_parser.cpp +++ b/torch/csrc/jit/frontend/schema_type_parser.cpp @@ -98,7 +98,7 @@ TypePtr SchemaTypeParser::parseBaseType() { // 
Tensor! // shorthand for Tensor(fresh_identifier!) // Tensor(a! -> a|b) // Tensor is in set a, written to, // and after the write is in set a AND b. -c10::optional SchemaTypeParser::parseAliasAnnotation() { +std::optional SchemaTypeParser::parseAliasAnnotation() { AliasInfo alias_info; if (L.nextIf('(')) { // optional 'alias set annotation' @@ -147,7 +147,7 @@ c10::optional SchemaTypeParser::parseAliasAnnotation() { return alias_info; } -c10::optional SchemaTypeParser::parseTensorDType( +std::optional SchemaTypeParser::parseTensorDType( const std::string& dtype) { #define DEFINE_SCALAR_TYPE(_1, n) {#n, at::ScalarType::n}, @@ -161,7 +161,7 @@ c10::optional SchemaTypeParser::parseTensorDType( return c10::nullopt; } -c10::optional SchemaTypeParser::tryToParseDeviceType() { +std::optional SchemaTypeParser::tryToParseDeviceType() { L.expect('='); const std::string& dev = L.expect(TK_IDENT).text(); @@ -195,7 +195,7 @@ c10::optional SchemaTypeParser::tryToParseDeviceType() { throw ErrorReport(L.cur()) << "cannot parse device type '" << dev << "'\n"; } -c10::optional SchemaTypeParser::tryToParseRequiresGrad() { +std::optional SchemaTypeParser::tryToParseRequiresGrad() { L.expect('='); const std::string& num = L.expect(TK_NUMBER).text(); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ -218,8 +218,8 @@ TypePtr SchemaTypeParser::parseRefinedTensor() { TypePtr ptr; L.expect('('); TypePtr tensor_type; - c10::optional device; - c10::optional requires_grad; + std::optional device; + std::optional requires_grad; // Parse a type with either no ranks, known ranks with sizes, ranks with // unknown sizes, a mix of ranks with known and unknown sizes, or ranks with // known sizes and strides. The type might also have requires_grad and/or @@ -227,7 +227,7 @@ TypePtr SchemaTypeParser::parseRefinedTensor() { // Long(10, 8, 6, strides=[48, 6, 1], requires_grad=0, device=cuda:1) // Float(10, *, 20, device=cuda:1) // Float(requires_grad=1) - std::vector> dims; + std::vector> dims; bool seen_strides = false; std::vector strides; parseList(TK_NOTHING, ',', ')', [&] { @@ -339,16 +339,16 @@ TypePtr SchemaTypeParser::parseRefinedTensor() { return ptr; } -std::pair> SchemaTypeParser::parseType() { +std::pair> SchemaTypeParser::parseType() { auto r = parseFakeAndRealType(); return std::make_pair(std::move(std::get<0>(r)), std::move(std::get<2>(r))); } -std::tuple> +std::tuple> SchemaTypeParser::parseFakeAndRealType() { TypePtr fake_value; TypePtr real_value; - c10::optional alias_info; + std::optional alias_info; // Tuple type if (L.cur().kind == '(') { std::vector types; @@ -465,7 +465,7 @@ SchemaTypeParser::parseFakeAndRealType() { auto container = parseAliasAnnotation(); if (alias_info) { if (!container) { - container = c10::optional(AliasInfo()); + container = std::optional(AliasInfo()); container->setIsWrite(alias_info->isWrite()); } container->addContainedType(std::move(*alias_info)); diff --git a/torch/csrc/jit/frontend/schema_type_parser.h b/torch/csrc/jit/frontend/schema_type_parser.h index c43e4363da386..e8c830cd5ae06 100644 --- a/torch/csrc/jit/frontend/schema_type_parser.h +++ b/torch/csrc/jit/frontend/schema_type_parser.h @@ -13,19 +13,19 @@ using TypePtr = c10::TypePtr; struct TORCH_API SchemaTypeParser { TypePtr parseBaseType(); - c10::optional parseAliasAnnotation(); - std::pair> parseType(); - std::tuple> + std::optional parseAliasAnnotation(); + std::pair> parseType(); + std::tuple> parseFakeAndRealType(); - c10::optional parseTensorDType(const std::string& dtype); + std::optional 
parseTensorDType(const std::string& dtype); TypePtr parseRefinedTensor(); SchemaTypeParser(Lexer& L, bool parse_complete_tensor_types) : complete_tensor_types(parse_complete_tensor_types), L(L) {} private: - c10::optional tryToParseRequiresGrad(); - c10::optional tryToParseDeviceType(); + std::optional tryToParseRequiresGrad(); + std::optional tryToParseDeviceType(); void parseList( int begin, int sep, diff --git a/torch/csrc/jit/frontend/script_type_parser.cpp b/torch/csrc/jit/frontend/script_type_parser.cpp index 245a7496d8f36..9295a3ed4007a 100644 --- a/torch/csrc/jit/frontend/script_type_parser.cpp +++ b/torch/csrc/jit/frontend/script_type_parser.cpp @@ -118,7 +118,7 @@ TypePtr ScriptTypeParser::subscriptToType( } } -c10::optional> ScriptTypeParser::parseBroadcastList( +std::optional> ScriptTypeParser::parseBroadcastList( const Expr& expr) const { // Alias torch.nn._common_types._size_?_t to BroadcastingList?[int] if (expr.kind() == TK_VAR) { @@ -191,7 +191,7 @@ c10::optional> ScriptTypeParser::parseBroadcastList( // gets the base type name given namespaces where the types live // turns torch.Tensor -> Tensor, X -> X -c10::optional ScriptTypeParser::parseBaseTypeName( +std::optional ScriptTypeParser::parseBaseTypeName( const Expr& expr) const { switch (expr.kind()) { case TK_VAR: { @@ -407,7 +407,7 @@ std::vector ScriptTypeParser::parseArgsFromDecl( auto decl_arg = *it; TypePtr type; - c10::optional N = c10::nullopt; + std::optional N = c10::nullopt; if (!decl_arg.type().present()) { // If this param doesn't have a type, default to "tensor" type = TensorType::getInferred(); @@ -421,7 +421,7 @@ std::vector ScriptTypeParser::parseArgsFromDecl( type = parseTypeFromExpr(decl_arg.type().get()); } } - c10::optional default_value = c10::nullopt; + std::optional default_value = c10::nullopt; if (decl_arg.defaultValue().present()) { default_value = *defaults_it++; } diff --git a/torch/csrc/jit/frontend/script_type_parser.h b/torch/csrc/jit/frontend/script_type_parser.h index 3a05af9c598ab..66c963b7d6d3d 100644 --- a/torch/csrc/jit/frontend/script_type_parser.h +++ b/torch/csrc/jit/frontend/script_type_parser.h @@ -21,7 +21,7 @@ class TORCH_API ScriptTypeParser { c10::TypePtr parseTypeFromExpr(const Expr& expr) const; - c10::optional> parseBroadcastList( + std::optional> parseBroadcastList( const Expr& expr) const; c10::TypePtr parseType(const std::string& str); @@ -33,7 +33,7 @@ class TORCH_API ScriptTypeParser { private: c10::TypePtr parseTypeFromExprImpl(const Expr& expr) const; - c10::optional parseBaseTypeName(const Expr& expr) const; + std::optional parseBaseTypeName(const Expr& expr) const; at::TypePtr subscriptToType( const std::string& typeName, const Subscript& subscript) const; diff --git a/torch/csrc/jit/frontend/source_range.cpp b/torch/csrc/jit/frontend/source_range.cpp index 03c366878af99..20ffbfd4601e3 100644 --- a/torch/csrc/jit/frontend/source_range.cpp +++ b/torch/csrc/jit/frontend/source_range.cpp @@ -151,7 +151,7 @@ size_t SourceRangeHasher::operator()(const torch::jit::SourceRange& key) const { std::hash()(key.start()) ^ std::hash()(key.end())); } -c10::optional Source::findSourceRangeThatGenerated( +std::optional Source::findSourceRangeThatGenerated( const SourceRange& range) { if (!gen_ranges_) { return c10::nullopt; diff --git a/torch/csrc/jit/frontend/source_range.h b/torch/csrc/jit/frontend/source_range.h index 72710a94ed210..1f8715ad00969 100644 --- a/torch/csrc/jit/frontend/source_range.h +++ b/torch/csrc/jit/frontend/source_range.h @@ -190,7 +190,7 @@ struct 
TORCH_API Source { explicit Source( c10::string_view text_view, - c10::optional filename = c10::nullopt, + std::optional filename = c10::nullopt, size_t starting_line_no = 0, std::shared_ptr gen_ranges = nullptr, CopiesString copies_str = COPIES_STRING) @@ -210,7 +210,7 @@ struct TORCH_API Source { explicit Source( StringCordView str, - c10::optional filename = c10::nullopt, + std::optional filename = c10::nullopt, size_t starting_line_no = 0, std::shared_ptr gen_ranges = nullptr) : text_view_(std::move(str)), @@ -266,7 +266,7 @@ struct TORCH_API Source { return text_view_.size(); } - c10::optional& filename() { + std::optional& filename() { return filename_; } @@ -274,7 +274,7 @@ struct TORCH_API Source { return starting_line_no_; } - c10::optional findSourceRangeThatGenerated( + std::optional findSourceRangeThatGenerated( const SourceRange& range); ~Source() = default; @@ -291,7 +291,7 @@ struct TORCH_API Source { StringCordView text_view_; - c10::optional filename_; + std::optional filename_; // If filename_ is not present, starting_line_no_ is don't care size_t starting_line_no_; // Starting offsets for lines into the source. e.g. line 0 starts at @@ -358,14 +358,14 @@ struct TORCH_API SourceRange { return ss.str(); } - c10::optional> file_line_col() const { + std::optional> file_line_col() const { if (!source_view_ || !source()->filename()) { return c10::nullopt; } auto lineno = source_view_->lineno_for_offset(start_); auto col_offset = (int)start_ - (int)source_view_->offset_for_line(lineno); - // TODO: c10::optional<>::value returns an rvalue ref so can't use it here?? + // TODO: std::optional<>::value returns an rvalue ref so can't use it here?? return std::make_tuple( source_view_->filename().value_or(""), source_view_->lineno_to_source_lineno(lineno), @@ -381,7 +381,7 @@ struct TORCH_API SourceRange { return !(*this == rhs); } - c10::optional findSourceRangeThatGenerated() const { + std::optional findSourceRangeThatGenerated() const { if (!source_view_) { return c10::nullopt; } diff --git a/torch/csrc/jit/frontend/sugared_value.cpp b/torch/csrc/jit/frontend/sugared_value.cpp index 80b5d27fba079..4b65903529d23 100644 --- a/torch/csrc/jit/frontend/sugared_value.cpp +++ b/torch/csrc/jit/frontend/sugared_value.cpp @@ -283,7 +283,7 @@ std::shared_ptr SimpleValue::attr( std::vector> SimpleValue::asTuple( const SourceRange& loc, GraphFunction& m, - const c10::optional& size_hint) { + const std::optional& size_hint) { static const auto make_simple_value = [](Value* v) -> std::shared_ptr { return std::make_shared(v); @@ -525,7 +525,7 @@ RangeValue::RangeValue( const SourceRange& loc, GraphFunction& m, std::vector inputs, - c10::optional static_len) { + std::optional static_len) { for (const auto i : c10::irange(inputs.size())) { auto typ = inputs[i]->type(); if (!typ->cast()) { @@ -645,7 +645,7 @@ void IterableTree::addChild( const SourceRange& range, GraphFunction& m, const SugaredValuePtr& iter_value) { - c10::optional child_len = iter_value->staticLen(); + std::optional child_len = iter_value->staticLen(); if (children_.empty()) { unroll_length_ = child_len; } else { @@ -748,7 +748,7 @@ std::shared_ptr NamedTupleConstructor::call( std::shared_ptr BuiltinFunction::tryCreate( Symbol symbol, - c10::optional self) { + std::optional self) { for (const std::shared_ptr& op : getAllOperatorsFor(symbol)) { if (!self) { return std::make_shared(symbol, nullptr); diff --git a/torch/csrc/jit/frontend/sugared_value.h b/torch/csrc/jit/frontend/sugared_value.h index 9bf09f4a56e17..97b092cad3ce7 100644 
--- a/torch/csrc/jit/frontend/sugared_value.h +++ b/torch/csrc/jit/frontend/sugared_value.h @@ -67,7 +67,7 @@ struct TORCH_API SugaredValue virtual std::vector> asTuple( const SourceRange& loc, GraphFunction& m, - const c10::optional& size_hint = {}) { + const std::optional& size_hint = {}) { throw ErrorReport(loc) << kind() << " cannot be used as a tuple"; } @@ -121,7 +121,7 @@ struct TORCH_API SugaredValue // function, then we emit an unrolled loop over the variable. This allows us // to support containers of Heterogenous types, like Module Containers & // Tuples - virtual c10::optional staticLen() { + virtual std::optional staticLen() { return c10::nullopt; } @@ -169,7 +169,7 @@ struct TORCH_API SimpleValue : public SugaredValue { std::vector> asTuple( const SourceRange& loc, GraphFunction& m, - const c10::optional& size_hint = {}) override; + const std::optional& size_hint = {}) override; std::shared_ptr attr( const SourceRange& loc, GraphFunction& m, @@ -213,14 +213,14 @@ struct TORCH_API SimpleValue : public SugaredValue { }; struct TORCH_API BuiltinFunction : public SugaredValue { - BuiltinFunction(Symbol symbol, c10::optional self) + BuiltinFunction(Symbol symbol, std::optional self) : symbol(symbol), self(std::move(self)) {} // The symbol of the function (e.g. `aten::relu`). Symbol symbol; // if this is method, then this is the self argument. - c10::optional self; + std::optional self; std::string kind() const override { return "builtin"; } @@ -236,7 +236,7 @@ struct TORCH_API BuiltinFunction : public SugaredValue { // not clear if it is a valid builtin static std::shared_ptr tryCreate( Symbol symbol, - c10::optional self); + std::optional self); }; struct TORCH_API SugaredTupleValue : public SugaredValue { @@ -246,7 +246,7 @@ struct TORCH_API SugaredTupleValue : public SugaredValue { std::vector> asTuple( const SourceRange& loc, GraphFunction& m, - const c10::optional& size_hint = {}) override { + const std::optional& size_hint = {}) override { return tup_; }; @@ -297,7 +297,7 @@ struct TORCH_API SugaredTupleValue : public SugaredValue { // Because this is used to contain SugaredValues of Heterogenous types, // we define staticLen() so that when this is iterated over it is emitted // as an unrolled loop. - c10::optional staticLen() override { + std::optional staticLen() override { return static_cast(tup_.size()); } @@ -305,7 +305,7 @@ struct TORCH_API SugaredTupleValue : public SugaredValue { }; struct TORCH_API BuiltinModule : public SugaredValue { - BuiltinModule(std::string name, c10::optional version = at::nullopt) + BuiltinModule(std::string name, std::optional version = at::nullopt) : name(std::move(name)), version(version) {} std::string kind() const override { @@ -330,7 +330,7 @@ struct TORCH_API BuiltinModule : public SugaredValue { std::string name; // when we add operator versioning, emit this op as it exising at 'version' // if not set, use the latest version - c10::optional version; + std::optional version; }; // Represents a class, analagous to `int` or `dict`. 
Instances of classes, @@ -638,7 +638,7 @@ struct TORCH_API RangeValue : SugaredValue { const SourceRange& loc, GraphFunction& m, std::vector input, - c10::optional static_len = c10::nullopt); + std::optional static_len = c10::nullopt); std::string kind() const override { return "range"; @@ -654,7 +654,7 @@ struct TORCH_API RangeValue : SugaredValue { // When Range is instantiated via enumerate(iterable_with_static_len), // then it takes the static length of the iterable - c10::optional staticLen() override { + std::optional staticLen() override { return static_len_; } @@ -667,7 +667,7 @@ struct TORCH_API RangeValue : SugaredValue { // derivation nodes to simplify the graph and enable more possible // optimizations bool has_only_end_{}; - c10::optional static_len_; + std::optional static_len_; }; // Specialized Tree structure to matched against for special handling @@ -712,7 +712,7 @@ struct TORCH_API IterableTree : SugaredValue { // If this iterable contains a ModuleList or Tuple, then it will have a // static length, and we will emit it as an unrolled for loop. - c10::optional staticLen() override { + std::optional staticLen() override { return unroll_length_; } @@ -730,7 +730,7 @@ struct TORCH_API IterableTree : SugaredValue { TypePtr type_hint = nullptr) override; private: - c10::optional unroll_length_ = c10::nullopt; + std::optional unroll_length_ = c10::nullopt; std::vector children_; }; diff --git a/torch/csrc/jit/frontend/tracer.cpp b/torch/csrc/jit/frontend/tracer.cpp index 823b27f30fcb1..9616e0f83dfbe 100644 --- a/torch/csrc/jit/frontend/tracer.cpp +++ b/torch/csrc/jit/frontend/tracer.cpp @@ -44,7 +44,7 @@ template void genericAddOptionalInput( Node* n, const char* name, - const c10::optional& value) { + const std::optional& value) { if (value) { jit::tracer::addInputs(n, name, *value); } else { @@ -110,7 +110,7 @@ void TracingState::delValue(const IValue& var) { Value* getValueTrace(const IValue& var) { return getTracingState()->getValue(var); } -static Value* getOptTensorValueTrace(const c10::optional& var) { +static Value* getOptTensorValueTrace(const std::optional& var) { return getValueTrace(IValue(var)); } Value* TracingState::getValue(const IValue& var) { @@ -617,7 +617,7 @@ void addInputs(Node* n, const char* name, c10::SymInt value) { addInputs(n, name, value.guard_int(__FILE__, __LINE__)); } -void addInputs(Node* n, const char* name, c10::optional value) { +void addInputs(Node* n, const char* name, std::optional value) { using ArgumentStash = jit::tracer::ArgumentStash; if (ArgumentStash::hasValue(name)) { Value* v = ArgumentStash::popValue(name); @@ -633,13 +633,13 @@ void addInputs(Node* n, const char* name, c10::optional value) { void addInputs(Node* n, const char* name, bool value) { detail::genericAddInput(n, value); } -void addInputs(Node* n, const char* name, const c10::optional& value) { +void addInputs(Node* n, const char* name, const std::optional& value) { detail::genericAddOptionalInput(n, name, value); } void addInputs(Node* n, const char* name, double value) { detail::genericAddInput(n, value); } -void addInputs(Node* n, const char* name, const c10::optional& value) { +void addInputs(Node* n, const char* name, const std::optional& value) { detail::genericAddOptionalInput(n, name, value); } void addInputs(Node* n, const char* name, const at::Scalar& value) { @@ -654,7 +654,7 @@ void addInputs(Node* n, const char* name, const at::Scalar& value) { void addInputs( Node* n, const char* name, - const c10::optional& value) { + const std::optional& value) { 
detail::genericAddOptionalInput(n, name, value); } void addInputs(Node* n, const char* name, const c10::string_view value) { @@ -663,7 +663,7 @@ void addInputs(Node* n, const char* name, const c10::string_view value) { void addInputs( Node* n, const char* name, - const c10::optional& value) { + const std::optional& value) { detail::genericAddOptionalInput(n, name, value); } void addInputs(Node* n, const char* name, const at::Tensor& value) { @@ -672,13 +672,13 @@ void addInputs(Node* n, const char* name, const at::Tensor& value) { void addInputs( Node* n, const char* name, - const c10::optional& value) { + const std::optional& value) { detail::genericAddOptionalInput(n, name, value); } void addInputs( Node* n, const char* name, - const c10::optional& value) { + const std::optional& value) { Graph* g = n->owningGraph(); if (value.has_value() && value->defined()) { @@ -706,31 +706,31 @@ void addInputs(Node* n, const char* name, at::MemoryFormat value) { void addInputs( Node* n, const char* name, - const c10::optional& value) { + const std::optional& value) { detail::genericAddOptionalInput(n, name, value); } void addInputs( Node* n, const char* name, - const c10::optional& value) { + const std::optional& value) { detail::genericAddOptionalInput(n, name, value); } void addInputs( Node* n, const char* name, - const c10::optional& value) { + const std::optional& value) { detail::genericAddOptionalInput(n, name, value); } void addInputs( Node* n, const char* name, - c10::optional value) { + std::optional value) { TORCH_CHECK(false, "NYI: Named tensors are not supported with the tracer"); } void addInputs( Node* n, const char* name, - const c10::optional& value) { + const std::optional& value) { detail::genericAddOptionalInput(n, name, value); } void addInputs( @@ -767,7 +767,7 @@ void addInputs( TORCH_API void addInputs( Node* n, const char* name, - const List>& value) { + const List>& value) { Graph* g = n->owningGraph(); Node* list_node = nullptr; list_node = g->insertNode(g->createList( @@ -813,7 +813,7 @@ void addInputs(Node* n, const char* name, c10::SymIntArrayRef value) { addInputs(n, name, C10_AS_INTARRAYREF_SLOW(value)); } -void addInputs(Node* n, const char* name, c10::optional value) { +void addInputs(Node* n, const char* name, std::optional value) { addInputs( n, name, @@ -825,7 +825,7 @@ void addInputs(Node* n, const char* name, c10::optional value) { void addInputs( Node* n, const char* name, - const c10::optional& opt_value) { + const std::optional& opt_value) { detail::genericAddOptionalInput(n, name, opt_value); } @@ -869,7 +869,7 @@ void addInputs(Node* n, const char* name, ArrayRef value) { void addInputs( Node* n, const char* name, - const c10::optional>& opt_value) { + const std::optional>& opt_value) { detail::genericAddOptionalInput(n, name, opt_value); } @@ -995,7 +995,7 @@ void ensureUniqueIfOutOfPlaced(const char* name, const at::Tensor& tensor) { } void ensureUniqueIfOutOfPlaced( const char* name, - const c10::optional& tensor) { + const std::optional& tensor) { ensureUniqueIfOutOfPlaced(name, tensor.has_value() ? 
*tensor : at::Tensor()); } diff --git a/torch/csrc/jit/frontend/tracer.h b/torch/csrc/jit/frontend/tracer.h index f265d57b649dd..a1cc856a22e19 100644 --- a/torch/csrc/jit/frontend/tracer.h +++ b/torch/csrc/jit/frontend/tracer.h @@ -236,37 +236,37 @@ TORCH_API void addInputs(Node* n, const char* name, c10::SymInt value); TORCH_API void addInputs( Node* n, const char* name, - c10::optional value); + std::optional value); TORCH_API void addInputs(Node* n, const char* name, bool value); TORCH_API void addInputs( Node* n, const char* name, - const c10::optional& value); + const std::optional& value); TORCH_API void addInputs(Node* n, const char* name, double value); TORCH_API void addInputs( Node* n, const char* name, - const c10::optional& value); + const std::optional& value); TORCH_API void addInputs(Node* n, const char* name, const at::Scalar& value); TORCH_API void addInputs( Node* n, const char* name, - const c10::optional& value); + const std::optional& value); TORCH_API void addInputs(Node* n, const char* name, const at::Tensor& value); TORCH_API void addInputs( Node* n, const char* name, - const c10::optional& value); + const std::optional& value); TORCH_API void addInputs(Node* n, const char* name, ArrayRef value); TORCH_API void addInputs(Node* n, const char* name, c10::SymIntArrayRef value); TORCH_API void addInputs( Node* n, const char* name, - c10::optional value); + std::optional value); TORCH_API void addInputs( Node* n, const char* name, - const c10::optional>& value); + const std::optional>& value); TORCH_API void addInputs( Node* n, const char* name, @@ -293,7 +293,7 @@ TORCH_API void addInputs( TORCH_API void addInputs( Node* n, const char* name, - const List>& value); + const List>& value); TORCH_API void addInputs( Node* n, const char* name, @@ -303,7 +303,7 @@ TORCH_API void addInputs(Node* n, const char* name, ArrayRef value); TORCH_API void addInputs( Node* n, const char* name, - const c10::optional>& value); + const std::optional>& value); TORCH_API void addInputs( Node* n, const char* name, @@ -311,7 +311,7 @@ TORCH_API void addInputs( TORCH_API void addInputs( Node* n, const char* name, - const c10::optional& value); + const std::optional& value); TORCH_API void addInputs(Node* n, const char* name, at::Device value); TORCH_API void addInputs(Node* n, const char* name, c10::Stream stream); TORCH_API void addInputs(Node* n, const char* name, at::Layout value); @@ -319,28 +319,28 @@ TORCH_API void addInputs(Node* n, const char* name, at::ScalarType value); TORCH_API void addInputs( Node* n, const char* name, - const c10::optional& value); + const std::optional& value); TORCH_API void addInputs( Node* n, const char* name, - const c10::optional& value); + const std::optional& value); TORCH_API void addInputs( Node* n, const char* name, - const c10::optional& value); + const std::optional& value); TORCH_API void addInputs(Node* n, const char* name, at::MemoryFormat value); TORCH_API void addInputs( Node* n, const char* name, - c10::optional value); + std::optional value); TORCH_API void addInputs( Node* n, const char* name, - const c10::optional& value); + const std::optional& value); TORCH_API void addInputs( Node* n, const char* name, - const c10::optional& value); + const std::optional& value); inline void addInputs( Node* n, @@ -377,7 +377,7 @@ TORCH_API void ensureUniqueIfOutOfPlaced( const at::Tensor& tensor); TORCH_API void ensureUniqueIfOutOfPlaced( const char* name, - const c10::optional& tensor); + const std::optional& tensor); template < typename T, diff --git 
a/torch/csrc/jit/ir/alias_analysis.cpp b/torch/csrc/jit/ir/alias_analysis.cpp index 29953ecd19a3e..f9b2ed5dd7ce9 100644 --- a/torch/csrc/jit/ir/alias_analysis.cpp +++ b/torch/csrc/jit/ir/alias_analysis.cpp @@ -54,7 +54,7 @@ class MutableTypePtrHelper { // of dimension 4 would map to the same type as a Tensor of // dimension 1. This allows us to treat all subclasses of Tensor // as a single, homogenous "Tensor" type. - c10::optional mapTypeToAliasTypeSet(const TypePtr& type) { + std::optional mapTypeToAliasTypeSet(const TypePtr& type) { if (mutable_type_cache_) { const AliasTypeSet* result = mapTypeToBorrowedAliasTypeSet(type); if (result) { @@ -82,7 +82,7 @@ class MutableTypePtrHelper { } private: - c10::optional mapTypeToAliasTypeSetImpl(const TypePtr& type) { + std::optional mapTypeToAliasTypeSetImpl(const TypePtr& type) { switch (type->kind()) { case TypeKind::ListType: case TypeKind::DictType: @@ -1097,7 +1097,7 @@ void AliasDb::analyzeRpcAsync(Node* node) { } namespace { -c10::optional getConstantBooleanInput( +std::optional getConstantBooleanInput( Node* node, const std::string& inputName) { TORCH_INTERNAL_ASSERT( @@ -1893,7 +1893,7 @@ bool AliasDb::mayAliasWildcard(const at::ArrayRef vs) const { vs.begin(), vs.end(), [&](Value* v) { return mayAliasWildcard(v); }); } -c10::optional AliasDb::tryGetOrCreateWildcard(const TypePtr& type) { +std::optional AliasDb::tryGetOrCreateWildcard(const TypePtr& type) { auto maybe_mut_types = mapTypeToAliasTypeSetPtr(type); if (!maybe_mut_types) { return c10::nullopt; @@ -1966,8 +1966,8 @@ Element* AliasDb::getWildcard(const TypePtr& type) const { } // Register `v` as a wildcard value. -c10::optional AliasDb::setWildcard(const Value* v) { - c10::optional maybe_wildcardElement = +std::optional AliasDb::setWildcard(const Value* v) { + std::optional maybe_wildcardElement = tryGetOrCreateWildcard(v->type()); if (!maybe_wildcardElement) { return c10::nullopt; diff --git a/torch/csrc/jit/ir/alias_analysis.h b/torch/csrc/jit/ir/alias_analysis.h index 380943635ea35..c06a4a88080b4 100644 --- a/torch/csrc/jit/ir/alias_analysis.h +++ b/torch/csrc/jit/ir/alias_analysis.h @@ -203,7 +203,7 @@ class AliasDb { * Wildcard methods */ // Register `v` as a wildcard value. - c10::optional setWildcard(const Value* v); + std::optional setWildcard(const Value* v); // Is this a value which will not alias? bool nonAliasingValue(const Value* elem) const; @@ -274,7 +274,7 @@ class AliasDb { // All wildcard Elements (one for each unique mutable type) ska::flat_hash_map wildcardIndex_; Element* getWildcard(const TypePtr& type) const; - c10::optional tryGetOrCreateWildcard(const TypePtr& type); + std::optional tryGetOrCreateWildcard(const TypePtr& type); void addContainedTypesToFreshElement( Element* container_elem, const AliasTypeSet& mut_types); @@ -301,9 +301,9 @@ class AliasDb { // Map of nodes to the memory locations that they write to using TWriteIndex = ska::flat_hash_map; - c10::optional writeIndex_; + std::optional writeIndex_; // Collection of all memory locations that are written to. 
- c10::optional writtenToLocationsIndex_; + std::optional writtenToLocationsIndex_; void buildWrittenToLocationsIndex(); std::unordered_set wildcards_; diff --git a/torch/csrc/jit/ir/constants.cpp b/torch/csrc/jit/ir/constants.cpp index 905088a20d1e2..ef697a5af7680 100644 --- a/torch/csrc/jit/ir/constants.cpp +++ b/torch/csrc/jit/ir/constants.cpp @@ -48,8 +48,8 @@ static bool insertableIValue(const IValue& ivalue) { Value* insertConstant( Graph& g, const IValue& val, - c10::optional loc, - c10::optional scope) { + std::optional loc, + std::optional scope) { auto value = tryInsertConstant(g, val, std::move(loc), std::move(scope)); if (value) { return *value; @@ -59,11 +59,11 @@ Value* insertConstant( } // IValue -> Constant node -c10::optional tryInsertConstant( +std::optional tryInsertConstant( Graph& g, const IValue& val, - c10::optional loc, - c10::optional scope) { + std::optional loc, + std::optional scope) { Node* n = g.create(prim::Constant); if (val.isTensor()) { at::Tensor ref = val.toTensor(); @@ -153,7 +153,7 @@ c10::optional tryInsertConstant( return g.insertNode(n)->output(); } -c10::optional toIValue(const Value* v) { +std::optional toIValue(const Value* v) { if (v->node()->kind() != prim::Constant || v->type()->cast()) { return c10::nullopt; } diff --git a/torch/csrc/jit/ir/constants.h b/torch/csrc/jit/ir/constants.h index d9d11075dd204..118da1e932d9c 100644 --- a/torch/csrc/jit/ir/constants.h +++ b/torch/csrc/jit/ir/constants.h @@ -25,8 +25,8 @@ struct TORCH_API constant_not_supported_error : public std::runtime_error { TORCH_API Value* insertConstant( Graph& g, const IValue& val, - c10::optional loc = c10::nullopt, - c10::optional scope = c10::nullopt); + std::optional loc = c10::nullopt, + std::optional scope = c10::nullopt); // note: prefer g.insertConsant(val, loc) which does exactly the same thing // this function is only declared/defined here because its implementation is @@ -34,11 +34,11 @@ TORCH_API Value* insertConstant( // constants.cpp. // // returns a c10::nullopt if the IValue kind cannot be inserted as a constant -TORCH_API c10::optional tryInsertConstant( +TORCH_API std::optional tryInsertConstant( Graph& g, const IValue& val, - c10::optional loc = c10::nullopt, - c10::optional scope = c10::nullopt); + std::optional loc = c10::nullopt, + std::optional scope = c10::nullopt); //////////////////////////////////////////////////////////////////////////////// // Helper for retrieving constants @@ -46,12 +46,12 @@ TORCH_API c10::optional tryInsertConstant( // attempt to convert a (possibly constant) Value* into an interpreter value // (IValue). 
returns c10::nullopt if the Value* was not constant -TORCH_API c10::optional toIValue(const Value* v); +TORCH_API std::optional toIValue(const Value* v); // if a value is a constant then try to turn into type T using the // same rules as the interpreter template -c10::optional constant_as(const Value* v) { +std::optional constant_as(const Value* v) { if (auto ivalue = toIValue(v)) { return ivalue->to(); } diff --git a/torch/csrc/jit/ir/ir.cpp b/torch/csrc/jit/ir/ir.cpp index a320570de5ca9..e288f78875c62 100644 --- a/torch/csrc/jit/ir/ir.cpp +++ b/torch/csrc/jit/ir/ir.cpp @@ -418,7 +418,7 @@ std::ostream& operator<<(std::ostream& out, const Graph& g) { static void checkSameDevice(const Node* node) { bool has_device = false; - c10::optional device = c10::nullopt; + std::optional device = c10::nullopt; auto checkValue = [&](const Value* v) { if (TensorTypePtr type = v->type()->cast()) { if (type->device() && !has_device) { @@ -984,7 +984,7 @@ static size_t findArgument(const FunctionSchema& the_schema, Symbol name) { return findArgument(the_schema, unqualName); } -c10::optional Node::get(Symbol name) const { +std::optional Node::get(Symbol name) const { return toIValue(namedInput(name)); } @@ -1686,7 +1686,7 @@ Value* Graph::insert( Symbol opname, at::ArrayRef args, at::ArrayRef kwargs, - const c10::optional& range) { + const std::optional& range) { return emitBuiltinCall( range.value_or(fakeRange()), *this, opname, args, kwargs); } @@ -1993,8 +1993,8 @@ Node* Graph::createClone( Value* Graph::insertConstant( const IValue& val, - c10::optional loc, - c10::optional scope) { + std::optional loc, + std::optional scope) { return jit::insertConstant(*this, val, std::move(loc), std::move(scope)); } @@ -2051,14 +2051,14 @@ void inlineCallStackOfNode( std::unordered_map& new_cs_entries, Function* callee, Node* to_replace, - c10::optional m_info); + std::optional m_info); static void inlineCallStackOfBlock( Block* b, std::unordered_map& new_cs_entries, Function* callee, Node* to_replace, - c10::optional m_info) { + std::optional m_info) { for (auto n : b->nodes()) { inlineCallStackOfNode(n, new_cs_entries, callee, to_replace, m_info); } @@ -2069,7 +2069,7 @@ void inlineCallStackOfNode( std::unordered_map& new_cs_entries, Function* callee, Node* to_replace, - c10::optional m_info) { + std::optional m_info) { auto new_node_cs = new_node->callstack(); InlinedCallStack* raw_callstack_ptr = @@ -2108,7 +2108,7 @@ std::vector inlineCallTo( std::unordered_map new_callstack_entries; - c10::optional module_instance_info = c10::nullopt; + std::optional module_instance_info = c10::nullopt; if (to_replace->kind() == prim::CallMethod) { auto class_type_ptr = to_replace->input(0)->type()->cast(); if (to_replace->input(0)->node()->kind() == prim::GetAttr) { diff --git a/torch/csrc/jit/ir/ir.h b/torch/csrc/jit/ir/ir.h index 4781b15229cbb..549f4a11001f5 100644 --- a/torch/csrc/jit/ir/ir.h +++ b/torch/csrc/jit/ir/ir.h @@ -332,9 +332,9 @@ struct TORCH_API Node { std::vector blocks_; Graph* graph_; Block* owning_block_; - c10::optional source_range_; + std::optional source_range_; ScopePtr scope_; - c10::optional callstack_; + std::optional callstack_; // Assumes FunctionSchemas are persistent, so we don't manage their lifetime. 
// This field is effective a cache that's populated on attribute lookups and // invalidated every time we perform an operation that could potentially @@ -348,7 +348,7 @@ struct TORCH_API Node { // is changed, we need to rely on this name // to retrieve old schemas to successfully apply upgraders // for this operator. - c10::optional historic_schema_name_ = c10::nullopt; + std::optional historic_schema_name_ = c10::nullopt; protected: Node(Graph* graph_, NodeKind kind_); // defined after graph @@ -373,7 +373,7 @@ struct TORCH_API Node { return wrap_; } - const c10::optional getHistoricSchemaName() { + const std::optional getHistoricSchemaName() { return historic_schema_name_; } @@ -442,7 +442,7 @@ struct TORCH_API Node { return this; } - c10::optional callstack() const { + std::optional callstack() const { return callstack_; } void setCallStack(InlinedCallStackPtr cs) { @@ -527,10 +527,10 @@ struct TORCH_API Node { Value* namedInput(const std::string& unqualName) const; Value* namedInput(Symbol name) const; - c10::optional get(Symbol name) const; + std::optional get(Symbol name) const; template - c10::optional get(Symbol name) const { + std::optional get(Symbol name) const { if (auto v = get(name)) { return v->template to(); } @@ -1208,7 +1208,7 @@ struct Graph : std::enable_shared_from_this { Node* insert_before_; int64_t predicted_insert_count_ = 0; - c10::optional op_version_; + std::optional op_version_; public: Graph(ScopePtr scope_root = c10::make_intrusive()) @@ -1261,11 +1261,11 @@ struct Graph : std::enable_shared_from_this { return current_scope_; } - void set_op_version(c10::optional version) { + void set_op_version(std::optional version) { op_version_ = version; } - c10::optional get_op_version() { + std::optional get_op_version() { return op_version_; } @@ -1368,8 +1368,8 @@ struct Graph : std::enable_shared_from_this { // Insert constant IValue into the graph. 
TORCH_API Value* insertConstant( const IValue& val, - c10::optional loc = c10::nullopt, - c10::optional scope = c10::nullopt); + std::optional loc = c10::nullopt, + std::optional scope = c10::nullopt); // Schema-driven insert: // This inserts a node into the graph with inputs determined from args and @@ -1382,7 +1382,7 @@ struct Graph : std::enable_shared_from_this { Symbol opname, at::ArrayRef args, at::ArrayRef kwargs = {}, - const c10::optional& range = {}); + const std::optional& range = {}); Node* appendNode(Node* n) { return block_->appendNode(n); @@ -1591,7 +1591,7 @@ struct TORCH_API PythonOp : public Node { // recover the autograd.Function instance, if this PythonOp's function // was originally SomeFunction.apply // used in ONNX for discovering symbolics - virtual c10::optional autogradFunction() const = 0; + virtual std::optional autogradFunction() const = 0; virtual void lint_python() const = 0; }; @@ -1730,7 +1730,7 @@ struct OperatorMap { return n->maybeOperator() && contains(n->getOperator()); } - c10::optional find(const Operator& op) { + std::optional find(const Operator& op) { const auto it = map.find(Symbol::fromQualString(op.schema().name())); if (it == map.end()) { return c10::nullopt; @@ -1806,7 +1806,7 @@ struct FunctionSchemaMap { return false; } - c10::optional find(const FunctionSchema& schema) const { + std::optional find(const FunctionSchema& schema) const { const auto it = map.find(Symbol::fromQualString(schema.name())); if (it == map.end()) { return c10::nullopt; diff --git a/torch/csrc/jit/ir/irparser.cpp b/torch/csrc/jit/ir/irparser.cpp index c37988e322a8d..06e0a66fa055c 100644 --- a/torch/csrc/jit/ir/irparser.cpp +++ b/torch/csrc/jit/ir/irparser.cpp @@ -169,7 +169,7 @@ void IRParser::parseOperatorOutputs(std::vector* outs) { ParsedLiteral IRParser::parseScalarLiteral(Node* n) { auto token = L.cur(); std::string str; - std::pair> type_alias; + std::pair> type_alias; ParsedLiteral r; switch (token.kind) { case TK_STRINGLITERAL: diff --git a/torch/csrc/jit/ir/named_value.h b/torch/csrc/jit/ir/named_value.h index ead3d73e9a86b..277e7f2699695 100644 --- a/torch/csrc/jit/ir/named_value.h +++ b/torch/csrc/jit/ir/named_value.h @@ -73,8 +73,8 @@ struct NamedValue { at::TypePtr type() const; private: - c10::optional loc_; - c10::optional name_; + std::optional loc_; + std::optional name_; Value* value_{nullptr}; // only valid if value_ == nullptr; IValue ivalue_; diff --git a/torch/csrc/jit/ir/scope.cpp b/torch/csrc/jit/ir/scope.cpp index dfb1ef36f359e..3ff1c22b8d119 100644 --- a/torch/csrc/jit/ir/scope.cpp +++ b/torch/csrc/jit/ir/scope.cpp @@ -113,7 +113,7 @@ InlinedCallStack::InlinedCallStack(Function* fn, SourceRange source_range) InlinedCallStack::InlinedCallStack( Function* fn, SourceRange source_range, - c10::optional module_instance_info) + std::optional module_instance_info) : fn_(fn), fn_name_(fn_ ? 
fn_->name() : ""), source_range_(std::move(source_range)), @@ -122,7 +122,7 @@ InlinedCallStack::InlinedCallStack( InlinedCallStack::InlinedCallStack( Function* fn, SourceRange source_range, - c10::optional module_instance_info, + std::optional module_instance_info, std::string& function_name) : fn_(fn), fn_name_(std::move(function_name)), @@ -142,7 +142,7 @@ InlinedCallStack::InlinedCallStack( InlinedCallStackPtr callee, Function* fn, SourceRange source_range, - c10::optional module_instance_info, + std::optional module_instance_info, std::string& function_name) : callee_(std::move(callee)), fn_(fn), @@ -154,22 +154,22 @@ InlinedCallStack::InlinedCallStack( InlinedCallStackPtr callee, Function* fn, SourceRange source_range, - c10::optional module_instance_info) + std::optional module_instance_info) : callee_(std::move(callee)), fn_(fn), fn_name_(fn_ ? fn_->name() : ""), source_range_(std::move(source_range)), module_instance_info_(std::move(module_instance_info)) {} -c10::optional InlinedCallStack::callee() const { +std::optional InlinedCallStack::callee() const { return callee_; } -void InlinedCallStack::setCallee(c10::optional callee) { +void InlinedCallStack::setCallee(std::optional callee) { callee_ = std::move(callee); } -c10::optional InlinedCallStack::module_instance() const { +std::optional InlinedCallStack::module_instance() const { return module_instance_info_; } @@ -187,7 +187,7 @@ const std::string& InlinedCallStack::function_name() const { std::vector InlinedCallStack::vec() { std::vector r; - c10::optional current = intrusive_from_this(); + std::optional current = intrusive_from_this(); while (current) { r.emplace_back( (*current)->fn_, diff --git a/torch/csrc/jit/ir/scope.h b/torch/csrc/jit/ir/scope.h index 423bbbd3ab2e1..5449803032238 100644 --- a/torch/csrc/jit/ir/scope.h +++ b/torch/csrc/jit/ir/scope.h @@ -120,11 +120,11 @@ struct ModuleInstanceInfo { */ using InlinedCallStackPtr = c10::intrusive_ptr; using InlinedCallStackEntry = - std::tuple>; + std::tuple>; struct TORCH_API InlinedCallStack : public c10::intrusive_ptr_target { private: - c10::optional callee_; + std::optional callee_; Function* fn_; // Reason for fn_name_ even though we have fn_ // Serialized callstack is used in circustmances where InlinedCallstack @@ -137,7 +137,7 @@ struct TORCH_API InlinedCallStack : public c10::intrusive_ptr_target { const std::string fn_name_; SourceRange source_range_; InlinedCallStackPtr intrusive_from_this(); - c10::optional module_instance_info_; + std::optional module_instance_info_; public: // Constructor for a leaf callstack node. @@ -147,13 +147,13 @@ struct TORCH_API InlinedCallStack : public c10::intrusive_ptr_target { InlinedCallStack( Function* fn, SourceRange source_range, - c10::optional module_instance_info); + std::optional module_instance_info); // Constructor for a leaf callstack node. InlinedCallStack( Function* fn, SourceRange source_range, - c10::optional module_instance_info, + std::optional module_instance_info, std::string& function_name); // Constructor for an inner callstack node. @@ -166,20 +166,20 @@ struct TORCH_API InlinedCallStack : public c10::intrusive_ptr_target { InlinedCallStackPtr callee, Function* fn, SourceRange source_range, - c10::optional module_instance_info); + std::optional module_instance_info); InlinedCallStack( InlinedCallStackPtr callee, Function* fn, SourceRange source_range, - c10::optional module_instance_info, + std::optional module_instance_info, std::string& function_name); // Return next element in the callstack list. 
- c10::optional callee() const; + std::optional callee() const; // Return module instance associated with the current element. - c10::optional module_instance() const; + std::optional module_instance() const; // Returns the source range of the node SourceRange source_range() const; @@ -191,7 +191,7 @@ struct TORCH_API InlinedCallStack : public c10::intrusive_ptr_target { // Return callstack as a vector of [Function, SourceRange] pairs. std::vector vec(); - void setCallee(c10::optional); + void setCallee(std::optional); bool operator==(const InlinedCallStack& rhs) const { // No need to compare fn_, since source_range equivalence check diff --git a/torch/csrc/jit/mobile/compatibility/model_compatibility.cpp b/torch/csrc/jit/mobile/compatibility/model_compatibility.cpp index de120c8fa1e87..1980023e8fc4a 100644 --- a/torch/csrc/jit/mobile/compatibility/model_compatibility.cpp +++ b/torch/csrc/jit/mobile/compatibility/model_compatibility.cpp @@ -31,7 +31,7 @@ using caffe2::serialize::ReadAdapterInterface; c10::IValue readArchive( const std::string& archive_name, PyTorchStreamReader& stream_reader) { - c10::optional device; + std::optional device; std::shared_ptr compilation_unit = std::make_shared(); diff --git a/torch/csrc/jit/mobile/compatibility/runtime_compatibility.cpp b/torch/csrc/jit/mobile/compatibility/runtime_compatibility.cpp index b3516e5bafc80..1cda81045b81a 100644 --- a/torch/csrc/jit/mobile/compatibility/runtime_compatibility.cpp +++ b/torch/csrc/jit/mobile/compatibility/runtime_compatibility.cpp @@ -53,7 +53,7 @@ std::unordered_map _get_runtime_ops_and_info() { for (auto& op : dispatcherOperators) { // grab schema const auto op_handle = c10::Dispatcher::singleton().findOp(op); - c10::optional num_schema_args; + std::optional num_schema_args; if (op_handle->hasSchema()) { num_schema_args = op_handle->schema().arguments().size(); } diff --git a/torch/csrc/jit/mobile/compatibility/runtime_compatibility.h b/torch/csrc/jit/mobile/compatibility/runtime_compatibility.h index 13adf04c0cc9d..2e65f1f38bd8d 100644 --- a/torch/csrc/jit/mobile/compatibility/runtime_compatibility.h +++ b/torch/csrc/jit/mobile/compatibility/runtime_compatibility.h @@ -13,7 +13,7 @@ namespace jit { // Struct storing metadata of an operator that can be useful for versioning struct OperatorInfo { // The number of arguments within the schema of the op - c10::optional num_schema_args; + std::optional num_schema_args; }; struct RuntimeCompatibilityInfo { diff --git a/torch/csrc/jit/mobile/flatbuffer_loader.cpp b/torch/csrc/jit/mobile/flatbuffer_loader.cpp index f906f4e2b9eb4..239deb76d2673 100644 --- a/torch/csrc/jit/mobile/flatbuffer_loader.cpp +++ b/torch/csrc/jit/mobile/flatbuffer_loader.cpp @@ -359,7 +359,7 @@ std::unique_ptr FlatbufferLoader::parseFunction( (operator_version < caffe2::serialize::kProducedFileFormatVersion); for (const auto* op : *method->operators()) { - c10::optional num_args = c10::nullopt; + std::optional num_args = c10::nullopt; if (op->num_args_serialized() > -1) { num_args = op->num_args_serialized(); } @@ -752,7 +752,7 @@ void FlatbufferLoader::extractJitSourceAndConstants( mobile::Module parse_and_initialize_mobile_module( void* data, size_t size, - c10::optional, + std::optional, ExtraFilesMap* extra_files, bool should_copy_tensor_memory) { // TODO(T128189662): If not copying, enforce that data is aligned to @@ -781,7 +781,7 @@ mobile::Module parse_and_initialize_mobile_module( mobile::Module parse_and_initialize_mobile_module( std::shared_ptr data, size_t size, - c10::optional device, 
+ std::optional device, ExtraFilesMap* extra_files) { mobile::Module m = parse_and_initialize_mobile_module( data.get(), @@ -798,7 +798,7 @@ mobile::Module parse_and_initialize_mobile_module_for_jit( size_t size, ExtraFilesMap& jit_sources, std::vector& jit_constants, - c10::optional, + std::optional, ExtraFilesMap* extra_files) { TORCH_CHECK( mobile::serialization::ModuleBufferHasIdentifier(data), "Format error"); @@ -825,7 +825,7 @@ mobile::Module parse_and_initialize_mobile_module_for_jit( mobile::Module load_mobile_module_from_file( const std::string& filename, - c10::optional device, + std::optional device, ExtraFilesMap* extra_files) { auto [data, size] = get_file_content(filename.c_str()); return parse_and_initialize_mobile_module( @@ -885,7 +885,7 @@ mobile::ModuleInfo get_module_info_from_flatbuffer(char* flatbuffer_content) { mobile::Module load_mobile_module_from_stream_with_copy( std::istream& in, - c10::optional device, + std::optional device, ExtraFilesMap* extra_files) { auto [data, size] = get_stream_content(in); return parse_and_initialize_mobile_module( @@ -895,7 +895,7 @@ mobile::Module load_mobile_module_from_stream_with_copy( mobile::Module parse_flatbuffer_no_object( std::shared_ptr data, size_t size, - c10::optional device) { + std::optional device) { (void)device; (void)size; diff --git a/torch/csrc/jit/mobile/flatbuffer_loader.h b/torch/csrc/jit/mobile/flatbuffer_loader.h index f29fe5b2e4942..9ac9636f3f14b 100644 --- a/torch/csrc/jit/mobile/flatbuffer_loader.h +++ b/torch/csrc/jit/mobile/flatbuffer_loader.h @@ -58,7 +58,7 @@ using ExtraFilesMap = std::unordered_map; TORCH_API mobile::Module parse_and_initialize_mobile_module( void* data, size_t size, // of `data`, in bytes. - c10::optional device = c10::nullopt, + std::optional device = c10::nullopt, ExtraFilesMap* extra_files = nullptr, bool should_copy_tensor_memory = false); @@ -74,7 +74,7 @@ TORCH_API mobile::Module parse_and_initialize_mobile_module( TORCH_API mobile::Module parse_and_initialize_mobile_module( std::shared_ptr data, size_t size, // of `data`, in bytes. - c10::optional device = c10::nullopt, + std::optional device = c10::nullopt, ExtraFilesMap* extra_files = nullptr); // Parse a mobile::Module from raw bytes, also returning JIT-related metadata. @@ -87,7 +87,7 @@ TORCH_API mobile::Module parse_and_initialize_mobile_module_for_jit( size_t size, // of `data`, in bytes. ExtraFilesMap& jit_sources, std::vector& jit_constants, - c10::optional device = c10::nullopt, + std::optional device = c10::nullopt, ExtraFilesMap* extra_files = nullptr); // Load a mobile::Module from a filepath. @@ -100,7 +100,7 @@ TORCH_API mobile::Module parse_and_initialize_mobile_module_for_jit( // directly. 
TORCH_API mobile::Module load_mobile_module_from_file( const std::string& filename, - c10::optional device = c10::nullopt, + std::optional device = c10::nullopt, ExtraFilesMap* extra_files = nullptr); TORCH_API uint64_t get_bytecode_version(std::istream& in); @@ -114,18 +114,18 @@ TORCH_API mobile::ModuleInfo get_module_info_from_flatbuffer( // its entirity to a buffer TORCH_API mobile::Module load_mobile_module_from_stream_with_copy( std::istream& in, - c10::optional device = c10::nullopt, + std::optional device = c10::nullopt, ExtraFilesMap* extra_files = nullptr); TORCH_API mobile::Module parse_flatbuffer_no_object( std::shared_ptr data, size_t size, - c10::optional device); + std::optional device); TORCH_API mobile::Module parse_and_initialize_mobile_module( void* data, size_t, - c10::optional, + std::optional, ExtraFilesMap* extra_files, bool should_copy_tensor_memory); diff --git a/torch/csrc/jit/mobile/frame.h b/torch/csrc/jit/mobile/frame.h index 2db12f7d19374..45c51fef0085e 100644 --- a/torch/csrc/jit/mobile/frame.h +++ b/torch/csrc/jit/mobile/frame.h @@ -32,11 +32,11 @@ class Frame { return code_.instructions_.at(pc_); } - c10::optional getDebugHandle() const { + std::optional getDebugHandle() const { return getDebugHandle(pc_); } - c10::optional getDebugHandle(size_t pc) const { + std::optional getDebugHandle(size_t pc) const { if (pc >= code_.debug_handles_.size()) { return {}; } diff --git a/torch/csrc/jit/mobile/function.cpp b/torch/csrc/jit/mobile/function.cpp index b410bf7765cc7..36f19fb1fac41 100644 --- a/torch/csrc/jit/mobile/function.cpp +++ b/torch/csrc/jit/mobile/function.cpp @@ -47,7 +47,7 @@ void Function::append_instruction(OpCode op, int X, int N) { void Function::append_operator( const std::string& name, const std::string& overload_name, - const c10::optional& num_specified_args) { + const std::optional& num_specified_args) { // Keep the original opname in code_ code_.op_names_.emplace_back(name, overload_name); code_.operator_input_sizes_.emplace_back(num_specified_args.value_or(-1)); @@ -71,8 +71,8 @@ bool Function::initialize_operators(bool should_check_operators) { for (unsigned i = 0; i < code_.op_names_.size(); i++) { const auto& opname = code_.op_names_[i]; int num_args = code_.operator_input_sizes_[i]; - c10::optional num_specified_args = - num_args < 0 ? c10::nullopt : c10::optional(num_args); + std::optional num_specified_args = + num_args < 0 ? 
c10::nullopt : std::optional(num_args); auto func = makeOperatorFunction(opname, num_specified_args); if (!func.has_value()) { unsupported_op_names.insert(operator_str(opname)); @@ -165,9 +165,9 @@ const std::vector& Function::getExceptionDebugHandles() const { return getInterpretersExceptionDebugHandles(); } -c10::optional> makeOperatorFunction( +std::optional> makeOperatorFunction( c10::OperatorName opname, - c10::optional num_specified_args) { + std::optional num_specified_args) { std::function fn; const auto full_name = c10::toString(opname); const std::vector* pArgs = nullptr; diff --git a/torch/csrc/jit/mobile/function.h b/torch/csrc/jit/mobile/function.h index fb6f77fa64d76..42065d4a1c1b0 100644 --- a/torch/csrc/jit/mobile/function.h +++ b/torch/csrc/jit/mobile/function.h @@ -37,7 +37,7 @@ class TORCH_API Function : public torch::jit::Function { void append_operator( const std::string& name, const std::string& overload_name, - const c10::optional& num_specified_args); + const std::optional& num_specified_args); void append_constant(const c10::IValue& constant); void append_type(const c10::TypePtr& type); void append_function(mobile::Function& func); @@ -75,9 +75,9 @@ class TORCH_API Function : public torch::jit::Function { at::optional schema_; // (byte-code version 4+) }; -c10::optional> makeOperatorFunction( +std::optional> makeOperatorFunction( c10::OperatorName opname, - c10::optional num_specified_args); + std::optional num_specified_args); TORCH_API std::string operator_str(const c10::OperatorName& opname); diff --git a/torch/csrc/jit/mobile/import.cpp b/torch/csrc/jit/mobile/import.cpp index a82e7d69366ec..96ff6c88779d9 100644 --- a/torch/csrc/jit/mobile/import.cpp +++ b/torch/csrc/jit/mobile/import.cpp @@ -191,12 +191,12 @@ class BytecodeDeserializer final { explicit BytecodeDeserializer( std::unique_ptr reader, uint64_t module_load_options = 0); - mobile::Module deserialize(c10::optional device); + mobile::Module deserialize(std::optional device); mobile::Module deserialize( - c10::optional device, + std::optional device, ExtraFilesMap& extra_files); void deserialize_only_extra( - c10::optional device, + std::optional device, ExtraFilesMap& extra_files); private: @@ -204,7 +204,7 @@ class BytecodeDeserializer final { void init_upgrader(mobile::Function* function); void parseMethods( c10::ivalue::TupleElements&& vals, - c10::optional&& debug_handles, + std::optional&& debug_handles, mobile::CompilationUnit& mcu); c10::IValue readArchive( const std::string& archive_name, @@ -217,7 +217,7 @@ class BytecodeDeserializer final { std::shared_ptr compilation_unit_; std::unordered_set imported_libs_; std::unique_ptr reader_{}; - c10::optional device_; + std::optional device_; uint64_t module_load_options_; // From `version` or `.data/version` in model.ptl and it's compute // dynamically. It's used for finding the minimum required runtime to run all @@ -305,7 +305,7 @@ void BytecodeDeserializer::init_upgrader(mobile::Function* function) { void BytecodeDeserializer::parseMethods( c10::ivalue::TupleElements&& vals, - c10::optional&& debug_handles, + std::optional&& debug_handles, mobile::CompilationUnit& mcu) { TORCH_CHECK(!vals.empty(), "Bytecode has no elements. 
"); // Initialized with the version number when kProducedBytecodeVersion was @@ -417,7 +417,7 @@ void BytecodeDeserializer::parseMethods( } void BytecodeDeserializer::deserialize_only_extra( - c10::optional device, + std::optional device, ExtraFilesMap& extra_files) { device_ = device; for (const auto& kv : extra_files) { @@ -431,14 +431,14 @@ void BytecodeDeserializer::deserialize_only_extra( } mobile::Module BytecodeDeserializer::deserialize( - c10::optional device, + std::optional device, ExtraFilesMap& extra_files) { deserialize_only_extra(device, extra_files); return deserialize(device); } mobile::Module BytecodeDeserializer::deserialize( - c10::optional device) { + std::optional device) { device_ = device; auto mcu = std::make_shared(); @@ -453,7 +453,7 @@ mobile::Module BytecodeDeserializer::deserialize( // auto bvals = std::move(readArchive("bytecode", mcu).toTupleRef()).elements(); - c10::optional debug_handles; + std::optional debug_handles; bool has_debug_handles{false}; if (reader_->hasRecord("mobile_debug_handles.pkl")) { debug_handles = @@ -504,7 +504,7 @@ c10::IValue BytecodeDeserializer::readArchive( mobile::Module _load_for_mobile_impl( std::unique_ptr rai, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, uint64_t module_load_options) { auto observer = torch::observerConfig().getModuleObserver(); @@ -577,7 +577,7 @@ mobile::Module _load_for_mobile_impl( mobile::Module _load_mobile_from_bytes( const std::shared_ptr& data, size_t size, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, uint64_t module_load_options) { TORCH_CHECK(size >= kFileFormatHeaderSize, "Format error"); @@ -603,28 +603,28 @@ mobile::Module _load_mobile_from_bytes( mobile::Module _load_for_mobile( std::istream& in, - c10::optional device) { + std::optional device) { ExtraFilesMap extra_files; return _load_for_mobile(in, device, extra_files); } mobile::Module _load_for_mobile( const std::string& filename, - c10::optional device) { + std::optional device) { ExtraFilesMap extra_files; return _load_for_mobile(filename, device, extra_files); } mobile::Module _load_for_mobile( std::unique_ptr rai, - c10::optional device) { + std::optional device) { ExtraFilesMap extra_files; return _load_for_mobile(std::move(rai), device, extra_files); } mobile::Module _load_for_mobile( std::istream& in, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, uint64_t module_load_options) { if (getFileFormat(in) == FileFormat::FlatbufferFileFormat) { @@ -640,7 +640,7 @@ mobile::Module _load_for_mobile( mobile::Module _load_for_mobile( const std::string& filename, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files) { return _load_for_mobile( filename, device, extra_files, kDefaultMobileLoadOptions); @@ -648,7 +648,7 @@ mobile::Module _load_for_mobile( mobile::Module _load_for_mobile( const std::string& filename, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, uint64_t module_load_options) { auto format = getFileFormat(filename); @@ -666,7 +666,7 @@ mobile::Module _load_for_mobile( TORCH_API mobile::Module _load_for_mobile( std::unique_ptr rai, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, uint64_t module_load_options) { // TODO optimize file read for non-flatbuffer models @@ -677,7 +677,7 @@ TORCH_API mobile::Module _load_for_mobile( void _load_extra_only_for_mobile( const std::string& filename, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files) { 
auto observer = torch::observerConfig().getModuleObserver(); // NOLINTNEXTLINE(clang-analyzer-security.insecureAPI.rand) diff --git a/torch/csrc/jit/mobile/import.h b/torch/csrc/jit/mobile/import.h index 26bc112f9a760..77a801e62571d 100644 --- a/torch/csrc/jit/mobile/import.h +++ b/torch/csrc/jit/mobile/import.h @@ -22,38 +22,38 @@ constexpr const char* kArchiveNameVersion = "version"; // into a mobile::Module object. TORCH_API mobile::Module _load_for_mobile( std::istream& in, - c10::optional device, + std::optional device, ExtraFilesMap& extra_file, uint64_t module_load_options = kDefaultMobileLoadOptions); TORCH_API mobile::Module _load_for_mobile( const std::string& filename, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files); TORCH_API mobile::Module _load_for_mobile( std::unique_ptr rai, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, uint64_t module_load_options = kDefaultMobileLoadOptions); TORCH_API mobile::Module _load_for_mobile( const std::string& filename, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, uint64_t module_load_options); TORCH_API mobile::Module _load_for_mobile( std::istream& in, - c10::optional device = c10::nullopt); + std::optional device = c10::nullopt); TORCH_API mobile::Module _load_for_mobile( const std::string& filename, - c10::optional device = c10::nullopt); + std::optional device = c10::nullopt); TORCH_API mobile::Module _load_for_mobile( std::unique_ptr rai, - c10::optional device = c10::nullopt); + std::optional device = c10::nullopt); /** * Load only the contents of the "extra/" files whose names are @@ -69,7 +69,7 @@ TORCH_API mobile::Module _load_for_mobile( */ void _load_extra_only_for_mobile( const std::string& filename, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files); // Currently used by both mobile/import.cpp and model_compatibility.cpp. 
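[Editor's note, not part of the patch] The import.h hunks above only respell the optional type in the mobile loader entry points; the call pattern is unchanged. Below is a minimal, hypothetical caller sketch showing those entry points once the device parameter is std::optional<at::Device> (the template arguments are elided in the extracted diff; at::Device matches the upstream declarations). Header paths and function names are taken from the diff itself; the example assumes c10::optional remains an alias of std::optional in this tree, so c10::nullopt and std::nullopt stay interchangeable at call sites.

#include <torch/csrc/jit/mobile/import.h>
#include <torch/csrc/jit/mobile/module.h>
#include <c10/core/Device.h>

#include <optional>
#include <string>

// Load a mobile module and pin it to CPU. Passing std::nullopt instead
// leaves device resolution to the loader.
torch::jit::mobile::Module load_on_cpu(const std::string& path) {
  std::optional<at::Device> device = at::Device(at::kCPU);
  return torch::jit::_load_for_mobile(path, device);
}

Because the two optional spellings alias the same type, existing callers that still pass c10::nullopt continue to compile; only the declarations and definitions are rewritten by this patch.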
diff --git a/torch/csrc/jit/mobile/import_data.cpp b/torch/csrc/jit/mobile/import_data.cpp index 11fbcbc45e3f2..32825f1f5e17f 100644 --- a/torch/csrc/jit/mobile/import_data.cpp +++ b/torch/csrc/jit/mobile/import_data.cpp @@ -40,13 +40,13 @@ namespace { class IValueUnpickler final { public: explicit IValueUnpickler(std::unique_ptr reader); - c10::IValue deserialize(c10::optional device); + c10::IValue deserialize(std::optional device); private: c10::IValue readArchive( const std::string& archive_name, std::shared_ptr mcu, - c10::optional device); + std::optional device); std::shared_ptr compilation_unit_; std::unique_ptr reader_; @@ -56,7 +56,7 @@ IValueUnpickler::IValueUnpickler(std::unique_ptr reader) : compilation_unit_(std::make_shared()), reader_(std::move(reader)) {} -c10::IValue IValueUnpickler::deserialize(c10::optional device) { +c10::IValue IValueUnpickler::deserialize(std::optional device) { auto mcu = std::make_shared(); // NOLINTNEXTLINE(performance-move-const-arg) @@ -66,7 +66,7 @@ c10::IValue IValueUnpickler::deserialize(c10::optional device) { c10::IValue IValueUnpickler::readArchive( const std::string& archive_name, std::shared_ptr mcu, - c10::optional device) { + std::optional device) { std::stringstream picklename; picklename << archive_name << ".pkl"; at::DataPtr pickle_ptr; @@ -169,7 +169,7 @@ c10::IValue IValueUnpickler::readArchive( */ std::map load_parameters_from_zip( std::unique_ptr rai, - c10::optional device) { + std::optional device) { auto reader = std::make_unique(std::move(rai)); IValueUnpickler unpickler(std::move(reader)); auto result = unpickler.deserialize(device).toGenericDict(); @@ -241,7 +241,7 @@ std::map mobile_module_to_parameter_map( static std::map _load_parameters_bytes( std::shared_ptr data, size_t size, - c10::optional device) { + std::optional device) { TORCH_CHECK(size >= kFileFormatHeaderSize, "Unrecognized data format"); FileFormat format = getFileFormat(data.get()); // Call the appropriate parser. @@ -268,14 +268,14 @@ static std::map _load_parameters_bytes( std::map _load_parameters( std::istream& in, - c10::optional device) { + std::optional device) { auto [data, size] = get_stream_content(in); return _load_parameters_bytes(std::move(data), size, device); } std::map _load_parameters( const std::string& filename, - c10::optional device) { + std::optional device) { auto [data, size] = get_file_content(filename.c_str()); return _load_parameters_bytes(std::move(data), size, device); } diff --git a/torch/csrc/jit/mobile/import_data.h b/torch/csrc/jit/mobile/import_data.h index f3eb202b7f00a..25e1fd81341c1 100644 --- a/torch/csrc/jit/mobile/import_data.h +++ b/torch/csrc/jit/mobile/import_data.h @@ -19,7 +19,7 @@ namespace jit { */ TORCH_API std::map _load_parameters( std::istream& in, - c10::optional device = c10::nullopt); + std::optional device = c10::nullopt); /** * Loads named parameters from the serialized data in @p filename. @@ -28,7 +28,7 @@ TORCH_API std::map _load_parameters( */ TORCH_API std::map _load_parameters( const std::string& filename, - c10::optional device = c10::nullopt); + std::optional device = c10::nullopt); // NOTE: Please prefer using _load_parameters over using the function below. 
TORCH_API std::map mobile_module_to_parameter_map( diff --git a/torch/csrc/jit/mobile/model_tracer/OperatorCallTracer.cpp b/torch/csrc/jit/mobile/model_tracer/OperatorCallTracer.cpp index 0da724ade0bf8..c273b41537e40 100644 --- a/torch/csrc/jit/mobile/model_tracer/OperatorCallTracer.cpp +++ b/torch/csrc/jit/mobile/model_tracer/OperatorCallTracer.cpp @@ -10,7 +10,7 @@ OperatorCallTracer::OperatorCallTracer() { auto recorder_cb = [](const at::RecordFunction& fn) -> std::unique_ptr { - c10::optional op_name = fn.operator_name(); + std::optional op_name = fn.operator_name(); if (op_name.has_value()) { getCalledOperators().withLock( [op_name](std::set& called_operators) { diff --git a/torch/csrc/jit/mobile/module.cpp b/torch/csrc/jit/mobile/module.cpp index 55ec47d8e9387..23dfe9ff36785 100644 --- a/torch/csrc/jit/mobile/module.cpp +++ b/torch/csrc/jit/mobile/module.cpp @@ -46,7 +46,7 @@ Method Module::get_method(const std::string& name) const { bool Module::compareMethodSchemas( const std::string& name_1, const std::string& name_2) { - c10::optional schema_1, schema_2; + std::optional schema_1, schema_2; for (const auto& fn : cu_->methods()) { if (fn->name() == name_1) { schema_1 = fn->getSchema(); @@ -87,7 +87,7 @@ void Module::unsafeCopyMethod( cu_->register_function(std::move(new_fn)); } -c10::optional Module::find_method(const std::string& basename) const { +std::optional Module::find_method(const std::string& basename) const { for (const auto& fn : cu_->methods()) { if (fn->name() == basename) { return c10::make_optional(Method(this, fn.get())); @@ -316,7 +316,7 @@ c10::IValue Method::operator()(std::vector stack) const { return stack.front(); } -static c10::optional print_type(const c10::Type& t) { +static std::optional print_type(const c10::Type& t) { auto namedType = t.cast(); if (namedType && namedType->name()) { return namedType->name().value().qualifiedName(); diff --git a/torch/csrc/jit/mobile/module.h b/torch/csrc/jit/mobile/module.h index 5e5d87f946355..3d37c7dc436ad 100644 --- a/torch/csrc/jit/mobile/module.h +++ b/torch/csrc/jit/mobile/module.h @@ -76,7 +76,7 @@ class TORCH_API Module { c10::IValue forward(std::vector inputs) { return get_method("forward")(std::move(inputs)); } - c10::optional find_method(const std::string& basename) const; + std::optional find_method(const std::string& basename) const; const std::string name() const { return object_->name(); diff --git a/torch/csrc/jit/mobile/nnc/aot_compiler.cpp b/torch/csrc/jit/mobile/nnc/aot_compiler.cpp index 3b3fb8af6185a..1f7ba264048ff 100644 --- a/torch/csrc/jit/mobile/nnc/aot_compiler.cpp +++ b/torch/csrc/jit/mobile/nnc/aot_compiler.cpp @@ -328,7 +328,7 @@ static std::string getNncKernelFuncName( static std::pair, std::vector> preprocessGraphPasses( std::shared_ptr& graph, - const std::vector>& example_inputs, + const std::vector>& example_inputs, const std::vector& dynamic_sizes) { GRAPH_DEBUG("Before preprocessing graph passes: ", *graph); torch::jit::RemoveTensorMutation(graph); @@ -368,11 +368,11 @@ preprocessGraphPasses( return std::make_pair(graph, sym_val); } -static std::vector> generateExampleInputs( +static std::vector> generateExampleInputs( const std::vector>& inputShapes, const std::vector& inputTypes, const std::vector& inputMemoryFormats) { - std::vector> example_inputs; + std::vector> example_inputs; example_inputs.reserve(inputShapes.size()); for (const auto i : c10::irange(inputShapes.size())) { const auto dtype = at::dtype(inputTypes[i]); diff --git a/torch/csrc/jit/mobile/nnc/context.h 
b/torch/csrc/jit/mobile/nnc/context.h index ddc179740549e..3976d28ec8944 100644 --- a/torch/csrc/jit/mobile/nnc/context.h +++ b/torch/csrc/jit/mobile/nnc/context.h @@ -47,8 +47,8 @@ struct TORCH_API OutputSpec { std::vector sizes_; c10::ScalarType dtype_{c10::ScalarType::Undefined}; - c10::optional qscale_; - c10::optional qzero_; + std::optional qscale_; + std::optional qzero_; }; // Hold the temporary buffers / states needed during the execution. diff --git a/torch/csrc/jit/mobile/parse_operators.cpp b/torch/csrc/jit/mobile/parse_operators.cpp index 03415657c780b..c260a2e5d832a 100644 --- a/torch/csrc/jit/mobile/parse_operators.cpp +++ b/torch/csrc/jit/mobile/parse_operators.cpp @@ -16,7 +16,7 @@ void parseOperators( "There should be either two parts (name and overload name), ", "or three parts (name, overload name and number of specified args) ", "for an operator"); - c10::optional num_args; + std::optional num_args; if (op_item.size() > 2) { num_args = op_item[2].toInt(); } diff --git a/torch/csrc/jit/mobile/promoted_prim_ops.cpp b/torch/csrc/jit/mobile/promoted_prim_ops.cpp index 7ee8140b931c5..8e49749042424 100644 --- a/torch/csrc/jit/mobile/promoted_prim_ops.cpp +++ b/torch/csrc/jit/mobile/promoted_prim_ops.cpp @@ -24,7 +24,7 @@ void raiseException(Stack& stack) { void raiseExceptionWithMessage(Stack& stack) { // this kernel supports RaiseException with only two arguments: the error and // the message Please make changes only to this kernel - c10::optional qualified_class_name = + std::optional qualified_class_name = pop(stack).toOptional(); std::string message; pop(stack, message); @@ -116,9 +116,9 @@ void toPrimDType(Stack& stack) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) bool copy; pop(stack, non_blocking, copy); - c10::optional scalarType = + std::optional scalarType = pop(stack).toOptional(); - c10::optional device = c10::nullopt; + std::optional device = c10::nullopt; at::Tensor self = pop(stack).toTensor(); push(stack, to_dispatch(self, device, scalarType, non_blocking, copy)); } diff --git a/torch/csrc/jit/mobile/register_ops_common_utils.h b/torch/csrc/jit/mobile/register_ops_common_utils.h index b0ecaf055f5ee..904e8786b1611 100644 --- a/torch/csrc/jit/mobile/register_ops_common_utils.h +++ b/torch/csrc/jit/mobile/register_ops_common_utils.h @@ -17,8 +17,8 @@ int64_t normalizeIndex(int64_t idx, int64_t list_size); // reference function THPVariable_to in python_variable_methods.cpp static C10_UNUSED at::Tensor to_dispatch( at::Tensor self, - c10::optional device, - c10::optional scalarType, + std::optional device, + std::optional scalarType, bool non_blocking, bool copy) { if (device && device->is_cuda()) { diff --git a/torch/csrc/jit/mobile/upgrader_mobile.h b/torch/csrc/jit/mobile/upgrader_mobile.h index f339484214f8b..68094a62ceabb 100644 --- a/torch/csrc/jit/mobile/upgrader_mobile.h +++ b/torch/csrc/jit/mobile/upgrader_mobile.h @@ -28,7 +28,7 @@ getOperatorVersionMapForMobile(); struct OperatorString { const std::string name; const std::string overload_name; - const c10::optional num_specified_args; + const std::optional num_specified_args; }; struct ByteCodeFunctionWithOperator { diff --git a/torch/csrc/jit/operator_upgraders/utils.cpp b/torch/csrc/jit/operator_upgraders/utils.cpp index 2cfd7c0559fe0..fef7b92c83c95 100644 --- a/torch/csrc/jit/operator_upgraders/utils.cpp +++ b/torch/csrc/jit/operator_upgraders/utils.cpp @@ -10,7 +10,7 @@ namespace torch::jit { -c10::optional findUpgrader( +std::optional findUpgrader( const std::vector& upgraders_for_schema, 
size_t current_version) { // we want to find the entry which satisfies following two conditions: @@ -51,7 +51,7 @@ bool isOpSymbolCurrent(const std::string& name, size_t current_version) { std::vector loadPossibleHistoricOps( const std::string& name, - c10::optional version) { + std::optional version) { std::vector possibleSchemas; if (!version.has_value()) { diff --git a/torch/csrc/jit/operator_upgraders/utils.h b/torch/csrc/jit/operator_upgraders/utils.h index 78cb31b4bf60e..a30b8c1182b9c 100644 --- a/torch/csrc/jit/operator_upgraders/utils.h +++ b/torch/csrc/jit/operator_upgraders/utils.h @@ -16,7 +16,7 @@ struct UpgraderRange { // Given a list of upgrader entries for a single operator // and the model version for that operator, find a valid // upgrader. -TORCH_API c10::optional findUpgrader( +TORCH_API std::optional findUpgrader( const std::vector& upgraders_for_schema, size_t current_version); @@ -39,7 +39,7 @@ TORCH_API bool isOpSymbolCurrent( // can be multiple schemas for different overloads. TORCH_API std::vector loadPossibleHistoricOps( const std::string& name, - c10::optional version); + std::optional version); TORCH_API uint64_t getMaxOperatorVersion(); diff --git a/torch/csrc/jit/passes/autocast.cpp b/torch/csrc/jit/passes/autocast.cpp index 213f569f87b02..635162e049531 100644 --- a/torch/csrc/jit/passes/autocast.cpp +++ b/torch/csrc/jit/passes/autocast.cpp @@ -60,7 +60,7 @@ bool isAutocastNode(Value* value) { // 2. `prim::SetAttr` must follow `prim::CreateObject()` in the same block, // but there might be other nodes in between // -c10::optional parseAutocast( +std::optional parseAutocast( Value* value, const AutocastContext& context) { if (!isAutocastNode(value)) { @@ -71,7 +71,7 @@ c10::optional parseAutocast( AutocastScope scope; scope.instance = value; scope.context = context; - c10::optional enabled; + std::optional enabled; std::string device; c10::ScalarType dtype = c10::ScalarType::Undefined; for (Use use : value->uses()) { @@ -269,7 +269,7 @@ void updateAutocastEnabledCheck(Node* node, bool is_jit_enabled) { void handleBlock(Block* block, AutocastContext initial_state) { std::stack autocast_stack; - c10::optional incompatible_amp = c10::nullopt; + std::optional incompatible_amp = c10::nullopt; // The current autocast enabled/disabled state auto current_state = [&] { diff --git a/torch/csrc/jit/passes/canonicalize.cpp b/torch/csrc/jit/passes/canonicalize.cpp index 5a5b867a36d09..20a883a8d06fd 100644 --- a/torch/csrc/jit/passes/canonicalize.cpp +++ b/torch/csrc/jit/passes/canonicalize.cpp @@ -142,7 +142,7 @@ bool isBeforeOrAfter(const Use& a, const Use& b, bool checking_before) { return checking_before ? 
isBefore(a, b) : isAfter(a, b); } -c10::optional firstOrLastUse(Value* v, bool find_first) { +std::optional firstOrLastUse(Value* v, bool find_first) { if (v->uses().empty()) { return c10::nullopt; } @@ -157,9 +157,9 @@ c10::optional firstOrLastUse(Value* v, bool find_first) { return extreme_use; } -static std::vector> gatherFirstUses( +static std::vector> gatherFirstUses( at::ArrayRef values) { - return fmap(values, [&](Value* v) -> c10::optional { + return fmap(values, [&](Value* v) -> std::optional { return firstOrLastUse(v, true); }); } @@ -169,7 +169,7 @@ static std::vector sort_indexes(at::ArrayRef values) { std::vector idx(values.size()); std::iota(idx.begin(), idx.end(), 0); - std::vector> first_uses = gatherFirstUses(values); + std::vector> first_uses = gatherFirstUses(values); // Sort values based on canonical ordering of their first usage std::sort(idx.begin(), idx.end(), [&first_uses](size_t i1, size_t i2) { diff --git a/torch/csrc/jit/passes/canonicalize.h b/torch/csrc/jit/passes/canonicalize.h index 46d90d1a515f6..b84cdd9f6a355 100644 --- a/torch/csrc/jit/passes/canonicalize.h +++ b/torch/csrc/jit/passes/canonicalize.h @@ -11,7 +11,7 @@ TORCH_API std::shared_ptr Canonicalize( TORCH_API void CanonicalizeOutputs(std::shared_ptr& graph); -TORCH_API c10::optional firstOrLastUse(Value* v, bool find_first); +TORCH_API std::optional firstOrLastUse(Value* v, bool find_first); TORCH_API bool isBeforeOrAfter( const Use& a, diff --git a/torch/csrc/jit/passes/canonicalize_graph_fuser_ops.cpp b/torch/csrc/jit/passes/canonicalize_graph_fuser_ops.cpp index a8d7c75fbe7f3..72d419eeb9c16 100644 --- a/torch/csrc/jit/passes/canonicalize_graph_fuser_ops.cpp +++ b/torch/csrc/jit/passes/canonicalize_graph_fuser_ops.cpp @@ -12,7 +12,7 @@ struct ChunkOutput { size_t offset; }; -static c10::optional> getChunkOutputs(Node* chunk) { +static std::optional> getChunkOutputs(Node* chunk) { std::vector outputs; for (auto list_use : chunk->output()->uses()) { if (list_use.user->matches( diff --git a/torch/csrc/jit/passes/constant_propagation.cpp b/torch/csrc/jit/passes/constant_propagation.cpp index cd3fb6b1e2b06..6334cd75faa90 100644 --- a/torch/csrc/jit/passes/constant_propagation.cpp +++ b/torch/csrc/jit/passes/constant_propagation.cpp @@ -19,7 +19,7 @@ namespace torch { namespace jit { -c10::optional> runNodeIfInputsAreConstant( +std::optional> runNodeIfInputsAreConstant( const Node* n, bool ignore_custom_classes, AliasDb* db) { diff --git a/torch/csrc/jit/passes/constant_propagation.h b/torch/csrc/jit/passes/constant_propagation.h index 62293c8d7abc9..2200acfa39ede 100644 --- a/torch/csrc/jit/passes/constant_propagation.h +++ b/torch/csrc/jit/passes/constant_propagation.h @@ -23,7 +23,7 @@ TORCH_API bool ConstantPropagationImmutableTypes(std::shared_ptr& graph); // make their own determination if constant prop is appropriate - for example // non-deterministic ops or ops with side effects. If ignore_custom_classes is // specified, nodes that output user defined classes are not run. 
-TORCH_API c10::optional runNodeIfInputsAreConstant( +TORCH_API std::optional runNodeIfInputsAreConstant( const Node* node, bool ignore_custom_classes = false, AliasDb* db = nullptr); diff --git a/torch/csrc/jit/passes/create_autodiff_subgraphs.cpp b/torch/csrc/jit/passes/create_autodiff_subgraphs.cpp index 162487201da7b..c5fe65537669a 100644 --- a/torch/csrc/jit/passes/create_autodiff_subgraphs.cpp +++ b/torch/csrc/jit/passes/create_autodiff_subgraphs.cpp @@ -281,7 +281,7 @@ class SubgraphSlicer { // Try to merge `producer` into `consumer`. If successful, this destroys // `producer` and returns the `consumer` group. - c10::optional tryMerge(Node* consumer, Node* producer) { + std::optional tryMerge(Node* consumer, Node* producer) { AT_ASSERT(consumer->kind() == prim::DifferentiableGraph); bool canMerge = shouldConsiderForMerge(producer) && aliasDb_.moveBeforeTopologicallyValid(producer, consumer); @@ -302,7 +302,7 @@ class SubgraphSlicer { std::vector& diff_nodes_; }; -c10::optional getProfileNodeRequiresGrad(Node* n) { +std::optional getProfileNodeRequiresGrad(Node* n) { TORCH_INTERNAL_ASSERT(n->kind() == prim::profile); if (!n->hasAttribute(attr::profiled_type)) { return c10::nullopt; @@ -359,7 +359,7 @@ struct ContextMapping { } }; -c10::optional findRequiresGradForOutput( +std::optional findRequiresGradForOutput( Node* diff_graph, Value* output, const ContextMapping& ctx_mapping) { @@ -374,7 +374,7 @@ c10::optional findRequiresGradForOutput( } if (use.user->kind() == prim::profile) { - c10::optional req_grad_use; + std::optional req_grad_use; if ((req_grad_use = getProfileNodeRequiresGrad(use.user)).has_value()) { return req_grad_use.value(); } @@ -393,7 +393,7 @@ c10::optional findRequiresGradForOutput( } if (dg_use.user->kind() == prim::profile) { - c10::optional req_grad_use; + std::optional req_grad_use; if ((req_grad_use = getProfileNodeRequiresGrad(dg_use.user)) .has_value()) { return req_grad_use.value(); diff --git a/torch/csrc/jit/passes/decompose_ops.cpp b/torch/csrc/jit/passes/decompose_ops.cpp index 9f5b3c80b6a07..1276a1f97245a 100644 --- a/torch/csrc/jit/passes/decompose_ops.cpp +++ b/torch/csrc/jit/passes/decompose_ops.cpp @@ -22,7 +22,7 @@ c10::AliasAnalysisKind aliasAnalysisFromSchema() { // helper to determine if an optional tensor argument/value passed in is // statically defined (neither a None constant nor a Optional[Tensor] type) // return yes, no, or no value if we can't tell -static c10::optional isDefined(Value* tensor) { +static std::optional isDefined(Value* tensor) { if (tensor->type()->isSubtypeOf(*TensorType::get())) { return true; } diff --git a/torch/csrc/jit/passes/device_type_analysis.cpp b/torch/csrc/jit/passes/device_type_analysis.cpp index 590ac9e2896a8..7670292696ae6 100644 --- a/torch/csrc/jit/passes/device_type_analysis.cpp +++ b/torch/csrc/jit/passes/device_type_analysis.cpp @@ -27,7 +27,7 @@ of the Node (based on the rule itself) Returns: Bool indicating if anything was changed */ -bool setDeviceType(Value* value, c10::optional device) { +bool setDeviceType(Value* value, std::optional device) { auto tensor_type = value->type()->expect(); bool changed = tensor_type->device() != device; if (changed) { @@ -36,7 +36,7 @@ bool setDeviceType(Value* value, c10::optional device) { return changed; } -bool setReturnsToDevice(Node* n, c10::optional device) { +bool setReturnsToDevice(Node* n, std::optional device) { bool changed = false; for (Value* out : n->outputs()) { auto tensor_type = out->type()->cast(); @@ -93,7 +93,7 @@ bool propWithNoDevice(Node* 
n) { auto tensor_type = n->inputs()[input_num]->type()->expect(); bool only_seen_cpu_zerodim = isZerodimCPUTensor(tensor_type); - c10::optional device = tensor_type->device(); + std::optional device = tensor_type->device(); // Now see if all inputs have a consistent device type for (input_num++; input_num < n->inputs().size(); input_num++) { diff --git a/torch/csrc/jit/passes/dtype_analysis.cpp b/torch/csrc/jit/passes/dtype_analysis.cpp index feeb5f567cd0d..f63ea6f341948 100644 --- a/torch/csrc/jit/passes/dtype_analysis.cpp +++ b/torch/csrc/jit/passes/dtype_analysis.cpp @@ -99,7 +99,7 @@ static bool canBeInferredWithMetaTensor(Node* n) { return true; } -c10::optional inferWithMetaTensor(Node* n) { +std::optional inferWithMetaTensor(Node* n) { GRAPH_DEBUG("inferWithMetaTensor", getHeader(n)); if (!canBeInferredWithMetaTensor(n)) { return c10::nullopt; diff --git a/torch/csrc/jit/passes/fold_conv_bn.cpp b/torch/csrc/jit/passes/fold_conv_bn.cpp index 9df6887d24289..6f0c82e7bebe2 100644 --- a/torch/csrc/jit/passes/fold_conv_bn.cpp +++ b/torch/csrc/jit/passes/fold_conv_bn.cpp @@ -105,7 +105,7 @@ void addBiasForConvIfNone(Module& module, const std::string& pattern_name) { if (!t->hasAttribute("bias")) { auto optional_tensor_type = OptionalType::create(TensorType::get()); t->addAttribute("bias", std::move(optional_tensor_type), true); - auto optional_tensor = c10::optional(); + auto optional_tensor = std::optional(); module.setattr("bias", std::move(optional_tensor)); replaceConvBiasWithGetAttr(module); } diff --git a/torch/csrc/jit/passes/freeze_module.cpp b/torch/csrc/jit/passes/freeze_module.cpp index 9ebbaa4e53e0d..4d67d5d217813 100644 --- a/torch/csrc/jit/passes/freeze_module.cpp +++ b/torch/csrc/jit/passes/freeze_module.cpp @@ -167,7 +167,7 @@ class AttributePropagator { // Examples: // submodule1.submodule2.foo -> {submodule2, "foo"} // submodule1.non_existent_module.foo -> nullopt - c10::optional resolveName(const std::string& name) { + std::optional resolveName(const std::string& name) { auto sub_names = splitName(name); if (sub_names.empty()) { return c10::nullopt; @@ -225,7 +225,7 @@ class AttributePropagator { return true; } - c10::optional> getModulePath( + std::optional> getModulePath( Value* input, std::shared_ptr& graph) { bool success = _loadModulePath(input, graph); diff --git a/torch/csrc/jit/passes/frozen_ops_to_mkldnn.cpp b/torch/csrc/jit/passes/frozen_ops_to_mkldnn.cpp index f6f63de01a498..c28e99a445258 100644 --- a/torch/csrc/jit/passes/frozen_ops_to_mkldnn.cpp +++ b/torch/csrc/jit/passes/frozen_ops_to_mkldnn.cpp @@ -1099,7 +1099,7 @@ class MKLDNNSubgraphSlicer { // Try to merge `consumer` into `producer`. If successful, this destroys // `consumer` and returns the `producer` group. 
- c10::optional tryMerge(Node* producer, Node* consumer) { + std::optional tryMerge(Node* producer, Node* consumer) { AT_ASSERT(producer->kind() == prim::MKLDNNGroup); bool canMerge = shouldConsiderForMerge(consumer) && aliasDb_.moveAfterTopologicallyValid(consumer, producer); diff --git a/torch/csrc/jit/passes/graph_fuser.cpp b/torch/csrc/jit/passes/graph_fuser.cpp index 0acc6f9bd07bb..9848783072621 100644 --- a/torch/csrc/jit/passes/graph_fuser.cpp +++ b/torch/csrc/jit/passes/graph_fuser.cpp @@ -490,7 +490,7 @@ struct GraphFuser { return true; } - c10::optional findFusedChunk(Node* group, Value* input) { + std::optional findFusedChunk(Node* group, Value* input) { AT_ASSERT(group->kind() == prim::FusionGroup); auto it = std::find(group->inputs().begin(), group->inputs().end(), input); if (it == group->inputs().end()) { diff --git a/torch/csrc/jit/passes/graph_rewrite_helper.cpp b/torch/csrc/jit/passes/graph_rewrite_helper.cpp index cd06bee7fc4ab..edb9f5b9589a0 100644 --- a/torch/csrc/jit/passes/graph_rewrite_helper.cpp +++ b/torch/csrc/jit/passes/graph_rewrite_helper.cpp @@ -27,7 +27,7 @@ Value* getValue( return match_vmap.at(vmap.at(name)); } -c10::optional getIValue( +std::optional getIValue( const std::string& name, const std::unordered_map& match_vmap, const std::unordered_map& vmap) { diff --git a/torch/csrc/jit/passes/graph_rewrite_helper.h b/torch/csrc/jit/passes/graph_rewrite_helper.h index 0920830babb8b..9f8b9f0a1b8fa 100644 --- a/torch/csrc/jit/passes/graph_rewrite_helper.h +++ b/torch/csrc/jit/passes/graph_rewrite_helper.h @@ -14,7 +14,7 @@ Value* getValue( const std::string& name, const std::unordered_map& match_vmap, const std::unordered_map& vmap); -c10::optional getIValue( +std::optional getIValue( const std::string& name, const std::unordered_map& match_vmap, const std::unordered_map& vmap); diff --git a/torch/csrc/jit/passes/hoist_conv_packed_params.cpp b/torch/csrc/jit/passes/hoist_conv_packed_params.cpp index ef3b861772c31..c3db2373f2a3c 100644 --- a/torch/csrc/jit/passes/hoist_conv_packed_params.cpp +++ b/torch/csrc/jit/passes/hoist_conv_packed_params.cpp @@ -100,7 +100,7 @@ void HoistConvPackedParams(script::Module& m) { n->kind() == prim::GetAttr && n->s(attr::name) == "_packed_params"; if (isGetPackedParamsNode) { // make sure the foo in {foo}.{_packed_params} is a quantized conv - c10::optional moduleName = getModuleName(n->inputs()[0]); + std::optional moduleName = getModuleName(n->inputs()[0]); bool moduleNameIsQuantizedConv = moduleName.has_value() && (moduleName.value() == "__torch__.torch.ao.nn.quantized.modules.conv.Conv1d" || diff --git a/torch/csrc/jit/passes/integer_value_refinement.cpp b/torch/csrc/jit/passes/integer_value_refinement.cpp index e3a339efe6d7b..16a329b3b11f3 100644 --- a/torch/csrc/jit/passes/integer_value_refinement.cpp +++ b/torch/csrc/jit/passes/integer_value_refinement.cpp @@ -204,7 +204,7 @@ struct IntegerValueRefiner { return block_refinements; }; - c10::optional tryFindRefinement(Value* v) { + std::optional tryFindRefinement(Value* v) { for (const auto& ref : active_refinements_) { auto maybe_refinement = ref->find(v); if (maybe_refinement != ref->end()) { diff --git a/torch/csrc/jit/passes/loop_unrolling.cpp b/torch/csrc/jit/passes/loop_unrolling.cpp index 3df61ad8a7765..4fac1cfbe5fbf 100644 --- a/torch/csrc/jit/passes/loop_unrolling.cpp +++ b/torch/csrc/jit/passes/loop_unrolling.cpp @@ -19,7 +19,7 @@ static constexpr int64_t kMaxBodySize = 32; static constexpr int64_t kMaxBodyRepeats = 64; bool isTrueConstant(Value* val) { - 
c10::optional maybe_value = constant_as(val); + std::optional maybe_value = constant_as(val); return maybe_value && *maybe_value; } @@ -178,7 +178,7 @@ void unroll(Node* loop) { // Some optimization for constant-length loops. If we know they won't run too // many times, then we can unroll them entirely. Value* trip_count = loop->inputs().at(0); - c10::optional const_len = constant_as(trip_count); + std::optional const_len = constant_as(trip_count); if (const_len && *const_len < kMaxBodyRepeats) { Block* dest = loop->addBlock(); repeatBody(body, *const_len, dest); diff --git a/torch/csrc/jit/passes/onnx.cpp b/torch/csrc/jit/passes/onnx.cpp index e6cbd22efb014..85b49dd31e94e 100644 --- a/torch/csrc/jit/passes/onnx.cpp +++ b/torch/csrc/jit/passes/onnx.cpp @@ -218,6 +218,13 @@ py::dict BlockToONNX( } } + // Determine if all inputs are static. This is used for each node to + // determine whether or not to propagate shapes. + if (!is_sub_block) { + bool static_input_shape = AllGraphInputsStatic(ctx.block->owningGraph()); + ConstantValueMap::SetAllGraphInputsStatic(static_input_shape); + } + // Finally, visit all nodes in the graph for (auto node : old_block->nodes()) { NodeToONNX(node, ctx.block, operator_export_type, env, values_in_env); diff --git a/torch/csrc/jit/passes/onnx/constant_fold.cpp b/torch/csrc/jit/passes/onnx/constant_fold.cpp index 1d0457c65a5fb..4eeba79aae90c 100644 --- a/torch/csrc/jit/passes/onnx/constant_fold.cpp +++ b/torch/csrc/jit/passes/onnx/constant_fold.cpp @@ -64,7 +64,7 @@ void handleNegativeStartEndIndex( } } -c10::optional runTorchSlice_opset9( +std::optional runTorchSlice_opset9( const Node* node, std::vector& inputTensorValues) { assert(inputTensorValues.size() == 1); @@ -101,10 +101,10 @@ c10::optional runTorchSlice_opset9( return c10::nullopt; updated_val = at::narrow(updated_val, axis, start, length); } - return c10::optional(updated_val); + return std::optional(updated_val); } -c10::optional runTorchSlice_opset10( +std::optional runTorchSlice_opset10( const Node* node, std::vector& inputTensorValues) { const int maxSliceInputCount = 5; @@ -195,7 +195,7 @@ c10::optional runTorchSlice_opset10( return c10::nullopt; updated_val = at::narrow(updated_val, axis, start, length); } - return c10::optional(updated_val); + return std::optional(updated_val); } // Refer to AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF @@ -259,7 +259,7 @@ at::Tensor IntToTensor(int64_t value) { return at::squeeze(f_copy, 0); } -c10::optional runTorchBackendForOnnx( +std::optional runTorchBackendForOnnx( const Node* node, std::vector& inputTensorValues, int opset_version) { @@ -280,10 +280,10 @@ c10::optional runTorchBackendForOnnx( } updated_val = at::cat(at::TensorList(inputTensorValues), node->i(attr::axis)); - return c10::optional(updated_val); + return std::optional(updated_val); } else if (node->kind() == onnx::Sqrt) { updated_val = at::sqrt(inputTensorValues[0]); - return c10::optional(updated_val); + return std::optional(updated_val); } else if (node->kind() == onnx::Div) { // One example shows at::div(CPULongType, CPULongType) = CPUFloatType, // So we add a cast below. 
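The constant_fold.cpp hunks above and below keep the same control flow after the rename: each handler either produces a folded tensor wrapped in an optional or bails out with c10::nullopt, and the caller simply keeps the original node when nothing comes back. A simplified, self-contained sketch of that dispatch shape (plain integers and made-up op names instead of at::Tensor and ONNX kinds; not the real runTorchBackendForOnnx):

// Simplified sketch of an optional-returning fold dispatcher; stand-in types,
// not the real ONNX constant folder.
#include <iostream>
#include <optional>
#include <string>
#include <vector>

// Returns the folded value, or std::nullopt when the op is not foldable and
// the caller should keep the original node in the graph.
std::optional<int64_t> tryFold(const std::string& kind,
                               const std::vector<int64_t>& inputs) {
  if (kind == "Add" && inputs.size() == 2) {
    return inputs[0] + inputs[1];
  }
  if (kind == "Mul" && inputs.size() == 2) {
    return inputs[0] * inputs[1];
  }
  return std::nullopt;  // unsupported kind: nothing to fold
}

int main() {
  if (auto v = tryFold("Add", {2, 3})) {
    std::cout << "folded to " << *v << "\n";
  }
  if (!tryFold("Gather", {2, 3})) {
    std::cout << "Gather not folded, node kept\n";
  }
}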
@@ -292,16 +292,16 @@ c10::optional runTorchBackendForOnnx( inputTensorValues[1].scalar_type()) { updated_val = updated_val.to(inputTensorValues[0].scalar_type()); } - return c10::optional(updated_val); + return std::optional(updated_val); } else if (node->kind() == onnx::Mul) { updated_val = at::mul(inputTensorValues[0], inputTensorValues[1]); - return c10::optional(updated_val); + return std::optional(updated_val); } else if (node->kind() == onnx::Sub) { updated_val = at::sub(inputTensorValues[0], inputTensorValues[1]); - return c10::optional(updated_val); + return std::optional(updated_val); } else if (node->kind() == onnx::Add) { updated_val = at::add(inputTensorValues[0], inputTensorValues[1]); - return c10::optional(updated_val); + return std::optional(updated_val); } else if (node->kind() == onnx::Unsqueeze) { if (opset_version >= ONNX_OPSET_13) { assert(inputTensorValues.size() == 2); @@ -328,7 +328,7 @@ c10::optional runTorchBackendForOnnx( for (int64_t i = 0; i < inputTensorValues[1].sizes()[0]; ++i) { updated_val = at::unsqueeze(updated_val, axes[i]); } - return c10::optional(updated_val); + return std::optional(updated_val); } else if (opset_version >= ONNX_OPSET_9) { assert(inputTensorValues.size() == 1); if (!node->hasAttributeS("axes")) { @@ -340,7 +340,7 @@ c10::optional runTorchBackendForOnnx( for (auto axis : axesAttr) { updated_val = at::unsqueeze(updated_val, axis); } - return c10::optional(updated_val); + return std::optional(updated_val); } else { TORCH_WARN( "Constant folding - unsupported opset version. " @@ -373,7 +373,7 @@ c10::optional runTorchBackendForOnnx( updated_val = at::squeeze(updated_val, axes[i]); } } - return c10::optional(updated_val); + return std::optional(updated_val); } else if (opset_version >= ONNX_OPSET_9) { assert(inputTensorValues.size() == 1); updated_val = inputTensorValues[0]; @@ -384,7 +384,7 @@ c10::optional runTorchBackendForOnnx( updated_val = at::squeeze(updated_val, axis); } } - return c10::optional(updated_val); + return std::optional(updated_val); } else { TORCH_WARN( "Constant folding - unsupported opset version. " @@ -397,13 +397,13 @@ c10::optional runTorchBackendForOnnx( return c10::nullopt; } updated_val = inputTensorValues[0].permute(node->is(attr::perm)); - return c10::optional(updated_val); + return std::optional(updated_val); } else if (node->kind() == onnx::Cast) { assert(inputTensorValues.size() == 1); if (node->hasAttributeS("to") && ONNXTypeToATenType(node->i(attr::to))) { updated_val = inputTensorValues[0].to( ONNXTypeToATenType(node->i(attr::to)).value()); - return c10::optional(updated_val); + return std::optional(updated_val); } return c10::nullopt; } else if (node->kind() == onnx::Reshape) { @@ -433,11 +433,11 @@ c10::optional runTorchBackendForOnnx( shape[i] = shape_a[i]; } } - return c10::optional(at::reshape(updated_val, shape)); + return std::optional(at::reshape(updated_val, shape)); } else if (node->kind() == onnx::Shape) { TORCH_INTERNAL_ASSERT(inputTensorValues.size() == 1); updated_val = at::_shape_as_tensor(inputTensorValues[0]); - return c10::optional(updated_val); + return std::optional(updated_val); } else if (node->kind() == onnx::ReduceL1 || node->kind() == onnx::ReduceL2) { assert(inputTensorValues.size() == 1); if (!node->hasAttributeS("axes")) { @@ -449,7 +449,7 @@ c10::optional runTorchBackendForOnnx( int p = node->kind() == onnx::ReduceL1 ? 
1 : 2; updated_val = at::norm( inputTensorValues[0], p, node->is(attr::axes), node->i(attr::keepdims)); - return c10::optional(updated_val); + return std::optional(updated_val); } else if (node->kind() == onnx::ReduceProd) { int64_t rank = inputTensorValues[0].sizes().size(); std::vector axes; @@ -469,7 +469,7 @@ c10::optional runTorchBackendForOnnx( for (const auto& axis : axes) { updated_val = at::prod(updated_val, axis, keepdims); } - return c10::optional(updated_val); + return std::optional(updated_val); } else if (node->kind() == onnx::Gather) { assert(inputTensorValues.size() == 2); // default axis = 0 @@ -503,41 +503,41 @@ c10::optional runTorchBackendForOnnx( if (q < 1) { updated_val = updated_val.squeeze(axis); } - return c10::optional(updated_val); + return std::optional(updated_val); } else if (node->kind() == onnx::Range) { updated_val = runTorchArange_opset11(node, inputTensorValues); - return c10::optional(updated_val); + return std::optional(updated_val); } else if (node->kind() == onnx::Where) { updated_val = at::where( inputTensorValues[0], inputTensorValues[1], inputTensorValues[2]); - return c10::optional(updated_val); + return std::optional(updated_val); } else if (node->kind() == onnx::Equal) { updated_val = at::eq(inputTensorValues[0], inputTensorValues[1]); - return c10::optional(updated_val); + return std::optional(updated_val); } else if (node->kind() == onnx::Greater) { updated_val = at::greater(inputTensorValues[0], inputTensorValues[1]); - return c10::optional(updated_val); + return std::optional(updated_val); } else if (node->kind() == onnx::Less) { updated_val = at::less(inputTensorValues[0], inputTensorValues[1]); - return c10::optional(updated_val); + return std::optional(updated_val); } else if (node->kind() == onnx::Neg) { updated_val = at::neg(inputTensorValues[0]); - return c10::optional(updated_val); + return std::optional(updated_val); } else if (node->kind() == onnx::Not) { auto ones = at::ones(inputTensorValues[0].sizes(), inputTensorValues[0].dtype()); updated_val = at::ne(inputTensorValues[0], ones); - return c10::optional(updated_val); + return std::optional(updated_val); } else if (node->kind() == onnx::Size) { int64_t total_size = 1; for (auto size : inputTensorValues[0].sizes()) { total_size *= size; } - return c10::optional(IntToTensor(total_size)); + return std::optional(IntToTensor(total_size)); } else if (node->kind() == onnx::Softmax) { int64_t axis = node->hasAttributeS("axis") ? 
node->i(attr::axis) : -1; updated_val = at::softmax(inputTensorValues[0], axis); - return c10::optional(updated_val); + return std::optional(updated_val); } else { return c10::nullopt; } diff --git a/torch/csrc/jit/passes/onnx/constant_fold.h b/torch/csrc/jit/passes/onnx/constant_fold.h index 8bfb0dd081c39..201c3def32685 100644 --- a/torch/csrc/jit/passes/onnx/constant_fold.h +++ b/torch/csrc/jit/passes/onnx/constant_fold.h @@ -19,7 +19,7 @@ namespace onnx_constant_fold { at::Tensor IntToTensor(int64_t value); -c10::optional runTorchBackendForOnnx( +std::optional runTorchBackendForOnnx( const Node* node, std::vector& inputTensorValues, int opset_version); diff --git a/torch/csrc/jit/passes/onnx/constant_map.cpp b/torch/csrc/jit/passes/onnx/constant_map.cpp index c36440da8d811..8fd1bed0b7a1b 100644 --- a/torch/csrc/jit/passes/onnx/constant_map.cpp +++ b/torch/csrc/jit/passes/onnx/constant_map.cpp @@ -32,13 +32,22 @@ bool ConstantValueMap::HasRank(const std::string& tensorName) { ConstantValueMap::getInstance().rankMap.end(); } -c10::optional ConstantValueMap::GetRank(const std::string& tensorName) { +std::optional ConstantValueMap::GetRank(const std::string& tensorName) { if (!HasRank(tensorName)) { return c10::nullopt; } return ConstantValueMap::getInstance().rankMap[tensorName]; } +void ConstantValueMap::SetAllGraphInputsStatic(bool all_static) { + ConstantValueMap::getInstance().allGraphInputsStatic = + c10::make_optional(all_static); +} + +c10::optional ConstantValueMap::GetAllGraphInputsStatic() { + return ConstantValueMap::getInstance().allGraphInputsStatic; +} + void ConstantValueMap::SetShape( const std::string& tensorName, const c10::SymbolicShape& shapeValue) { @@ -51,7 +60,7 @@ bool ConstantValueMap::HasShape(const std::string& tensorName) { ConstantValueMap::getInstance().shapeMap.end(); } -c10::optional ConstantValueMap::GetShape( +std::optional ConstantValueMap::GetShape( const std::string& tensorName) { if (!HasShape(tensorName)) { return c10::nullopt; @@ -70,7 +79,7 @@ bool ConstantValueMap::HasValue(const std::string& tensorName) { ConstantValueMap::getInstance().tensorValueMap.end(); } -c10::optional ConstantValueMap::GetValue( +std::optional ConstantValueMap::GetValue( const std::string& tensorName) { if (!HasValue(tensorName)) { return c10::nullopt; @@ -94,7 +103,7 @@ std::vector ConstantValueMap::GetCompleteShapeInto1DInt64Vector( return shape_value; } -c10::optional> ConstantValueMap::GetShapeInto1DInt64Vector( +std::optional> ConstantValueMap::GetShapeInto1DInt64Vector( const std::string& value_name) { if (ConstantValueMap::HasShape(value_name)) { auto shape_size = ConstantValueMap::GetShape(value_name).value(); @@ -107,7 +116,7 @@ c10::optional> ConstantValueMap::GetShapeInto1DInt64Vector( return c10::nullopt; } -c10::optional> ConstantValueMap:: +std::optional> ConstantValueMap:: GetShapeInto1DInt64VectorWithOneUnknown(const std::string& value_name) { if (ConstantValueMap::HasShape(value_name)) { auto shape_size = ConstantValueMap::GetShape(value_name).value(); @@ -163,7 +172,7 @@ bool ConstantValueMap::HasTypeReliable(const std::string& tensorName) { ConstantValueMap::getInstance().typeReliableMap.end(); } -c10::optional ConstantValueMap::GetTypeReliable( +std::optional ConstantValueMap::GetTypeReliable( const std::string& tensorName) { if (!HasTypeReliable(tensorName)) { return c10::nullopt; @@ -182,7 +191,7 @@ bool ConstantValueMap::HasUseInferredType(const std::string& tensorName) { ConstantValueMap::getInstance().useInferredTypeMap.end(); } -c10::optional 
ConstantValueMap::GetUseInferredType( +std::optional ConstantValueMap::GetUseInferredType( const std::string& tensorName) { if (!HasUseInferredType(tensorName)) { return c10::nullopt; @@ -201,7 +210,7 @@ bool ConstantValueMap::HasShapeValue(const std::string& tensorName) { ConstantValueMap::getInstance().shapeValueMap.end(); } -c10::optional ConstantValueMap::GetShapeValue( +std::optional ConstantValueMap::GetShapeValue( const std::string& tensorName) { if (!HasShapeValue(tensorName)) { return c10::nullopt; @@ -218,6 +227,10 @@ SymbolDimMap& ConstantValueMap::GetSymbolDimMap() { return ConstantValueMap::getInstance().symbolDimMap; } +DimSymbolMap& ConstantValueMap::GetDimSymbolMap() { + return ConstantValueMap::getInstance().dimSymbolMap; +} + template void UpdateStrKey( Map& map, @@ -262,6 +275,8 @@ void ConstantValueMap::ClearMaps() { ConstantValueMap::getInstance().shapeValueMap.clear(); ConstantValueMap::getInstance().inferredShapeData.clear(); ConstantValueMap::getInstance().symbolDimMap.clear(); + ConstantValueMap::getInstance().dimSymbolMap.clear(); + ConstantValueMap::getInstance().allGraphInputsStatic = c10::nullopt; } // For debug only. @@ -349,6 +364,15 @@ void ConstantValueMap::PrintMaps() { std::cout << std::endl; } } + std::cout << "DimSymbol Map:" << std::endl; + count = 0; + for (const auto& x : ConstantValueMap::getInstance().dimSymbolMap) { + std::cout << "(" << x.first << ": " << x.second << "), "; + count++; + if (count % 10 == 0) { + std::cout << std::endl; + } + } } } // namespace jit diff --git a/torch/csrc/jit/passes/onnx/constant_map.h b/torch/csrc/jit/passes/onnx/constant_map.h index b7b534d730587..303d373eea56f 100644 --- a/torch/csrc/jit/passes/onnx/constant_map.h +++ b/torch/csrc/jit/passes/onnx/constant_map.h @@ -24,49 +24,53 @@ class ConstantValueMap { static ConstantValueMap& getInstance(); static void SetRank(const std::string& tensorName, size_t rankValue); static bool HasRank(const std::string& tensorName); - static c10::optional GetRank(const std::string& tensorName); + static std::optional GetRank(const std::string& tensorName); + + static void SetAllGraphInputsStatic(bool all_static); + static c10::optional GetAllGraphInputsStatic(); static void SetShape( const std::string& tensorName, const c10::SymbolicShape& shapeValue); static bool HasShape(const std::string& tensorName); - static c10::optional GetShape( + static std::optional GetShape( const std::string& tensorName); static void SetValue(const std::string& tensorName, const at::Tensor& value); static bool HasValue(const std::string& tensorName); - static c10::optional GetValue(const std::string& tensorName); + static std::optional GetValue(const std::string& tensorName); static void EraseValue(const std::string& tensorName); static std::vector GetCompleteShapeInto1DInt64Vector( const c10::SymbolicShape& shape); - static c10::optional> GetShapeInto1DInt64Vector( + static std::optional> GetShapeInto1DInt64Vector( const std::string& value_name); - static c10::optional> + static std::optional> GetShapeInto1DInt64VectorWithOneUnknown(const std::string& value_name); static std::vector GetValueInto1DInt64Vector( const std::string& value_name); static void SetTypeReliable(const std::string& tensorName, bool reliable); static bool HasTypeReliable(const std::string& tensorName); - static c10::optional GetTypeReliable(const std::string& tensorName); + static std::optional GetTypeReliable(const std::string& tensorName); static void SetUseInferredType( const std::string& tensorName, bool useInferredType); 
static bool HasUseInferredType(const std::string& tensorName); - static c10::optional GetUseInferredType(const std::string& tensorName); + static std::optional GetUseInferredType(const std::string& tensorName); static void SetShapeValue( const std::string& tensorName, const c10::SymbolicShape& shapeValue); static bool HasShapeValue(const std::string& tensorName); - static c10::optional GetShapeValue( + static std::optional GetShapeValue( const std::string& tensorName); static ShapeDataMap& GetInferredShapeData(); static SymbolDimMap& GetSymbolDimMap(); + static DimSymbolMap& GetDimSymbolMap(); static void UpdateValueName( const std::string& old_name, @@ -101,6 +105,9 @@ class ConstantValueMap { // during future node-level shape inference. ShapeDataMap inferredShapeData; SymbolDimMap symbolDimMap; + DimSymbolMap dimSymbolMap; + // Stores if all graph-level inputs have static shape + c10::optional allGraphInputsStatic; }; } // namespace jit diff --git a/torch/csrc/jit/passes/onnx/function_extraction.cpp b/torch/csrc/jit/passes/onnx/function_extraction.cpp index d6555c5c5bb70..c545c7aba823a 100644 --- a/torch/csrc/jit/passes/onnx/function_extraction.cpp +++ b/torch/csrc/jit/passes/onnx/function_extraction.cpp @@ -58,8 +58,8 @@ struct FunctionExtractor { scope_ctx_map& scope_ctxs); void DebugPrint() const; void SetAttrName(Node* ref_n, Symbol attr, const std::string& name); - c10::optional FindAttrName(Node* ref_n, Symbol attr); - c10::optional FindAttrName(Node* ref_const_n); + std::optional FindAttrName(Node* ref_n, Symbol attr); + std::optional FindAttrName(Node* ref_const_n); ScopePtr scope_key_; scope_ctx_map scope_ctxs_; @@ -76,10 +76,10 @@ struct FunctionExtractor { using func_ctx_map = std::unordered_map; static bool IsValidScope(ScopePtr s); - static c10::optional InferScope(Node* n); + static std::optional InferScope(Node* n); static bool IsAncestor(ScopePtr parent, ScopePtr child); - static c10::optional FindCommonAncestor(ScopePtr a, ScopePtr b); - static c10::optional FindCommonAncestor(const scope_list& scopes); + static std::optional FindCommonAncestor(ScopePtr a, ScopePtr b); + static std::optional FindCommonAncestor(const scope_list& scopes); std::shared_ptr ConstructFuncGraph(FunctionContext& ctx); void ConvertScopeToFunction( @@ -219,7 +219,7 @@ void FunctionExtractor::FunctionContext::SetAttrName( auto n_attr_it = node_attr_to_name_[n_in_def][attr.toUnqualString()] = name; } -c10::optional FunctionExtractor::FunctionContext::FindAttrName( +std::optional FunctionExtractor::FunctionContext::FindAttrName( Node* ref_n, Symbol attr) { auto v_it = @@ -297,7 +297,7 @@ bool FunctionExtractor::IsAncestor(ScopePtr parent, ScopePtr child) { return false; } -c10::optional FunctionExtractor::FindCommonAncestor( +std::optional FunctionExtractor::FindCommonAncestor( ScopePtr a, ScopePtr b) { if (!IsValidScope(a) || !IsValidScope(b)) { @@ -330,13 +330,13 @@ c10::optional FunctionExtractor::FindCommonAncestor( return c10::nullopt; } -c10::optional FunctionExtractor::FindCommonAncestor( +std::optional FunctionExtractor::FindCommonAncestor( const scope_list& scopes) { if (scopes.empty()) { return c10::nullopt; } - c10::optional common_ancestor = scopes.at(0); + std::optional common_ancestor = scopes.at(0); for (const auto& scope : scopes) { common_ancestor = FindCommonAncestor(common_ancestor.value(), scope); if (!common_ancestor.has_value()) { @@ -347,7 +347,7 @@ c10::optional FunctionExtractor::FindCommonAncestor( return common_ancestor; } -c10::optional FunctionExtractor::InferScope(Node* 
n) { +std::optional FunctionExtractor::InferScope(Node* n) { // The scope of node n is assigned based on the following rules. // 1. If all uses of outputs of n belongs to the same scope, // assign that scope, otherwise diff --git a/torch/csrc/jit/passes/onnx/function_substitution.cpp b/torch/csrc/jit/passes/onnx/function_substitution.cpp index a6e2f89e106ec..81bfa3fd6caf5 100644 --- a/torch/csrc/jit/passes/onnx/function_substitution.cpp +++ b/torch/csrc/jit/passes/onnx/function_substitution.cpp @@ -12,7 +12,7 @@ namespace { const std::string kTopModuleVariableName = ""; std::string TidyClassNameFromTorchScript( - const c10::optional& class_name) { + const std::optional& class_name) { if (!class_name) { return "UNKNOWN_CLASS"; } diff --git a/torch/csrc/jit/passes/onnx/helper.cpp b/torch/csrc/jit/passes/onnx/helper.cpp index d6b2a6385fab4..9d4c5061414c5 100644 --- a/torch/csrc/jit/passes/onnx/helper.cpp +++ b/torch/csrc/jit/passes/onnx/helper.cpp @@ -61,7 +61,7 @@ void buildParamsMapFromValueToParamsMap( } } -c10::optional ONNXTypeToATenType(int32_t onnx_type) { +std::optional ONNXTypeToATenType(int32_t onnx_type) { switch (onnx_type) { case ::ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED: return at::ScalarType::Undefined; @@ -104,7 +104,7 @@ c10::optional ONNXTypeToATenType(int32_t onnx_type) { onnx_type, " is an unexpected tensor scalar type"); } - return c10::optional{}; + return std::optional{}; } Node* addNodeToBlock(Block* block, Symbol kind, ArrayRef inputs) { diff --git a/torch/csrc/jit/passes/onnx/helper.h b/torch/csrc/jit/passes/onnx/helper.h index 77eb98ba8a707..9e09c638779ef 100644 --- a/torch/csrc/jit/passes/onnx/helper.h +++ b/torch/csrc/jit/passes/onnx/helper.h @@ -40,7 +40,7 @@ TORCH_API Node* addNodeToBlock( TORCH_API Value* addInputToBlock(Block* block); -TORCH_API c10::optional ONNXTypeToATenType(int32_t onnx_type); +TORCH_API std::optional ONNXTypeToATenType(int32_t onnx_type); // Use int return type as no sable way exists to forward declare protobuf enum TORCH_API int ATenTypeToOnnxType(at::ScalarType at_type); diff --git a/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_encapsulation.cpp b/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_encapsulation.cpp index 41e3ac9ecc4e8..6110954990455 100644 --- a/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_encapsulation.cpp +++ b/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_encapsulation.cpp @@ -77,7 +77,7 @@ Node* EncapsulateInplaceIndexPutForONNX(Node* index_put_node) { } // namespace -c10::optional EncapsulatePatternIntoSubblock(Node* n) { +std::optional EncapsulatePatternIntoSubblock(Node* n) { switch (n->kind()) { case aten::index_put_: case aten::index_put: { diff --git a/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_encapsulation.h b/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_encapsulation.h index cd78663cffc47..6673d4aba3a75 100644 --- a/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_encapsulation.h +++ b/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_encapsulation.h @@ -28,7 +28,7 @@ namespace jit { // the subblock of a new placeholder node. The outputs of the new placeholder // node are used in place of the original nodes instead. The category of the // pattern is stored as attr::name. 
-TORCH_API c10::optional EncapsulatePatternIntoSubblock(Node* n); +TORCH_API std::optional EncapsulatePatternIntoSubblock(Node* n); } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/passes/onnx/peephole.cpp b/torch/csrc/jit/passes/onnx/peephole.cpp index 9e1c17120f654..73c19851e569b 100644 --- a/torch/csrc/jit/passes/onnx/peephole.cpp +++ b/torch/csrc/jit/passes/onnx/peephole.cpp @@ -101,7 +101,7 @@ std::vector getBroadcastPositions(Node* node) { // Determine whether `from` can broadcast to `to`, and if so at which // position. `from` must be a suffix of `to`, except that any // occurrences of 1 in `from` are treated as wildcards. -c10::optional fusibleExpandTo( +std::optional fusibleExpandTo( at::IntArrayRef from, at::IntArrayRef to) { if (from.size() > to.size()) { @@ -156,7 +156,7 @@ void fuseBroadcast(Block* b) { } // Not all broadcasts are supported by ONNX broadcast. - c10::optional axis = fusibleExpandTo( + std::optional axis = fusibleExpandTo( unexpanded_input->type() ->expectRef() .sizes() diff --git a/torch/csrc/jit/passes/onnx/scalar_type_analysis.cpp b/torch/csrc/jit/passes/onnx/scalar_type_analysis.cpp index 638acd464adcd..427e5771a9f0f 100644 --- a/torch/csrc/jit/passes/onnx/scalar_type_analysis.cpp +++ b/torch/csrc/jit/passes/onnx/scalar_type_analysis.cpp @@ -97,7 +97,7 @@ static bool IsImplicitCastSupported(const NodeKind& nodeKind) { IsSelectorOp(nodeKind); } -static c10::optional PromoteScalarTypes( +static std::optional PromoteScalarTypes( const std::vector& types) { if (types.empty()) { return c10::nullopt; @@ -112,7 +112,7 @@ static c10::optional PromoteScalarTypes( // Type promotion between scalars and tensors // per logic here // https://pytorch.org/docs/main/tensor_attributes.html#tensor-attributes -static c10::optional PromoteScalarTypesWithCategory( +static std::optional PromoteScalarTypesWithCategory( const std::vector& typesFromTensors, const std::vector& typesFromScalars) { auto typeFromTensor = PromoteScalarTypes(typesFromTensors); @@ -146,12 +146,12 @@ static c10::optional PromoteScalarTypesWithCategory( return typeFromTensor; } -static c10::optional InferExpectedScalarType(const Node* n) { +static std::optional InferExpectedScalarType(const Node* n) { std::vector typesFromTensors; std::vector typesFromScalars; auto get_scalar_type = - [](const Value* input) -> c10::optional { + [](const Value* input) -> std::optional { if (auto* tensor_type = input->type()->castRaw()) { return tensor_type->scalarType(); } @@ -252,7 +252,7 @@ static c10::optional InferExpectedScalarType(const Node* n) { } }); - c10::optional st = c10::nullopt; + std::optional st = c10::nullopt; const auto output_st = get_scalar_type(n->output()); if (IsComparisonOp(n->kind())) { @@ -280,7 +280,7 @@ static c10::optional InferExpectedScalarType(const Node* n) { return st; } -static c10::optional LowPrecisionCastForStandardOps( +static std::optional LowPrecisionCastForStandardOps( const Node* n, const c10::ScalarType& scalar_type) { // Some of standardOps do not support uint8\int8\int16 type for ONNX diff --git a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp index 186623bf4e049..dd79754f4c016 100644 --- a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp +++ b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp @@ -87,21 +87,24 @@ namespace onnx_torch = ::torch::onnx; namespace onnx = ::ONNX_NAMESPACE; namespace diagnostics = ::torch::onnx::diagnostics; +// SymbolDimMap is a Torch-to-ONNX shape look-up. 
This is built so it can be +// returned by the export function. During the export however, when we come +// across new ONNX shapes, the reverse look-up is needed. To avoid incurring +// a linear-time look-up, we maintain DimSymbolMap in parallel. c10::ShapeSymbol ONNXDimToShapeSymbol( const onnx::TensorShapeProto_Dimension& dim, - SymbolDimMap& symbol_dim_map) { + SymbolDimMap& symbol_dim_map, + DimSymbolMap& dim_symbol_map) { if (dim.has_dim_value()) { return c10::ShapeSymbol::fromStaticSize(dim.dim_value()); } - c10::optional sym = c10::nullopt; + std::optional sym = c10::nullopt; if (dim.has_dim_param()) { // If this param is already known, assign the same Symbol. GRAPH_UPDATE("Got dim_param:", dim.dim_param()); - for (const auto& pair : symbol_dim_map) { - if (pair.second == dim.dim_param()) { - sym = pair.first; - break; - } + auto maybe_symbol = dim_symbol_map.find(dim.dim_param()); + if (maybe_symbol != dim_symbol_map.end()) { + sym = maybe_symbol->second; } } if (!sym) { @@ -109,14 +112,16 @@ c10::ShapeSymbol ONNXDimToShapeSymbol( // If dim.dim_param() is empty, no need to keep track // because there won't be duplicates. symbol_dim_map[sym.value()] = dim.dim_param(); + dim_symbol_map[dim.dim_param()] = sym.value(); } return sym.value(); } TensorTypePtr TorchTensorTypeFromONNX( const onnx::TypeProto_Tensor& onnx_tensor_type, - SymbolDimMap& symbol_dim_map) { - c10::optional scalar_type; + SymbolDimMap& symbol_dim_map, + DimSymbolMap& dim_symbol_map) { + std::optional scalar_type; if (onnx_tensor_type.has_elem_type()) { scalar_type = ONNXTypeToATenType(onnx_tensor_type.elem_type()); } @@ -132,8 +137,8 @@ TensorTypePtr TorchTensorTypeFromONNX( const auto& onnx_shape = onnx_tensor_type.shape(); for (const auto i : c10::irange(onnx_shape.dim_size())) { - sizes.emplace_back( - ONNXDimToShapeSymbol(onnx_shape.dim(i), symbol_dim_map)); + sizes.emplace_back(ONNXDimToShapeSymbol( + onnx_shape.dim(i), symbol_dim_map, dim_symbol_map)); } v_type = TensorType::create(scalar_type, at::kCPU, sizes.size(), {}); v_type = v_type->withSymbolicShapes(c10::SymbolicShape(sizes)); @@ -150,13 +155,14 @@ TensorTypePtr TorchTensorTypeFromONNX( ListTypePtr TorchListTypeFromONNX( const onnx::TypeProto_Sequence& onnx_sequence_type, - SymbolDimMap& symbol_dim_map) { + SymbolDimMap& symbol_dim_map, + DimSymbolMap& dim_symbol_map) { if (onnx_sequence_type.has_elem_type()) { const auto& onnx_seq_elem_type = onnx_sequence_type.elem_type(); if (onnx_seq_elem_type.has_tensor_type()) { const auto& onnx_tensor_type = onnx_seq_elem_type.tensor_type(); - const auto v_tensor_type = - TorchTensorTypeFromONNX(onnx_tensor_type, symbol_dim_map); + const auto v_tensor_type = TorchTensorTypeFromONNX( + onnx_tensor_type, symbol_dim_map, dim_symbol_map); auto v_type = ListType::create(v_tensor_type); return v_type; } @@ -167,21 +173,22 @@ ListTypePtr TorchListTypeFromONNX( void UpdateTorchValueByOnnxValueInfo( Value* v, const onnx::ValueInfoProto& p_info, - SymbolDimMap& symbol_dim_map) { + SymbolDimMap& symbol_dim_map, + DimSymbolMap& dim_symbol_map) { if (!p_info.has_type()) { return; } const auto& p_type = p_info.type(); if (p_type.has_tensor_type()) { - const auto torch_tensor_type = - TorchTensorTypeFromONNX(p_type.tensor_type(), symbol_dim_map); + const auto torch_tensor_type = TorchTensorTypeFromONNX( + p_type.tensor_type(), symbol_dim_map, dim_symbol_map); if (torch_tensor_type) { MergeInferredTypeAndSetMap(v, v->type(), torch_tensor_type); } } else if (p_type.has_sequence_type()) { - const auto torch_list_type = - 
TorchListTypeFromONNX(p_type.sequence_type(), symbol_dim_map); + const auto torch_list_type = TorchListTypeFromONNX( + p_type.sequence_type(), symbol_dim_map, dim_symbol_map); if (torch_list_type) { MergeInferredTypeAndSetMap(v, v->type(), torch_list_type); } @@ -260,7 +267,7 @@ Value* CloneValueFromListConstruct( // is preserved. If the elemtype is Int, insert a onnx::Concat node into // the graph. TypePtr elem = v->type()->castRaw()->getElementType(); - c10::optional scalar_type = c10::nullopt; + std::optional scalar_type = c10::nullopt; if (elem->cast()) { scalar_type = at::kLong; if (isValidToTransformToONNXConcatNode(v->node())) { @@ -325,7 +332,7 @@ Node* CloneNodeToGraph( // Try to lookup input value and insert it into the graph. // If the input value is unknown, set it to graph input in the new // graph, and copy over metadata, such as datatype and shape. - ::c10::optional val = ::c10::nullopt; + ::std::optional val = ::c10::nullopt; auto v0 = params_dict.find(v->debugName()); if (v0 != params_dict.end()) { val = v0->second.toTensor(); @@ -377,6 +384,7 @@ void ConvertGraphToONNXProto( std::shared_ptr graph, std::shared_ptr& model_proto, SymbolDimMap& symbol_dim_map, + DimSymbolMap& dim_symbol_map, int opset_version) { RawDataExportMap export_map; bool val_use_external_data_format; @@ -402,12 +410,15 @@ void ConvertGraphToONNXProto( false, std::string()); symbol_dim_map.insert(new_symbol_dim_map.begin(), new_symbol_dim_map.end()); + for (const auto& pair : new_symbol_dim_map) { + dim_symbol_map[pair.second] = pair.first; + } for (int i = 0; i < model_proto->graph().output_size(); ++i) { model_proto->mutable_graph()->mutable_output(i)->clear_type(); } } -c10::optional ComputeConstantFolding(Node* n, int opset_version) { +std::optional ComputeConstantFolding(Node* n, int opset_version) { if (n->inputs().empty()) { return c10::nullopt; } @@ -437,7 +448,7 @@ c10::optional ComputeConstantFolding(Node* n, int opset_version) { } // Similar to the function above, but for symbolic shapes. 
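The new comment introducing DimSymbolMap (in the shape_type_inference.cpp hunk above) explains the motivation: SymbolDimMap maps Torch shape symbols to ONNX dim_param strings so it can be returned from the export, but during the export the lookup runs in the opposite direction, and scanning the forward map is linear per query. Keeping a reverse map in parallel makes that lookup constant time, provided both maps are filled and cleared together, as ONNXDimToShapeSymbol and ClearMaps now do. A small sketch of the get-or-create pattern with simplified types (integer ids instead of c10::ShapeSymbol; not the real maps):

// Sketch of a forward map plus a parallel reverse map, with simplified types.
#include <cassert>
#include <cstdint>
#include <string>
#include <unordered_map>

using Symbol = int64_t;                                   // stand-in for c10::ShapeSymbol
std::unordered_map<Symbol, std::string> symbol_dim_map;   // symbol -> dim_param
std::unordered_map<std::string, Symbol> dim_symbol_map;   // dim_param -> symbol (reverse)

Symbol symbolForDimParam(const std::string& dim_param) {
  // O(1) reverse lookup instead of scanning symbol_dim_map.
  auto it = dim_symbol_map.find(dim_param);
  if (it != dim_symbol_map.end()) {
    return it->second;
  }
  static Symbol next = 0;
  Symbol fresh = next++;
  symbol_dim_map[fresh] = dim_param;   // keep both maps in sync
  dim_symbol_map[dim_param] = fresh;
  return fresh;
}

int main() {
  Symbol a = symbolForDimParam("batch");
  Symbol b = symbolForDimParam("batch");    // same dim_param -> same symbol
  Symbol c = symbolForDimParam("seq_len");
  assert(a == b && a != c);
  return 0;
}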
-c10::optional<::c10::SymbolicShape> ComputeShapeFromReshape( +std::optional<::c10::SymbolicShape> ComputeShapeFromReshape( Node* n, const c10::SymbolicShape& input_shape, const c10::SymbolicShape& shape, @@ -549,7 +560,7 @@ c10::optional<::c10::SymbolicShape> ComputeShapeFromReshape( return final_shape_0; } -c10::optional<::c10::SymbolicShape> ComputeShapeFromExpand( +std::optional<::c10::SymbolicShape> ComputeShapeFromExpand( const std::vector<::c10::ShapeSymbol>& input_shape, const std::vector& reshape) { for (const auto& it : reshape) { @@ -588,7 +599,7 @@ c10::optional<::c10::SymbolicShape> ComputeShapeFromExpand( return shape; } -c10::optional<::c10::SymbolicShape> ComputeShapeFromTile( +std::optional<::c10::SymbolicShape> ComputeShapeFromTile( const std::vector<::c10::ShapeSymbol>& input_shape, const std::vector& reshape) { TORCH_INTERNAL_ASSERT( @@ -616,7 +627,7 @@ c10::optional<::c10::SymbolicShape> ComputeShapeFromTile( void UpdateRank(Value* value, size_t rank) { ConstantValueMap::SetRank(value->debugName(), rank); if (TensorTypePtr value_type = value->type()->cast()) { - c10::optional rank_opt = rank; + std::optional rank_opt = rank; auto shape = ::c10::SymbolicShape(rank_opt); value->setType(value_type->withSymbolicShapes(shape)); } @@ -662,7 +673,7 @@ void UpdateShapeConstantValueMap( } } -c10::optional> GetValueFromListConstructNode( +std::optional> GetValueFromListConstructNode( Node* lc_node) { std::vector shape_size; for (const auto& input : lc_node->inputs()) { @@ -676,7 +687,7 @@ c10::optional> GetValueFromListConstructNode( } } return lc_node->inputs().size() == shape_size.size() - ? c10::optional>(shape_size) + ? std::optional>(shape_size) : c10::nullopt; } @@ -1548,26 +1559,19 @@ bool IsListConstructIntType(const Value* v) { return false; } -bool AllGraphInputsStatic(const Graph* g) { - for (auto n : g->inputs()) { - if (TensorTypePtr input_type = n->type()->cast()) { - if (input_type->dim()) { - auto shape = input_type->symbolic_sizes(); - if (!ConstantValueMap::HasShape(n->debugName())) { - UpdateShapeConstantValueMap(n, shape); - } - } - } - } - for (auto n : g->inputs()) { - // Some inputs can be non-Tensor type, e.g., - // __torch__.torch.classes.quantized.LinearPackedParamsBase - // so we only need check Tensor type here. - if (n->type()->cast() && !n->isCompleteTensor()) { - return false; - } +// Check if all graph inputs are static and allow a cached value to return. +// Since this traverses all inputs of the graph (including weights), it can be +// costly for large graphs. Since this is called for each node in an export, +// and the inputs remain unchanged, we can cut down export time by caching. 
+bool AllGraphInputsStaticWithCaching(const Graph* g) { + auto maybe_is_static = ConstantValueMap::GetAllGraphInputsStatic(); + if (maybe_is_static.has_value()) { + return maybe_is_static.value(); + } else { + bool ret = AllGraphInputsStatic(g); + ConstantValueMap::SetAllGraphInputsStatic(ret); + return ret; } - return true; } void ProcessConstantValueMap(Node* n, int opset_version) { @@ -1581,7 +1585,7 @@ void ProcessConstantValueMap(Node* n, int opset_version) { // shapes UpdateReliable(n); - auto static_input_shape = AllGraphInputsStatic(n->owningGraph()); + auto static_input_shape = AllGraphInputsStaticWithCaching(n->owningGraph()); for (auto i : c10::irange(n->outputs().size())) { if (TensorTypePtr output_type = n->output(i)->type()->cast()) { if (output_type->dim().has_value()) { @@ -1803,7 +1807,8 @@ void UpdateOutputTypeByONNXProto( Node* n, Node* clone_node, const onnx::ModelProto& model_proto, - SymbolDimMap& symbol_dim_map) { + SymbolDimMap& symbol_dim_map, + DimSymbolMap& dim_symbol_map) { const auto& graph_proto = model_proto.graph(); // get data from value_info and updated original graph. @@ -1812,7 +1817,7 @@ void UpdateOutputTypeByONNXProto( for (size_t i = 0; i < n->outputs().size(); ++i) { if (clone_node->output(i)->debugName() == v_info.name()) { UpdateTorchValueByOnnxValueInfo( - n->output(i), v_info, symbol_dim_map); + n->output(i), v_info, symbol_dim_map, dim_symbol_map); } } }; @@ -1914,6 +1919,28 @@ void ONNXShapeTypeInference( static std::unordered_map> non_required_shape_inference_idx_map = {{"onnx::LSTM", {4}}}; +bool AllGraphInputsStatic(const Graph* g) { + for (auto n : g->inputs()) { + if (TensorTypePtr input_type = n->type()->cast()) { + if (input_type->dim()) { + auto shape = input_type->symbolic_sizes(); + if (!ConstantValueMap::HasShape(n->debugName())) { + UpdateShapeConstantValueMap(n, shape); + } + } + } + } + for (auto n : g->inputs()) { + // Some inputs can be non-Tensor type, e.g., + // __torch__.torch.classes.quantized.LinearPackedParamsBase + // so we only need check Tensor type here. + if (n->type()->cast() && !n->isCompleteTensor()) { + return false; + } + } + return true; +} + std::pair AreInputsReliableOrStatic(Node* n) { auto reliable = true; auto complete = true; @@ -2025,6 +2052,7 @@ void ONNXShapeTypeInference( auto& original_shape_data = ConstantValueMap::GetInferredShapeData(); ShapeDataMap inferred_shape_data; auto& symbol_dim_map = ConstantValueMap::GetSymbolDimMap(); + auto& dim_symbol_map = ConstantValueMap::GetDimSymbolMap(); SetGraphInputTypeReliable(n->owningGraph()); GRAPH_UPDATE( @@ -2079,7 +2107,7 @@ void ONNXShapeTypeInference( // e.g: ListConstruct, ListUnpack, etc. std::shared_ptr model_proto; ConvertGraphToONNXProto( - n_graph, model_proto, symbol_dim_map, opset_version); + n_graph, model_proto, symbol_dim_map, dim_symbol_map, opset_version); GRAPH_DEBUG( "ONNX graph to run shape inference: ", prettyPrint(*model_proto)); @@ -2104,7 +2132,7 @@ void ONNXShapeTypeInference( } } UpdateOutputTypeByONNXProto( - n, clone_node, *model_proto, symbol_dim_map); + n, clone_node, *model_proto, symbol_dim_map, dim_symbol_map); } catch (std::runtime_error& ex) { // TODO: include this as warning once we have a more consolidated // warning system. 
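AllGraphInputsStaticWithCaching, defined just above, is plain memoization: the answer depends only on the graph inputs, which do not change during an export, yet the full check walks every input (including weights) and was previously rerun for every node. Caching the result in a std::optional<bool> inside ConstantValueMap, and resetting it in ClearMaps, reduces that to one traversal per export. A minimal sketch of the same shape, with a stand-in for the expensive graph walk:

// Minimal memoization sketch mirroring AllGraphInputsStaticWithCaching;
// the expensive check is a stand-in for walking all graph inputs.
#include <iostream>
#include <optional>

struct InputsStaticCache {
  std::optional<bool> all_inputs_static;             // nullopt = not computed yet
  void clear() { all_inputs_static = std::nullopt; } // analogous to ClearMaps()
};

bool expensiveAllInputsStaticCheck() {
  std::cout << "(running full input scan)\n";
  return true;  // pretend every input had a complete static shape
}

bool allInputsStaticWithCaching(InputsStaticCache& cache) {
  if (cache.all_inputs_static.has_value()) {
    return *cache.all_inputs_static;                 // cache hit: no rescan
  }
  bool ret = expensiveAllInputsStaticCheck();
  cache.all_inputs_static = ret;
  return ret;
}

int main() {
  InputsStaticCache cache;
  // Called once per node during export; the scan runs only the first time.
  for (int node = 0; node < 3; ++node) {
    std::cout << "node " << node << ": " << allInputsStaticWithCaching(cache) << "\n";
  }
}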
@@ -2146,8 +2174,8 @@ void ONNXShapeTypeInference( int rank = inferred_shape.dim_size(); std::vector<::c10::ShapeSymbol> final_shape(rank); for (int i = 0; i < rank; ++i) { - final_shape[i] = - ONNXDimToShapeSymbol(inferred_shape.dim(i), symbol_dim_map); + final_shape[i] = ONNXDimToShapeSymbol( + inferred_shape.dim(i), symbol_dim_map, dim_symbol_map); } c10::SymbolicShape shape_value(final_shape); // Store data propagation result into shapeValueMap diff --git a/torch/csrc/jit/passes/onnx/shape_type_inference.h b/torch/csrc/jit/passes/onnx/shape_type_inference.h index 03e927a01bff4..685ca39c16dec 100644 --- a/torch/csrc/jit/passes/onnx/shape_type_inference.h +++ b/torch/csrc/jit/passes/onnx/shape_type_inference.h @@ -86,6 +86,7 @@ TORCH_API void ONNXShapeTypeInference( const ParamMap& params_dict, int opset_version); +bool AllGraphInputsStatic(const Graph* g); std::pair AreInputsReliableOrStatic(Node* n); void UpdateReliable( torch::jit::Value* output, diff --git a/torch/csrc/jit/passes/onnx/unpack_quantized_weights.cpp b/torch/csrc/jit/passes/onnx/unpack_quantized_weights.cpp index 9270028b98808..7390bea56e77b 100644 --- a/torch/csrc/jit/passes/onnx/unpack_quantized_weights.cpp +++ b/torch/csrc/jit/passes/onnx/unpack_quantized_weights.cpp @@ -30,7 +30,7 @@ using namespace ::c10::onnx; // we traverse up the graph to get the scale from its input until we hit a node // where scale is explicitly specified. double getScaleFromInput(Node* input_node) { - c10::optional scale; + std::optional scale; std::string input_name = input_node->kind().toQualString(); std::unordered_set noscale_ops = { "quantized::max_pool2d", @@ -332,7 +332,7 @@ void unpackQuantizedWeightsHelper( "getValues: Quantized weight value not found amongst constant parameters."); } at::Tensor unpacked_weight; - c10::optional bias; + std::optional bias; constexpr int64_t stride_idx = 2; constexpr int64_t padding_idx = 3; int64_t output_padding_idx; @@ -346,10 +346,10 @@ void unpackQuantizedWeightsHelper( dilation_idx = 4; groups_idx = 5; } - c10::optional> stride, padding, dilation, + std::optional> stride, padding, dilation, output_padding; - c10::optional groups; - c10::optional transpose; + std::optional groups; + std::optional transpose; torch::List stride_int, padding_int, dilation_int, output_padding_int; @@ -371,9 +371,9 @@ void unpackQuantizedWeightsHelper( TORCH_INTERNAL_ASSERT(elements.size() == 3, "Wrong tuple size."); auto config_vals = elements[1].to>(); - auto tensors = elements[2].to>>(); + auto tensors = elements[2].to>>(); - c10::optional weight = tensors[1]; + std::optional weight = tensors[1]; TORCH_INTERNAL_ASSERT( weight, "Weight should always be present in serialized qconv."); unpacked_weight = *weight; @@ -534,7 +534,7 @@ void unpackQuantizedWeightsHelper( at::Tensor packed_weight = itr->second.toTensor(); auto op = Dispatcher::singleton() .findSchemaOrThrow(unpack_fn.c_str(), "") - .typed>( + .typed>( at::Tensor)>(); std::tie(unpacked_weight, bias) = op.call(packed_weight); } @@ -598,7 +598,7 @@ void unpackQuantizedWeightsHelper( if (stride.has_value() && padding.has_value() && dilation.has_value() && groups.has_value() && (!expect_output_padding || output_padding.has_value())) { - std::vector>> conv_ints_args; + std::vector>> conv_ints_args; conv_ints_args.push_back(stride); conv_ints_args.push_back(padding); if (expect_output_padding) { diff --git a/torch/csrc/jit/passes/peephole.cpp b/torch/csrc/jit/passes/peephole.cpp index b1e38697ef59d..aa4e2176f1905 100644 --- a/torch/csrc/jit/passes/peephole.cpp +++ 
b/torch/csrc/jit/passes/peephole.cpp @@ -19,7 +19,7 @@ namespace jit { // Conservatively compare two optionals. If both are undefined, assume // they aren't equal template -static bool mustBeEqual(const c10::optional& a, const c10::optional& b) { +static bool mustBeEqual(const std::optional& a, const c10::optional& b) { return a == b && a.has_value(); } diff --git a/torch/csrc/jit/passes/peephole_dict_idioms.cpp b/torch/csrc/jit/passes/peephole_dict_idioms.cpp index 4e2a56a9d06bd..d3a5cfa36261b 100644 --- a/torch/csrc/jit/passes/peephole_dict_idioms.cpp +++ b/torch/csrc/jit/passes/peephole_dict_idioms.cpp @@ -125,7 +125,7 @@ class DictNode { return 0; } - c10::optional getOrNullopt(const IValue& key) const { + std::optional getOrNullopt(const IValue& key) const { if (impl_ && impl_->contains(key)) { return impl_->get(key); } @@ -181,7 +181,7 @@ class PeepholeOptimizeDictIdiomsImpl { return cached->second; } - c10::optional getValueFromDict(Node* dict_creation_node, Value* key) { + std::optional getValueFromDict(Node* dict_creation_node, Value* key) { const DictNode& dict_node = getDictNode(dict_creation_node); auto key_opt = toIValue(key); // Key is not constant if we cannot convert to IValue @@ -195,7 +195,7 @@ class PeepholeOptimizeDictIdiomsImpl { return c10::nullopt; } - c10::optional computeLen(Node* dict_creation_node) { + std::optional computeLen(Node* dict_creation_node) { const DictNode& dict_node = getDictNode(dict_creation_node); if (dict_node.canOptimize()) { return static_cast(dict_node.size()); diff --git a/torch/csrc/jit/passes/peephole_list_idioms.cpp b/torch/csrc/jit/passes/peephole_list_idioms.cpp index 15f4c807335fd..9c106e13edf1f 100644 --- a/torch/csrc/jit/passes/peephole_list_idioms.cpp +++ b/torch/csrc/jit/passes/peephole_list_idioms.cpp @@ -14,7 +14,7 @@ namespace torch { namespace jit { -static c10::optional normalizeIndex(int64_t index, size_t len) { +static std::optional normalizeIndex(int64_t index, size_t len) { if (index < 0) { index = index + len; } @@ -129,7 +129,7 @@ struct ListLenRefiner { return block_refinements; }; - c10::optional tryFindRefinement(Value* v) { + std::optional tryFindRefinement(Value* v) { for (const auto& ref : active_refinements_) { auto maybe_refinement = ref->find(v); if (maybe_refinement != ref->end()) { diff --git a/torch/csrc/jit/passes/peephole_non_tensor.cpp b/torch/csrc/jit/passes/peephole_non_tensor.cpp index 5cd2b6c2ee65d..5fa9c89b1fb0e 100644 --- a/torch/csrc/jit/passes/peephole_non_tensor.cpp +++ b/torch/csrc/jit/passes/peephole_non_tensor.cpp @@ -19,7 +19,7 @@ namespace { * @post if there's one constant in two operands, then the second operand is * constant. 
*/ -c10::optional checkArithNode(Node& node) { +std::optional checkArithNode(Node& node) { if (node.inputs().size() != 2 || node.input(0)->type() != IntType::get() || node.input(1)->type() != IntType::get()) { return {}; diff --git a/torch/csrc/jit/passes/quantization/helper.cpp b/torch/csrc/jit/passes/quantization/helper.cpp index a4ac1f6fe4be9..8a74ec01086a5 100644 --- a/torch/csrc/jit/passes/quantization/helper.cpp +++ b/torch/csrc/jit/passes/quantization/helper.cpp @@ -235,7 +235,7 @@ std::vector _propagate_quant_binary_ops = { bool matchAtenFuncToUse( const Use& use, const std::string& func_name, - c10::optional n) { + std::optional n) { Node* node = use.user; return node->kind() == Symbol::aten(func_name) && (!n.has_value() || static_cast(n.value()) == use.offset); @@ -244,7 +244,7 @@ bool matchAtenFuncToUse( bool matchCallFuncToUse( const Use& use, const std::string& func_name, - c10::optional n) { + std::optional n) { Node* node = use.user; return node->kind() == prim::CallFunction && getFuncName(node->inputs()[0]) == func_name && @@ -316,7 +316,7 @@ bool isEmbeddingBagNonInput(Value* v) { return result; } -c10::optional getClampScalarInputUse(Value* v) { +std::optional getClampScalarInputUse(Value* v) { for (const auto& use : v->uses()) { for (const auto& aten_func : _clamp_funcs) { if (matchAtenFuncToUse(use, aten_func, 1) || @@ -493,7 +493,7 @@ bool isBinaryOpWithScalarInput(Node* n) { return isPropagateQuantBinaryOp(n) && isScalar(n->input(1)); } -c10::optional> getFixedQParams(Node* n) { +std::optional> getFixedQParams(Node* n) { static std::vector fixed_qparam_funcs; std::transform( _fixed_qparams_map.begin(), @@ -642,7 +642,7 @@ Module getInvokedModule(Module& module, Node* n, Value* self) { return findChildModule(module, path); } -c10::optional getInvokedModuleOpt( +std::optional getInvokedModuleOpt( const Module& module, Node* n, Value* self) { @@ -686,7 +686,7 @@ std::string removeTorchMangle(const std::string& orig_name) { return qualified_name; } -c10::optional getModuleName(Value* value) { +std::optional getModuleName(Value* value) { auto type = value->type()->cast(); if (type && type->name()) { return removeTorchMangle(type->name()->qualifiedName()); diff --git a/torch/csrc/jit/passes/quantization/helper.h b/torch/csrc/jit/passes/quantization/helper.h index b5a5adf40b65c..680e3c7ca43d5 100644 --- a/torch/csrc/jit/passes/quantization/helper.h +++ b/torch/csrc/jit/passes/quantization/helper.h @@ -32,7 +32,7 @@ TORCH_API bool isBiasOfConvOrLinear(Value* v); TORCH_API bool isEmbeddingBagNonInput(Value* v); // Get the use as scalar input of clamp ops for the input value -c10::optional getClampScalarInputUse(Value* v); +std::optional getClampScalarInputUse(Value* v); // For a given value `v`, get the list of values that we need to check // if they are observed/quantized or not, if so, we can say the @@ -59,7 +59,7 @@ TORCH_API bool hitGraphInput(Value* value); TORCH_API std::string removeTorchMangle(const std::string& orig_name); // Return the module name that corresponds to the value. 
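In the quantization/helper.cpp hunks above, matchAtenFuncToUse and matchCallFuncToUse take an optional argument-position parameter: passing a value pins the use to a specific argument slot, while passing nullopt accepts any slot. A compact sketch of that optional-filter pattern with simplified stand-in types (not the real Use/Node classes):

// Sketch of an optional position filter, as in matchAtenFuncToUse's nth_arg:
// nullopt means "match any argument position", a value pins a specific slot.
#include <cstddef>
#include <iostream>
#include <optional>
#include <string>

struct Use {
  std::string func_name;
  size_t offset;  // which argument slot of the call this use occupies
};

bool matchFuncToUse(const Use& use,
                    const std::string& func_name,
                    std::optional<size_t> nth_arg) {
  return use.func_name == func_name &&
         (!nth_arg.has_value() || nth_arg.value() == use.offset);
}

int main() {
  Use u{"clamp", 1};
  std::cout << matchFuncToUse(u, "clamp", std::nullopt) << "\n";  // 1: any slot
  std::cout << matchFuncToUse(u, "clamp", 1) << "\n";             // 1: slot matches
  std::cout << matchFuncToUse(u, "clamp", 0) << "\n";             // 0: wrong slot
}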
-TORCH_API c10::optional getModuleName(Value* value); +TORCH_API std::optional getModuleName(Value* value); // =========== helper functions for Node ========= TORCH_API bool isSingleInputGeneralShapeAtenFunction(Node* n); @@ -91,7 +91,7 @@ TORCH_API bool isPropagateQuantOp(Node* n); // quantized::{op}_scalar TORCH_API bool isBinaryOpWithScalarInput(Node* n); -TORCH_API c10::optional> getFixedQParams( +TORCH_API std::optional> getFixedQParams( Node* n); // We don't want to analyze the graph for some `builtin` CallFunctions @@ -121,14 +121,14 @@ TORCH_API std::shared_ptr getCallFunctionGraph(Node* n); bool matchCallFuncToUse( const Use& use, const std::string& func_name, - c10::optional nth_arg); + std::optional nth_arg); // Check if `use` is a AtenFunction of name `func_name` and if value // `v` is the nth argument (if provided) of the function bool matchAtenFuncToUse( const Use& use, const std::string& func_name, - c10::optional nth_arg); + std::optional nth_arg); // =========== helper functions for Block ========= // checks if a block will always raise an Exception @@ -151,7 +151,7 @@ TORCH_API Module getInvokedModule(Module& module, Node* n, Value* self); // Given an CallMethod node, get the module instance corresponding // to the instance Value if the instance is a module, otherwise return // c10::nullopt -c10::optional getInvokedModuleOpt( +std::optional getInvokedModuleOpt( const Module& module, Node* n, Value* self); diff --git a/torch/csrc/jit/passes/quantization/insert_observers.cpp b/torch/csrc/jit/passes/quantization/insert_observers.cpp index f514fbc193ddd..e5df64f1929c7 100644 --- a/torch/csrc/jit/passes/quantization/insert_observers.cpp +++ b/torch/csrc/jit/passes/quantization/insert_observers.cpp @@ -20,12 +20,12 @@ namespace torch { namespace jit { -using ModuleQConfigMap = std::unordered_map>; +using ModuleQConfigMap = std::unordered_map>; namespace { struct OptionalQConfigHash { - inline size_t operator()(const c10::optional& qconfig_opt) const { + inline size_t operator()(const std::optional& qconfig_opt) const { if (qconfig_opt.has_value()) { const auto& m1 = std::get<0>(*qconfig_opt); const auto& m2 = std::get<1>(*qconfig_opt); @@ -36,9 +36,9 @@ struct OptionalQConfigHash { } }; using QConfigTypePtrMap = - std::unordered_map, TypePtr, OptionalQConfigHash>; + std::unordered_map, TypePtr, OptionalQConfigHash>; using NameModuleVector = std::vector>; -using OptionalModuleVector = std::vector>; +using OptionalModuleVector = std::vector>; using ModuleMethodVector = std::vector>; using graph_rewrite_helper::PatternInfo; using graph_rewrite_helper::replaceConvolutionWithAtenConv; @@ -49,8 +49,8 @@ void fillQConfigMap( const QConfigDict& qconfig_dict, ModuleQConfigMap& map, const std::string& key = "", - const c10::optional& parent_qconfig = c10::nullopt) { - c10::optional qconfig; + const std::optional& parent_qconfig = c10::nullopt) { + std::optional qconfig; if (qconfig_dict.find(key) != qconfig_dict.end()) { GRAPH_DEBUG("Got module config for key:", key); qconfig = qconfig_dict.at(key); @@ -187,7 +187,7 @@ class ModuleCloneHelper { const Module& source, Module& target, const ModuleQConfigMap& module_qconfig_map, - const std::function)>& + const std::function)>& type_remap_fn) { // remap of %self will be done outside of the function // and we don't support the case when people pass in @@ -239,7 +239,7 @@ class ModuleCloneHelper { const Module& source, Module& target, const ModuleQConfigMap& module_qconfig_map, - const std::function)>& + const std::function)>& type_remap_fn) { 
remapTypes( graph->block(), @@ -257,7 +257,7 @@ class ModuleCloneHelper { const ModuleQConfigMap& module_qconfig_map, const std::unordered_map& type_remap) { auto type_remap_fn = [&](TypePtr type_ptr, - const c10::optional& qconfig) { + const std::optional& qconfig) { if (type_remap.find(type_ptr) != type_remap.end()) { const auto& qconfig_map = type_remap.at(type_ptr); if (qconfig_map.find(qconfig) != qconfig_map.end()) { @@ -401,7 +401,7 @@ class InsertObserversHelper { // Uses the state created by fillBoundaryValueMap and fillValueObserverMap // to return an observer configured for a value, if it is needed. - c10::optional getObserverFor(Value* v); + std::optional getObserverFor(Value* v); // Uses the state created by fillPassThroughValueMap to propagage observed // property which should pass through from inputs to outputs. @@ -1312,13 +1312,13 @@ void InsertObserversHelper::fillValueObserverMap( } } -c10::optional InsertObserversHelper::getObserverFor(Value* v) { +std::optional InsertObserversHelper::getObserverFor(Value* v) { if (observer_for_value_.count(v)) { auto observer = observer_for_value_.at(v); GRAPH_DEBUG("Got observer module config for:", v->debugName()); return observer; } - c10::optional result; + std::optional result; if (boundary_value_map_.count(v)) { for (Value* next : boundary_value_map_.at(v)) { GRAPH_DEBUG( @@ -1384,9 +1384,9 @@ InsertObserversHelper::insertObserversFor( // the graph itself can be shared std::unordered_set inputs_outputs; // list of observer modules for input values - std::vector> block_input_observers; + std::vector> block_input_observers; // list of observer modules for output values - std::vector> block_output_observers; + std::vector> block_output_observers; // if the current block is the block for entry point graph(the forward graph // of the top level module), we can insert observers in the block directly diff --git a/torch/csrc/jit/passes/quantization/insert_observers.h b/torch/csrc/jit/passes/quantization/insert_observers.h index 6fa7fe0449112..e8857318261c8 100644 --- a/torch/csrc/jit/passes/quantization/insert_observers.h +++ b/torch/csrc/jit/passes/quantization/insert_observers.h @@ -18,7 +18,7 @@ namespace torch { namespace jit { using QConfig = std::tuple; -using QConfigDict = std::unordered_map>; +using QConfigDict = std::unordered_map>; /** \brief Insert observer module and observer function call for * the Tensors that needs to be observed. diff --git a/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp b/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp index 93683a308dc86..02f4f10969760 100644 --- a/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp +++ b/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp @@ -59,7 +59,7 @@ bool isWeight(Module& module, Value* v) { if (isWeight(v)) { return true; } - c10::optional result; + std::optional result; auto* self = v->owningGraph()->inputs()[0]; for (const Use& u : v->uses()) { Node* n = u.user; @@ -221,7 +221,7 @@ Node* insertFP16CastOps(Graph* graph, Value* observer_out) { } // find the observer for Value `v` and return the name of the observer -c10::optional findObserverName(Value* v) { +std::optional findObserverName(Value* v) { // Note that here we just check for the name of observer, but the ideally // we should be comparing the type of observer, this is a temporary // work around until data only clone of module.clone is supported. 
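The insert_observers.cpp hunks earlier in this block key a per-type map on an optional QConfig, so OptionalQConfigHash has to cover both the engaged case (hashing the two observer modules) and the empty case. A reduced sketch of using std::optional as an unordered_map key with a custom hasher, with a pair of ints standing in for the real QConfig:

// Sketch: using std::optional as an unordered_map key with a custom hasher.
// FakeQConfig is a stand-in for the real QConfig tuple of observer modules.
#include <functional>
#include <iostream>
#include <optional>
#include <string>
#include <unordered_map>
#include <utility>

using FakeQConfig = std::pair<int, int>;  // stand-in for (activation, weight) observers

struct OptionalQConfigHash {
  size_t operator()(const std::optional<FakeQConfig>& qconfig_opt) const {
    if (qconfig_opt.has_value()) {
      // Combine the hashes of both members when a qconfig is present.
      return std::hash<int>()(qconfig_opt->first) ^
             (std::hash<int>()(qconfig_opt->second) << 1);
    }
    return 0;  // all empty qconfigs hash (and compare) the same
  }
};

int main() {
  std::unordered_map<std::optional<FakeQConfig>, std::string, OptionalQConfigHash> m;
  m[std::nullopt] = "no qconfig: type left unquantized";
  m[FakeQConfig{1, 2}] = "per-module qconfig A";
  std::cout << m[std::nullopt] << "\n";
  std::cout << m[FakeQConfig{1, 2}] << "\n";
}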
@@ -258,7 +258,7 @@ at::ScalarType getObserverDtype(Module& module, Value* v) { return at::ScalarType::Undefined; } -c10::optional getEmbeddingBagObsName( +std::optional getEmbeddingBagObsName( script::Module& module, Node* n) { Value* v = n->output(); @@ -273,7 +273,7 @@ c10::optional getEmbeddingBagObsName( bool isEmbeddingBagOp( Node* observer, - c10::optional embedding_bag_name) { + std::optional embedding_bag_name) { return embedding_bag_name && embedding_bag_name.value().find("embedding_bag_") != std::string::npos; } @@ -791,7 +791,7 @@ class InsertQuantDeQuantHelper { Value* original_output, const std::vector& inputs, bool is_scalar = false, - const c10::optional>& qparams_opt = + const std::optional>& qparams_opt = c10::nullopt); bool isQuantized(Value* v) { @@ -1125,7 +1125,7 @@ ModuleMethodVector InsertQuantDeQuantHelper::getInvokedMethods( if (n->kind() == prim::CallMethod) { auto module_instance = n->inputs()[0]; auto module_method_name = n->s(attr::name); - c10::optional m; + std::optional m; // calling method on self if (module_instance == graph->inputs()[0]) { m = module; @@ -1152,7 +1152,7 @@ void InsertQuantDeQuantHelper::propagateQParams( Value* original_output, const std::vector& inputs, bool is_scalar, - const c10::optional>& qparams_opt) { + const std::optional>& qparams_opt) { Node* n = original_output->node(); Graph* graph = n->owningGraph(); if (is_scalar) { @@ -1248,7 +1248,7 @@ void removeDequantizeFromInputs(const std::unordered_set& inputs) { // Check if we need to propagate the quantization ops from input to // output -c10::optional> getDequantizedInputs(Value* output) { +std::optional> getDequantizedInputs(Value* output) { auto inputs = getPassThroughInputs(output); if (!inputs.empty()) { // note that we don't need to recursively check for prim::If diff --git a/torch/csrc/jit/passes/remove_mutation.cpp b/torch/csrc/jit/passes/remove_mutation.cpp index 183c7894f0867..84b990f628336 100644 --- a/torch/csrc/jit/passes/remove_mutation.cpp +++ b/torch/csrc/jit/passes/remove_mutation.cpp @@ -360,7 +360,7 @@ bool RemoveListMutation(const std::shared_ptr& graph) { bool RemoveTensorMutation( const std::shared_ptr& graph, - c10::optional> mutation_filter) { + std::optional> mutation_filter) { MutationRemover mr(graph, std::move(mutation_filter)); return mr.removeTensorMutation(); } diff --git a/torch/csrc/jit/passes/remove_mutation.h b/torch/csrc/jit/passes/remove_mutation.h index eb8cf195ee4ca..be8fc12b11f3d 100644 --- a/torch/csrc/jit/passes/remove_mutation.h +++ b/torch/csrc/jit/passes/remove_mutation.h @@ -11,7 +11,7 @@ namespace jit { struct TORCH_API MutationRemover { MutationRemover( std::shared_ptr graph, - c10::optional> mutation_filter = c10::nullopt) + std::optional> mutation_filter = c10::nullopt) : mutation_filter_(mutation_filter), aliasDb_(nullptr), graph_(std::move(graph)) {} @@ -55,7 +55,7 @@ struct TORCH_API MutationRemover { return aliasDb_.get(); } - c10::optional> mutation_filter_; + std::optional> mutation_filter_; std::unique_ptr aliasDb_ = nullptr; std::shared_ptr graph_; }; @@ -71,7 +71,7 @@ TORCH_API bool RemoveListMutation(const std::shared_ptr& graph); // return true if graph is modified TORCH_API bool RemoveTensorMutation( const std::shared_ptr& graph, - c10::optional> mutation_filter = c10::nullopt); + std::optional> mutation_filter = c10::nullopt); // Replaces in-place aten activation ops with their functional equivalence TORCH_API bool InplaceToFunctionalActivation( diff --git a/torch/csrc/jit/passes/replacement_of_old_operators.cpp 
b/torch/csrc/jit/passes/replacement_of_old_operators.cpp index 430cd4f743fdc..38255ad141877 100644 --- a/torch/csrc/jit/passes/replacement_of_old_operators.cpp +++ b/torch/csrc/jit/passes/replacement_of_old_operators.cpp @@ -30,7 +30,7 @@ struct OldOpsReplacerWithUpgraders { Node* node = graph_it.next(); while (node) { // load the schema name for this op - c10::optional schema_name = c10::nullopt; + std::optional schema_name = c10::nullopt; if (auto op_schema = node->maybeSchema()) { schema_name = getFullSchemaName(*op_schema); } else { diff --git a/torch/csrc/jit/passes/shape_analysis.cpp b/torch/csrc/jit/passes/shape_analysis.cpp index 706a17bf13e02..abc7bb6411dba 100644 --- a/torch/csrc/jit/passes/shape_analysis.cpp +++ b/torch/csrc/jit/passes/shape_analysis.cpp @@ -153,7 +153,7 @@ bool containsTensorType(const TypePtr& t) { // for each node in the schema with type Tensor, extract the T type // returns c10::nullopt if any Tensor in the schema does not have a known // shape ignores non-tensor in the list of inputs -c10::optional> gatherTensorTypes( +std::optional> gatherTensorTypes( Node* node, bool complete = false) { std::vector tensor_types; @@ -209,7 +209,7 @@ c10::ScalarType unionScalarTypes( // new type promotion logic. See tensor_attributes.rst for details. // This doesn't handle the case of arithmetic ops with Scalar arguments (when // `Tensor.getUnsafeTensorImpl()->is_wrapped_number()` would return true) -c10::optional getPromotedTypeForArithmeticOp(Node* node) { +std::optional getPromotedTypeForArithmeticOp(Node* node) { c10::ScalarType dimmed = c10::ScalarType::Undefined; c10::ScalarType zerodim = c10::ScalarType::Undefined; // binary arithmetic ops, more than 2 args is alpha. @@ -741,7 +741,7 @@ class ShapePropagator : public PropertyPropBase { return setUnshapedType(node); } - static c10::optional determineListSize(Value* list) { + static std::optional determineListSize(Value* list) { AT_ASSERT(list->type()->cast()); if (auto shape = constant_as>(list)) { return shape->size(); @@ -769,7 +769,7 @@ class ShapePropagator : public PropertyPropBase { bool PropagateTensorShapeOnNode(Node* node, bool insert_expands) { static const auto broadcast = [](std::vector& tensor_types, - c10::optional t) -> TensorTypePtr { + std::optional t) -> TensorTypePtr { if (tensor_types.size() == 1) { return tensor_types[0]->dimensionedOnly()->withScalarType(t); } @@ -1244,7 +1244,7 @@ class ShapePropagator : public PropertyPropBase { static const auto reduce_op_handler = [](Node* node, int64_t num_reduced_dim = 0, bool upcast_integer = false, - c10::optional opt_dtype = + std::optional opt_dtype = c10::nullopt) -> type_vec_t { if (auto type = node->input(0)->type()->cast()) { if (!type->scalarType() || !type->dim()) { diff --git a/torch/csrc/jit/passes/symbolic_shape_analysis.cpp b/torch/csrc/jit/passes/symbolic_shape_analysis.cpp index 96aa425b291a1..1765e65d02a6e 100644 --- a/torch/csrc/jit/passes/symbolic_shape_analysis.cpp +++ b/torch/csrc/jit/passes/symbolic_shape_analysis.cpp @@ -65,7 +65,7 @@ namespace jit { struct ShapeArg : public std:: - pair, c10::optional> { + pair, c10::optional> { using pair::pair; static ShapeArg unknownInteger() { @@ -87,11 +87,11 @@ struct ShapeArg } } - c10::optional asConstantInt() const { + std::optional asConstantInt() const { return this->second; } - c10::optional asShapeSymbol() const { + std::optional asShapeSymbol() const { return this->first; } @@ -208,7 +208,7 @@ bool isListOfTensors(const TypePtr& type) { type->cast()->getElementType()->cast(); } 
-c10::optional normIndex(int64_t index, size_t len) { +std::optional normIndex(int64_t index, size_t len) { if (index < 0) { index = index + len; } @@ -255,7 +255,7 @@ c10::SymbolicShape extractListShape( return c10::SymbolicShape(); } Node* list_construct = list->node(); - std::vector> output_shape; + std::vector> output_shape; for (Value* input : list_construct->inputs()) { if (symbolic_shape_values.count(input)) { output_shape.emplace_back(symbolic_shape_values[input]); @@ -605,7 +605,7 @@ struct SymbolicShapeOpAnalyzer { shape_compute_graph_ = graph->copy(); } - c10::optional> run( + std::optional> run( std::vector& inputs) { if (!shape_compute_graph_) { return c10::nullopt; @@ -813,7 +813,7 @@ struct SymbolicShapeGraphAnalyzer { beg_->owningBlock() == end_->owningBlock() && end_->isAfter(beg_)); } - c10::optional run() { + std::optional run() { AliasDb db(graph_); std::unordered_map> partial_evaluated_graphs = propagateShapesAndGatherPartialEvalShapeGraphs(db); @@ -1120,7 +1120,7 @@ void PropagateShapesOnGraph(std::shared_ptr& graph) { PropagateShapesOnBlock(graph->block(), db); } -c10::optional +std::optional PropagateShapesAndBuildLargeShapeComputeGraph( std::shared_ptr& graph, Node* beg, @@ -1128,7 +1128,7 @@ PropagateShapesAndBuildLargeShapeComputeGraph( return SymbolicShapeGraphAnalyzer(graph, beg, end).run(); } -TORCH_API c10::optional> +TORCH_API std::optional> calculateSymbolicShapesOnOp( const FunctionSchema* schema, const std::vector& inputs) { diff --git a/torch/csrc/jit/passes/symbolic_shape_analysis.h b/torch/csrc/jit/passes/symbolic_shape_analysis.h index 824740792aaf0..f5a17f2c5e550 100644 --- a/torch/csrc/jit/passes/symbolic_shape_analysis.h +++ b/torch/csrc/jit/passes/symbolic_shape_analysis.h @@ -36,7 +36,7 @@ struct ShapeComputeGraphMapping { std::unordered_map graph_output_to_symbolic_shape_dim_; }; -TORCH_API c10::optional +TORCH_API std::optional PropagateShapesAndBuildLargeShapeComputeGraph( std::shared_ptr& graph, Node* beg, @@ -50,7 +50,7 @@ TORCH_API bool setSymbolicShapeAnalysisTestMode(bool value); TORCH_API bool symbolicShapeAnalysisTestModeEnabled(); using SSAInput = std::variant; -TORCH_API c10::optional> +TORCH_API std::optional> calculateSymbolicShapesOnOp( const FunctionSchema* schema, const std::vector& inputs); diff --git a/torch/csrc/jit/passes/symbolic_shape_cache.cpp b/torch/csrc/jit/passes/symbolic_shape_cache.cpp index be8179f18786d..4a742b3f5f635 100644 --- a/torch/csrc/jit/passes/symbolic_shape_cache.cpp +++ b/torch/csrc/jit/passes/symbolic_shape_cache.cpp @@ -109,7 +109,7 @@ TORCH_API void cache_shape_function( shapeCache.Add(std::move(cache_key), std::move(can_ret_vec)); } -TORCH_API c10::optional> +TORCH_API std::optional> get_cached_shape_function( const FunctionSchema* schema, const std::vector& arg_vec) { diff --git a/torch/csrc/jit/passes/symbolic_shape_cache.h b/torch/csrc/jit/passes/symbolic_shape_cache.h index 02e00acac08d2..b842c731c0ce4 100644 --- a/torch/csrc/jit/passes/symbolic_shape_cache.h +++ b/torch/csrc/jit/passes/symbolic_shape_cache.h @@ -31,7 +31,7 @@ struct TORCH_API CanonicalizedSymbolicShape { const CanonicalizedSymbolicShape& b); private: - c10::optional> values_; + std::optional> values_; void init( const c10::SymbolicShape& orig_shape, @@ -39,7 +39,7 @@ struct TORCH_API CanonicalizedSymbolicShape { }; // SHAPE CACHE API -TORCH_API c10::optional> +TORCH_API std::optional> get_cached_shape_function( const FunctionSchema* schema, const std::vector& arg_vec); diff --git 
a/torch/csrc/jit/passes/symbolic_shape_runtime_fusion.cpp b/torch/csrc/jit/passes/symbolic_shape_runtime_fusion.cpp index b4902a1d5a0d4..9c213f2480d51 100644 --- a/torch/csrc/jit/passes/symbolic_shape_runtime_fusion.cpp +++ b/torch/csrc/jit/passes/symbolic_shape_runtime_fusion.cpp @@ -178,7 +178,7 @@ static StrideInput summarizeOutputStrides(const TensorType& tt) { // Also summarize input striding behavior. The Size information is stored on the // type, The striding is returned. See StrideInput for description of stride // specializations -static c10::optional>> +static std::optional>> TryGeneralizeInputDimensionsToSymbolicShapes( std::shared_ptr tensorexpr_graph) { std::map shape_to_sym_shape; diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp index cd95af3424dc2..c9b9b974600dc 100644 --- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp +++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp @@ -780,7 +780,7 @@ class TensorExprFuser { } } - c10::optional tryMerge(Node* fusion_group, Node* to_merge) { + std::optional tryMerge(Node* fusion_group, Node* to_merge) { if (!canMerge(fusion_group, to_merge)) { return c10::nullopt; } diff --git a/torch/csrc/jit/passes/update_differentiable_graph_requires_grad.cpp b/torch/csrc/jit/passes/update_differentiable_graph_requires_grad.cpp index b926939910c3a..15cefadd8cc76 100644 --- a/torch/csrc/jit/passes/update_differentiable_graph_requires_grad.cpp +++ b/torch/csrc/jit/passes/update_differentiable_graph_requires_grad.cpp @@ -8,7 +8,7 @@ namespace jit { static void UpdateDifferentiableGraphRequiresGrad( Block* block, - c10::optional new_requires_grad) { + std::optional new_requires_grad) { for (Node* n : block->nodes()) { for (Value* v : n->inputs()) { auto ty = v->type()->cast(); @@ -31,7 +31,7 @@ static void UpdateDifferentiableGraphRequiresGrad( void UpdateDifferentiableGraphRequiresGrad( std::shared_ptr& diff_forward_graph, - c10::optional new_requires_grad) { + std::optional new_requires_grad) { UpdateDifferentiableGraphRequiresGrad( diff_forward_graph->block(), new_requires_grad); } diff --git a/torch/csrc/jit/passes/update_differentiable_graph_requires_grad.h b/torch/csrc/jit/passes/update_differentiable_graph_requires_grad.h index eb51ba00c4c9f..0ba8696088934 100644 --- a/torch/csrc/jit/passes/update_differentiable_graph_requires_grad.h +++ b/torch/csrc/jit/passes/update_differentiable_graph_requires_grad.h @@ -14,7 +14,7 @@ namespace jit { // the types of prim::profiles TORCH_API void UpdateDifferentiableGraphRequiresGrad( std::shared_ptr& diff_forward_graph, - c10::optional new_requires_grad); + std::optional new_requires_grad); } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/passes/utils/check_alias_annotation.cpp b/torch/csrc/jit/passes/utils/check_alias_annotation.cpp index d538e33a21359..4c081200715a7 100644 --- a/torch/csrc/jit/passes/utils/check_alias_annotation.cpp +++ b/torch/csrc/jit/passes/utils/check_alias_annotation.cpp @@ -188,7 +188,7 @@ const Node* findNodeForOp( // Handle a few special cases where we need to propagate constants // manually // TODO(suo): we should be able to move this stuff to constant prop -c10::optional toIValueProp(const Value* v) { +std::optional toIValueProp(const Value* v) { if (v->node()->kind() == prim::ListConstruct) { std::vector genericList; for (auto input : v->node()->inputs()) { diff --git a/torch/csrc/jit/passes/utils/memory_dag.h b/torch/csrc/jit/passes/utils/memory_dag.h index f3068588dae85..da5584f9d4bd3 100644 --- 
a/torch/csrc/jit/passes/utils/memory_dag.h +++ b/torch/csrc/jit/passes/utils/memory_dag.h @@ -62,9 +62,9 @@ struct Element { // We memoize the results of `getMemoryLocations` to speed up queries. // A nullopt means that this cache is not yet populated. Since `MemoryDAG` is // immutable, this cache should never need to be invalidated. - mutable c10::optional cachedMemoryLocations_; + mutable std::optional cachedMemoryLocations_; - mutable c10::optional cachedAllContainedMemoryLocations_; + mutable std::optional cachedAllContainedMemoryLocations_; }; // class MemoryDAG diff --git a/torch/csrc/jit/passes/utils/subgraph_utils.cpp b/torch/csrc/jit/passes/utils/subgraph_utils.cpp index 36515e9e849e3..1bb82432e218f 100644 --- a/torch/csrc/jit/passes/utils/subgraph_utils.cpp +++ b/torch/csrc/jit/passes/utils/subgraph_utils.cpp @@ -18,9 +18,9 @@ bool hasSubgraph(Node* n) { return n->hasAttribute(attr::Subgraph); } -std::vector> gatherLastUses( +std::vector> gatherLastUses( at::ArrayRef values) { - return fmap(values, [&](Value* v) -> c10::optional { + return fmap(values, [&](Value* v) -> std::optional { return firstOrLastUse(v, /*find_first*/ false); }); } @@ -38,7 +38,7 @@ struct ValueMapper { ValueMapper( Node* to_merge, AliasDb& db, - c10::optional existing_subgraph) { + std::optional existing_subgraph) { last_uses_ = gatherLastUses(to_merge->outputs()); if (existing_subgraph) { existing_last_uses_ = gatherLastUses((*existing_subgraph)->outputs()); @@ -91,14 +91,14 @@ struct ValueMapper { placeholder_node_->destroy(); } - std::vector> last_uses_; - std::vector> existing_last_uses_; + std::vector> last_uses_; + std::vector> existing_last_uses_; Node* placeholder_node_; }; Node* executeSubgraphMergeAndUpdateAliasing( Node* to_merge, - c10::optional existing, + std::optional existing, AliasDb& db, const std::function& merge_fn) { // When we merge a node into a subgraph, the new subgraph outputs diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp index a5e3c6059bc84..290a10d06af5a 100644 --- a/torch/csrc/jit/python/init.cpp +++ b/torch/csrc/jit/python/init.cpp @@ -151,7 +151,7 @@ static bool opAllowsNumbersAsTensors(c10::Symbol symbol) { torch::should_allow_numbers_as_tensors(symbol.toUnqualString())); } -c10::optional toTypeInferredIValueOptional(py::handle input) { +std::optional toTypeInferredIValueOptional(py::handle input) { // Errors need to be caught here because toTypeInferredIValue errors out // on various object types, but we want it to work with all types. 
try { @@ -217,7 +217,7 @@ void initJITBindings(PyObject* module) { []() { return c10::ShapeSymbol::newSymbol().value(); }) .def( "_jit_shape_compute_graph_for_node", - [](Node* n) -> c10::optional> { + [](Node* n) -> std::optional> { if (!n->maybeSchema()) { return c10::nullopt; } @@ -225,7 +225,7 @@ void initJITBindings(PyObject* module) { }) .def( "_jit_decomposition_graph_for_node", - [](Node* n) -> c10::optional> { + [](Node* n) -> std::optional> { if (!n->maybeSchema()) { return c10::nullopt; } @@ -320,7 +320,7 @@ void initJITBindings(PyObject* module) { int quant_type_int) { auto dict = py::cast>>>(qconfig_dict); + std::optional>>>(qconfig_dict); auto quant_type = static_cast(quant_type_int); return InsertObservers( module, method_name, dict, inplace, quant_type); @@ -339,7 +339,7 @@ void initJITBindings(PyObject* module) { int quant_type_int) { auto dict = py::cast>>>(qconfig_dict); + std::optional>>>(qconfig_dict); auto quant_type = static_cast(quant_type_int); return InsertObserversForOnDevicePTQ( module, method_name, dict, inplace, quant_type); @@ -1389,14 +1389,36 @@ void initJITBindings(PyObject* module) { return size; } py::gil_scoped_acquire acquire; - auto memory_view = py::memoryview::from_memory( - reinterpret_cast(data), size); - buffer.attr("write")(std::move(memory_view)); + if (!data) { + // See [Note: write_record_metadata] + buffer.attr("seek")( + size, py::module::import("os").attr("SEEK_CUR")); + } else { + auto memory_view = py::memoryview::from_memory( + reinterpret_cast(data), size); + buffer.attr("write")(std::move(memory_view)); + } return size; }; return std::make_unique(std::move(writer_func)); })) .def(py::init&>()) + // [Note: write_record_metadata] + // The write_record_metadata function is intended to write metadata (i.e. + // the zipfile header and end of central directory record) for a file + // while reserving nbytes of space for the file for the bytes of the + // actual file to be added in later. This functionality is achieved by + // defining `m_pWrite` to seek instead of write if the buffer passed is a + // nullptr. This has implications on CRC-32 which will not be written at + // write_record_metadata time, and will not be combined with the hash in + // combined_uncomp_crc32_. We define this in `m_pWrite` rather than + // extending the interface of miniz to have an `m_pSeek` since different + // versions of miniz are used in fbcode/oss. + .def( + "write_record_metadata", + [](PyTorchStreamWriter& self, const std::string& name, size_t size) { + return self.writeRecord(name, nullptr, size); + }) .def( "write_record", [](PyTorchStreamWriter& self, @@ -1652,7 +1674,7 @@ void initJITBindings(PyObject* module) { auto func_dk = py::cpp_function( [op, symbol, allow_numbers_as_tensors]( c10::DispatchKey dk_, py::args args, py::kwargs kwargs) { - c10::optional dk = + std::optional dk = c10::make_optional(dk_); ToIValueAllowNumbersAsTensors g(allow_numbers_as_tensors); return _get_operation_for_overload_or_packet( @@ -1821,7 +1843,7 @@ void initJITBindings(PyObject* module) { [](SchemaInfo& self, const std::string& name, const py::object& value) { - c10::optional i_value = toTypeInferredIValueOptional(value); + std::optional i_value = toTypeInferredIValueOptional(value); if (i_value) { // For normalization purposes there is an inconsistency within // torch.fx that turns all arguments named "self" into "input". 
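[Editorial sketch] The new write_record_metadata binding and the [Note: write_record_metadata] comment above describe writing a record's zipfile metadata while only reserving space for its payload: the writer callback seeks forward instead of writing when it is handed a null data pointer. The snippet below is a self-contained illustration of that control flow only; it is not PyTorch's serializer or miniz, and write_or_reserve / archive.bin are invented names.

// Writer callback sketch: data == nullptr means "reserve size bytes" by
// advancing the stream position; otherwise perform a normal write.
#include <cstddef>
#include <fstream>

std::size_t write_or_reserve(std::ofstream& out, const void* data, std::size_t size) {
  if (data == nullptr) {
    // Reserve room for record bytes that will be filled in later.
    out.seekp(static_cast<std::streamoff>(size), std::ios::cur);
  } else {
    out.write(static_cast<const char*>(data), static_cast<std::streamsize>(size));
  }
  return size;
}

int main() {
  std::ofstream out("archive.bin", std::ios::binary);
  const char header[4] = {'P', 'K', 0x03, 0x04};    // stand-in for record metadata
  write_or_reserve(out, header, sizeof(header));    // metadata written immediately
  write_or_reserve(out, nullptr, 16);               // payload space reserved, not written
  const char trailer[3] = {'e', 'n', 'd'};
  write_or_reserve(out, trailer, sizeof(trailer));  // later bytes land after the gap
}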
@@ -1841,7 +1863,7 @@ void initJITBindings(PyObject* module) { TORCH_INTERNAL_ASSERT( key.isString(), "Add argument value keys types should be strings."); - c10::optional value = + std::optional value = toTypeInferredIValueOptional(key_pair.second); if (value) { // For normalization purposes there is an inconsistency within @@ -2077,8 +2099,8 @@ void initJITBindings(PyObject* module) { py::call_guard()); m.def("_is_alias_of", [](const py::object& self, const py::object& other) { - c10::optional self_value = toTypeInferredIValueOptional(self); - c10::optional other_value = toTypeInferredIValueOptional(other); + std::optional self_value = toTypeInferredIValueOptional(self); + std::optional other_value = toTypeInferredIValueOptional(other); // Only return true if we are certain that self and other are aliasing. if (!self_value || !other_value) { @@ -2087,8 +2109,8 @@ void initJITBindings(PyObject* module) { return self_value->isAliasOf(*other_value); }); m.def("_overlaps", [](const py::object& self, const py::object& other) { - c10::optional self_value = toTypeInferredIValueOptional(self); - c10::optional other_value = toTypeInferredIValueOptional(other); + std::optional self_value = toTypeInferredIValueOptional(self); + std::optional other_value = toTypeInferredIValueOptional(other); // Only return true if we are certain that self and other are overlapping. if (!self_value || !other_value) { diff --git a/torch/csrc/jit/python/module_python.h b/torch/csrc/jit/python/module_python.h index 3ab34f5cd8e77..5c7fbbb42d6cf 100644 --- a/torch/csrc/jit/python/module_python.h +++ b/torch/csrc/jit/python/module_python.h @@ -8,7 +8,7 @@ namespace py = pybind11; namespace torch::jit { -inline c10::optional as_module(py::handle obj) { +inline std::optional as_module(py::handle obj) { static py::handle ScriptModule = py::module::import("torch.jit").attr("ScriptModule"); if (py::isinstance(obj, ScriptModule)) { @@ -17,7 +17,7 @@ inline c10::optional as_module(py::handle obj) { return c10::nullopt; } -inline c10::optional as_object(py::handle obj) { +inline std::optional as_object(py::handle obj) { static py::handle ScriptObject = py::module::import("torch").attr("ScriptObject"); if (py::isinstance(obj, ScriptObject)) { diff --git a/torch/csrc/jit/python/pybind_utils.cpp b/torch/csrc/jit/python/pybind_utils.cpp index 23107d91d99ac..4cfe3309a766b 100644 --- a/torch/csrc/jit/python/pybind_utils.cpp +++ b/torch/csrc/jit/python/pybind_utils.cpp @@ -55,7 +55,7 @@ IValue listToIValue(py::handle obj) { return c10::impl::toList(rs); } -IValue toIValue(py::handle obj, const TypePtr& type, c10::optional N) { +IValue toIValue(py::handle obj, const TypePtr& type, std::optional N) { switch (type->kind()) { case TypeKind::TensorType: { if (obj.ptr() == Py_None) { @@ -802,7 +802,7 @@ py::object invokeOperatorFromPython( const std::vector>& operations, py::args args, const py::kwargs& kwargs, - c10::optional dk) { + std::optional dk) { auto [found_op, stack] = getOpWithStack(operations, args, kwargs); { pybind11::gil_scoped_release no_gil_guard; @@ -881,7 +881,7 @@ py::object _get_operation_for_overload_or_packet( py::args args, const py::kwargs& kwargs, bool is_overload, - c10::optional dk) { + std::optional dk) { std::string ns = symbol.ns().toUnqualString(); std::string method_name = symbol.toUnqualString(); std::string overload_name = operations[0]->schema().overload_name(); diff --git a/torch/csrc/jit/python/pybind_utils.h b/torch/csrc/jit/python/pybind_utils.h index a78c3e0c0be34..242da11af7c04 100644 --- 
a/torch/csrc/jit/python/pybind_utils.h +++ b/torch/csrc/jit/python/pybind_utils.h @@ -62,7 +62,7 @@ void clear_registered_instances(void* ptr); TORCH_PYTHON_API IValue toIValue( py::handle obj, const TypePtr& type, - c10::optional N = c10::nullopt); + std::optional N = c10::nullopt); TORCH_PYTHON_API py::object toPyObject(IValue ivalue); @@ -111,7 +111,7 @@ struct VISIBILITY_HIDDEN PythonFutureWrapper explicit PythonFutureWrapper( c10::intrusive_ptr fut, - c10::optional unwrap_func = c10::nullopt) + std::optional unwrap_func = c10::nullopt) : fut(std::move(fut)), unwrap_func(std::move(unwrap_func)) {} explicit PythonFutureWrapper(const PythonFutureWrapper&) = delete; @@ -232,7 +232,7 @@ struct VISIBILITY_HIDDEN PythonFutureWrapper c10::intrusive_ptr fut; // unwrap_func works like a callback for the value returned by // PythonFutureWrapper::wait(). - c10::optional unwrap_func; + std::optional unwrap_func; private: std::shared_ptr getPtr() { @@ -348,7 +348,7 @@ inline TypedIValue toDictKeyIValue(py::handle key) { } } -inline c10::optional unifyOrInitializeType( +inline std::optional unifyOrInitializeType( const TypePtr& accum, const TypePtr& unify) { if (!accum) { @@ -987,7 +987,7 @@ inline Stack createStackForSchema( const FunctionSchema& schema, const tuple_slice& args, const py::kwargs& kwargs, - c10::optional self) { + std::optional self) { size_t all_arguments = (self ? 1 : 0) + args.size() + kwargs.size(); if (all_arguments > schema.arguments().size()) { throw schema_match_error(c10::str( @@ -1102,7 +1102,7 @@ inline py::object runAndInsertCall( Function& callee, const tuple_slice& args, const py::kwargs& kwargs, - c10::optional self, + std::optional self, // Lambda that tells this function how to insert `callee` into the graph if // we're tracing. 
const std::function& @@ -1158,7 +1158,7 @@ inline py::object runAndInsertCall( return toPyObject(std::move(stack.back())); } -inline c10::optional maybeTorchFunctionDispatch( +inline std::optional maybeTorchFunctionDispatch( const py::object& callee, const tuple_slice& args_no_self, const py::kwargs& kwargs, @@ -1255,7 +1255,7 @@ TORCH_PYTHON_API py::object invokeOperatorFromPython( const std::vector>& operations, py::args args, const py::kwargs& kwargs, - c10::optional dk = c10::nullopt); + std::optional dk = c10::nullopt); TORCH_PYTHON_API py::tuple _maybe_handle_torch_function( const std::string& ns, @@ -1276,6 +1276,6 @@ TORCH_PYTHON_API py::object _get_operation_for_overload_or_packet( py::args args, const py::kwargs& kwargs, bool is_overload, - c10::optional dk = c10::nullopt); + std::optional dk = c10::nullopt); } // namespace torch::jit diff --git a/torch/csrc/jit/python/python_ir.cpp b/torch/csrc/jit/python/python_ir.cpp index 7c6c5089b6d38..2442ef0573545 100644 --- a/torch/csrc/jit/python/python_ir.cpp +++ b/torch/csrc/jit/python/python_ir.cpp @@ -131,7 +131,7 @@ void ConcretePythonOp::cloneFrom(Node* other_) { // recover the autograd.Function instance, if this PythonOp's function // was originally SomeFunction.apply // used in ONNX for discovering symbolics -c10::optional ConcretePythonOp::autogradFunction() const { +std::optional ConcretePythonOp::autogradFunction() const { pybind11::gil_scoped_acquire gil; // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) py::handle obj = const_cast(pyobj.get()); @@ -865,7 +865,7 @@ void initPythonIRBindings(PyObject* module_) { }) .def( "with_sizes", - [](Type& t, c10::optional>> sizes) + [](Type& t, std::optional>> sizes) -> py::object { auto ptt = t.expect(); if (!ptt) { diff --git a/torch/csrc/jit/python/python_ir.h b/torch/csrc/jit/python/python_ir.h index 296fc3f0b1f2e..26adf8c0e4941 100644 --- a/torch/csrc/jit/python/python_ir.h +++ b/torch/csrc/jit/python/python_ir.h @@ -42,7 +42,7 @@ struct ConcretePythonOp : public PythonOp { // recover the autograd.Function instance, if this PythonOp's function // was originally SomeFunction.apply // used in ONNX for discovering symbolics - c10::optional autogradFunction() const override; + std::optional autogradFunction() const override; void writeScalars(std::ostream& out) const override; void lint_python() const override; }; diff --git a/torch/csrc/jit/python/python_ivalue.h b/torch/csrc/jit/python/python_ivalue.h index f33ceca30f2d0..4cdc8e430b9a8 100644 --- a/torch/csrc/jit/python/python_ivalue.h +++ b/torch/csrc/jit/python/python_ivalue.h @@ -31,7 +31,7 @@ struct C10_EXPORT ConcretePyObjectHolder final : PyObjectHolder { return torch::jit::tryToInferType(py_obj_); } - IValue toIValue(const TypePtr& type, c10::optional N = c10::nullopt) + IValue toIValue(const TypePtr& type, std::optional N = c10::nullopt) override { pybind11::gil_scoped_acquire ag; return torch::jit::toIValue(py_obj_, type, N); diff --git a/torch/csrc/jit/python/python_list.h b/torch/csrc/jit/python/python_list.h index d70e653043c93..b5bb88b3aeb20 100644 --- a/torch/csrc/jit/python/python_list.h +++ b/torch/csrc/jit/python/python_list.h @@ -175,7 +175,7 @@ class ScriptList final { // Remove and return the element at the specified index from the list. If no // index is passed, the last element is removed and returned. 
- IValue pop(c10::optional idx = c10::nullopt) { + IValue pop(std::optional idx = c10::nullopt) { IValue ret; if (idx) { diff --git a/torch/csrc/jit/python/python_sugared_value.cpp b/torch/csrc/jit/python/python_sugared_value.cpp index 4b854c884d026..d6f014759c05e 100644 --- a/torch/csrc/jit/python/python_sugared_value.cpp +++ b/torch/csrc/jit/python/python_sugared_value.cpp @@ -24,7 +24,7 @@ std::string typeString(py::handle h) { return py::str(h.get_type().attr("__name__")); } -c10::optional as_function(const py::object& obj) { +std::optional as_function(const py::object& obj) { if (py::isinstance(obj)) { return py::cast(obj); } @@ -169,7 +169,7 @@ std::string PythonValue::kind() const { std::vector> PythonValue::asTuple( const SourceRange& loc, GraphFunction& m, - const c10::optional& size_hint) { + const std::optional& size_hint) { const std::string type_str = typeString(self); std::stringstream ss; ss << kind() << " cannot be used as a tuple"; @@ -927,7 +927,7 @@ std::shared_ptr BooleanDispatchValue::call( at::ArrayRef args, at::ArrayRef kwargs, size_t n_binders) { - c10::optional result; + std::optional result; Graph& graph = *(caller.graph()); auto index = py::cast(dispatched_fn_["index"]); diff --git a/torch/csrc/jit/python/python_sugared_value.h b/torch/csrc/jit/python/python_sugared_value.h index 35298e30b08a6..cb397796c9f55 100644 --- a/torch/csrc/jit/python/python_sugared_value.h +++ b/torch/csrc/jit/python/python_sugared_value.h @@ -27,12 +27,12 @@ std::shared_ptr toSugaredValue( const SourceRange& loc, bool is_constant = false); -c10::optional as_function(const py::object& obj); +std::optional as_function(const py::object& obj); struct VISIBILITY_HIDDEN PythonValue : public SugaredValue { PythonValue( py::object the_self, - c10::optional rcb = c10::nullopt, + std::optional rcb = c10::nullopt, Value* module_self = nullptr) : self(std::move(the_self)), rcb(std::move(rcb)), @@ -56,7 +56,7 @@ struct VISIBILITY_HIDDEN PythonValue : public SugaredValue { std::vector> asTuple( const SourceRange& loc, GraphFunction& m, - const c10::optional& size_hint = {}) override; + const std::optional& size_hint = {}) override; std::shared_ptr attr( const SourceRange& loc, @@ -79,7 +79,7 @@ struct VISIBILITY_HIDDEN PythonValue : public SugaredValue { // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes) py::object self; // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes) - c10::optional rcb; + std::optional rcb; // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes) Value* moduleSelf_ = nullptr; }; diff --git a/torch/csrc/jit/python/python_tracer.cpp b/torch/csrc/jit/python/python_tracer.cpp index bdc62d33568de..92e6e2d3ace23 100644 --- a/torch/csrc/jit/python/python_tracer.cpp +++ b/torch/csrc/jit/python/python_tracer.cpp @@ -45,7 +45,7 @@ std::vector _pythonCallstack() { SourceRange getPythonInterpreterSourceRange() { auto cs = pythonCallstack(); - c10::optional source_filename; + std::optional source_filename; size_t source_line = 0; std::stringstream stack_trace; for (const auto& entry : cs) { diff --git a/torch/csrc/jit/python/python_tree_views.cpp b/torch/csrc/jit/python/python_tree_views.cpp index a171314099c3e..50d18b908107e 100644 --- a/torch/csrc/jit/python/python_tree_views.cpp +++ b/torch/csrc/jit/python/python_tree_views.cpp @@ -12,7 +12,7 @@ namespace py = pybind11; namespace torch::jit { -c10::optional maybeConvertToString(const py::object& obj) { +std::optional maybeConvertToString(const py::object& obj) { if 
(obj.is_none()) { return c10::nullopt; } @@ -177,10 +177,10 @@ void initTreeViewBindings(PyObject* module) { [](const Property& property) { return property.getter().name(); }) .def("setter_name", [](const Property& property) { if (property.setter().present()) { - return c10::optional(property.setter().get().name()); + return std::optional(property.setter().get().name()); } - return c10::optional(c10::nullopt); + return std::optional(c10::nullopt); }); py::class_(m, "ClassDef") diff --git a/torch/csrc/jit/python/script_init.cpp b/torch/csrc/jit/python/script_init.cpp index 22809069f8809..971b6c76ca47e 100644 --- a/torch/csrc/jit/python/script_init.cpp +++ b/torch/csrc/jit/python/script_init.cpp @@ -207,7 +207,7 @@ void checkOverloadDecl(const Decl& new_decl, const Decl& old_decl) { } } -c10::optional tryCalculateDefaultParam( +std::optional tryCalculateDefaultParam( const Argument& arg, const py::object& def_value) { auto n = arg.N(); @@ -287,7 +287,7 @@ FunctionSchema getSchemaWithNameAndDefaults( auto it = default_args.find(arg.name()); if (it != default_args.end()) { checkMutableFunctionDefault(range, arg, it->second); - c10::optional value = tryCalculateDefaultParam(arg, it->second); + std::optional value = tryCalculateDefaultParam(arg, it->second); if (!value) { ErrorReport error(range); error << "Expected a default value of type " << arg.type()->repr_str() @@ -1369,10 +1369,10 @@ void initJitScriptBindings(PyObject* module) { [](std::shared_ptr self, const std::string& name) { auto fn = self->find_function(QualifiedName(name)); if (fn) { - return c10::optional( + return std::optional( StrongFunctionPtr(std::move(self), fn)); } else { - return c10::optional(c10::nullopt); + return std::optional(c10::nullopt); } }) .def( @@ -1852,7 +1852,7 @@ void initJitScriptBindings(PyObject* module) { py::object map_location, const py::dict& extra_files, bool restore_shapes = false) { - c10::optional optional_device; + std::optional optional_device; if (!map_location.is_none()) { AT_ASSERT(THPDevice_Check(map_location.ptr())); optional_device = @@ -1877,7 +1877,7 @@ void initJitScriptBindings(PyObject* module) { storage_context, py::object map_location, std::string ts_id) { - c10::optional optional_device; + std::optional optional_device; if (!map_location.is_none()) { AT_ASSERT(THPDevice_Check(map_location.ptr())); optional_device = @@ -1898,7 +1898,7 @@ void initJitScriptBindings(PyObject* module) { const py::dict& extra_files, bool restore_shapes = false) { std::istringstream in(buffer); - c10::optional optional_device; + std::optional optional_device; if (!map_location.is_none()) { AT_ASSERT(THPDevice_Check(map_location.ptr())); optional_device = @@ -1918,7 +1918,7 @@ void initJitScriptBindings(PyObject* module) { m.def( "_load_for_lite_interpreter", [](const std::string& filename, py::object map_location) { - c10::optional optional_device; + std::optional optional_device; if (!map_location.is_none()) { AT_ASSERT(THPDevice_Check(map_location.ptr())); optional_device = @@ -1930,7 +1930,7 @@ void initJitScriptBindings(PyObject* module) { "_load_for_lite_interpreter_from_buffer", [](const std::string& buffer, py::object map_location) { std::istringstream in(buffer); - c10::optional optional_device; + std::optional optional_device; if (!map_location.is_none()) { AT_ASSERT(THPDevice_Check(map_location.ptr())); optional_device = @@ -1975,7 +1975,7 @@ void initJitScriptBindings(PyObject* module) { m.def( "_get_model_extra_files", [](const std::string& filename, const py::dict& py_extra_files) { - 
c10::optional optional_device; + std::optional optional_device; ExtraFilesMap cpp_extra_files = ExtraFilesMap(); _load_for_mobile(filename, optional_device, cpp_extra_files); extra_files_to_python(cpp_extra_files, py_extra_files); @@ -1990,7 +1990,7 @@ void initJitScriptBindings(PyObject* module) { m.def( "_get_model_extra_files_from_buffer", [](const std::string& buffer, const py::dict& py_extra_files) { - c10::optional optional_device; + std::optional optional_device; ExtraFilesMap cpp_extra_files = ExtraFilesMap(); std::istringstream in(buffer); _load_for_mobile(in, optional_device, cpp_extra_files); @@ -2124,7 +2124,7 @@ void initJitScriptBindings(PyObject* module) { m.def( "_get_graph_executor_optimize", - [](c10::optional new_setting = c10::nullopt) { + [](std::optional new_setting = c10::nullopt) { bool old_value = getGraphExecutorOptimize(); if (new_setting) { setGraphExecutorOptimize(*new_setting); diff --git a/torch/csrc/jit/runtime/argument_spec.h b/torch/csrc/jit/runtime/argument_spec.h index 06c77edca718c..7a815e815d8e9 100644 --- a/torch/csrc/jit/runtime/argument_spec.h +++ b/torch/csrc/jit/runtime/argument_spec.h @@ -47,7 +47,7 @@ struct ArgumentInfo { return TensorType::get(); return TensorType::create( - type(), device(), c10::optional(dim()), requires_grad()); + type(), device(), std::optional(dim()), requires_grad()); } operator TypePtr() const { return toType(); @@ -460,10 +460,10 @@ inline CompleteArgumentInfo CompleteArgumentSpec::at(size_t i) const { return CompleteArgumentInfo(*this, i); } -inline c10::optional convertOptional( - c10::optional const& from) { - return (from) ? c10::optional(static_cast(*from)) - : c10::optional{}; +inline std::optional convertOptional( + std::optional const& from) { + return (from) ? std::optional(static_cast(*from)) + : std::optional{}; } } // namespace torch::jit @@ -475,7 +475,7 @@ struct hash> { size_t operator()(const c10::VaryingShape& vs) const { return c10::get_hash( vs.size(), - vs.size() ? vs.sizes().value() : std::vector>()); + vs.size() ? vs.sizes().value() : std::vector>()); } }; @@ -483,10 +483,10 @@ template <> struct hash { size_t operator()(const c10::TensorType& ptt) const { return c10::get_hash< - c10::optional, + std::optional, c10::VaryingShape, c10::VaryingShape, - c10::optional>( + std::optional>( torch::jit::convertOptional(ptt.scalarType()), ptt.sizes(), ptt.strides(), diff --git a/torch/csrc/jit/runtime/autodiff.cpp b/torch/csrc/jit/runtime/autodiff.cpp index 0d33abb217ee9..3987521f658f9 100644 --- a/torch/csrc/jit/runtime/autodiff.cpp +++ b/torch/csrc/jit/runtime/autodiff.cpp @@ -128,7 +128,7 @@ bool isDifferentiable(Graph& g) { // will be cleaned up later using EliminateDeadCode(block). TupleUnPack node in // backward graph will be removed in eliminateDeadcode(ReverseDetails) defined // in this file. 
-static c10::optional> build_script_grad( +static std::optional> build_script_grad( Node* node, const ArrayRef& grads) { auto graph = node->owningGraph(); @@ -352,7 +352,7 @@ bool outputRequiresGrad(Value* output) { if (output->type()->castRaw() == nullptr) { return output->requires_grad(); } - c10::optional requiresGrad = + std::optional requiresGrad = output->type()->expectRef().requiresGrad(); if (requiresGrad.has_value()) { return *requiresGrad; diff --git a/torch/csrc/jit/runtime/custom_operator.h b/torch/csrc/jit/runtime/custom_operator.h index 64d514374f58e..faa8c90754a0e 100644 --- a/torch/csrc/jit/runtime/custom_operator.h +++ b/torch/csrc/jit/runtime/custom_operator.h @@ -18,8 +18,8 @@ struct TORCH_API RegisterOperators { /// Registers a vector of already created `Operator`s. /// The operator element is now optional to filter null ops. It's backward /// compatible and works for selective operator registration. - explicit RegisterOperators(std::vector> operators) { - for (c10::optional& o : operators) { + explicit RegisterOperators(std::vector> operators) { + for (std::optional& o : operators) { if (o) { registerOperator(std::move(o.value())); } diff --git a/torch/csrc/jit/runtime/decomposition_registry.cpp b/torch/csrc/jit/runtime/decomposition_registry.cpp index 0c5f5f0876c1b..900ee32746906 100644 --- a/torch/csrc/jit/runtime/decomposition_registry.cpp +++ b/torch/csrc/jit/runtime/decomposition_registry.cpp @@ -107,7 +107,7 @@ void RunDecompositions(std::shared_ptr g) { } } -c10::optional> GetDecomposition( +std::optional> GetDecomposition( const FunctionSchema& schema) { loadDecompositionFunctions(); GRAPH_DEBUG("Trying to find schema: ", schema); @@ -120,7 +120,7 @@ c10::optional> GetDecomposition( return c10::nullopt; } -c10::optional GetDecompositionFunction( +std::optional GetDecompositionFunction( const FunctionSchema& schema) { loadDecompositionFunctions(); auto cache_it = schema_to_function.find(&schema); diff --git a/torch/csrc/jit/runtime/decomposition_registry.h b/torch/csrc/jit/runtime/decomposition_registry.h index 8633609bcf2a8..59f5aa796f76c 100644 --- a/torch/csrc/jit/runtime/decomposition_registry.h +++ b/torch/csrc/jit/runtime/decomposition_registry.h @@ -7,7 +7,7 @@ namespace torch::jit { -TORCH_API c10::optional> GetDecomposition( +TORCH_API std::optional> GetDecomposition( const FunctionSchema& schema); TORCH_API void RegisterDecomposition( @@ -16,7 +16,7 @@ TORCH_API void RegisterDecomposition( TORCH_API void RunDecompositions(std::shared_ptr g); -TORCH_API c10::optional GetDecompositionFunction( +TORCH_API std::optional GetDecompositionFunction( const FunctionSchema& schema); // For invocation in C++, recommended is to assign to static local variable diff --git a/torch/csrc/jit/runtime/graph_executor.cpp b/torch/csrc/jit/runtime/graph_executor.cpp index b1888f6344f18..d46e9028bf0af 100644 --- a/torch/csrc/jit/runtime/graph_executor.cpp +++ b/torch/csrc/jit/runtime/graph_executor.cpp @@ -636,7 +636,7 @@ struct GraphExecutorImpl : public GraphExecutorImplBase { const ExecutionPlan& getPlanFor( Stack& stack, - c10::optional remaining_bailout_depth) override { + std::optional remaining_bailout_depth) override { return getGraphExecutorOptimize() ? 
getOrCompile(stack) : getOrCompileFallback(); } @@ -838,7 +838,7 @@ c10::intrusive_ptr GraphExecutor::runAsync( const ExecutionPlan& GraphExecutor::getPlanFor( Stack& inputs, - c10::optional remaining_bailout_depth) { + std::optional remaining_bailout_depth) { return pImpl->getPlanFor(inputs, remaining_bailout_depth); } diff --git a/torch/csrc/jit/runtime/graph_executor.h b/torch/csrc/jit/runtime/graph_executor.h index d82d69ad5dce5..fce8d4a02e66c 100644 --- a/torch/csrc/jit/runtime/graph_executor.h +++ b/torch/csrc/jit/runtime/graph_executor.h @@ -87,7 +87,7 @@ struct TORCH_API GraphExecutor { // current global fusion strategy settings. const ExecutionPlan& getPlanFor( Stack& inputs, - c10::optional remaining_bailout_depth = c10::nullopt); + std::optional remaining_bailout_depth = c10::nullopt); GraphExecutorState getDebugState(); void debugFlushCompilationCache(); diff --git a/torch/csrc/jit/runtime/graph_executor_impl.h b/torch/csrc/jit/runtime/graph_executor_impl.h index 3aae2eb852796..22a563f00be28 100644 --- a/torch/csrc/jit/runtime/graph_executor_impl.h +++ b/torch/csrc/jit/runtime/graph_executor_impl.h @@ -78,7 +78,7 @@ struct GraphExecutorImplBase { virtual const ExecutionPlan& getPlanFor( Stack& stack, - c10::optional remaining_bailout_depth = c10::nullopt) = 0; + std::optional remaining_bailout_depth = c10::nullopt) = 0; virtual GraphExecutorState getDebugState() = 0; virtual ~GraphExecutorImplBase() = default; diff --git a/torch/csrc/jit/runtime/interpreter.cpp b/torch/csrc/jit/runtime/interpreter.cpp index e5f0f69a45498..18231173dd70e 100644 --- a/torch/csrc/jit/runtime/interpreter.cpp +++ b/torch/csrc/jit/runtime/interpreter.cpp @@ -181,7 +181,7 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target { void callFunction( Function& f, Stack& stack, - c10::optional bailOut = c10::nullopt, + std::optional bailOut = c10::nullopt, bool next = true) { bool newFrame = f.call(stack, bailOut, [&](const Code& code) { enterFrame(code, stack.size() - code.num_inputs()); @@ -882,7 +882,7 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target { // Janky af. 
See https://github.com/pytorch/pytorch/issues/54612 auto* not_implemented_error = dynamic_cast(&e); - c10::optional python_class_name; + std::optional python_class_name; if (jit_exception) { python_class_name = jit_exception->getPythonClassName(); } @@ -913,7 +913,7 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target { const std::exception& e, bool is_jit_exception, c10::NotImplementedError* not_implemented_error, - c10::optional python_class_name) { + std::optional python_class_name) { ExceptionMessage msg(e); std::ostringstream ss; std::string class_name = diff --git a/torch/csrc/jit/runtime/interpreter.h b/torch/csrc/jit/runtime/interpreter.h index e47a581fd5def..a28b1eb93526b 100644 --- a/torch/csrc/jit/runtime/interpreter.h +++ b/torch/csrc/jit/runtime/interpreter.h @@ -124,7 +124,7 @@ struct InterpreterContinuation { InterpreterState state_, Stack stack_, int64_t dist_autograd_context_id = 0, - c10::optional tls_state = c10::nullopt) + std::optional tls_state = c10::nullopt) : state(std::move(state_)), stack(std::move(stack_)), tls_state_(std::move(tls_state)) @@ -140,7 +140,7 @@ struct InterpreterContinuation { private: InterpreterState state; Stack stack; - c10::optional tls_state_ = c10::nullopt; + std::optional tls_state_ = c10::nullopt; #ifdef USE_DISTRIBUTED int64_t dist_autograd_context_id_; #endif diff --git a/torch/csrc/jit/runtime/interpreter/code_impl.h b/torch/csrc/jit/runtime/interpreter/code_impl.h index 98701aa23b365..60948da5a86d6 100644 --- a/torch/csrc/jit/runtime/interpreter/code_impl.h +++ b/torch/csrc/jit/runtime/interpreter/code_impl.h @@ -111,8 +111,8 @@ struct CodeImpl { // It is also very useful for debugging interpreter problems to // keep this around. std::shared_ptr graph_; - c10::optional> grad_executors_; - c10::optional> forward_executors_; + std::optional> grad_executors_; + std::optional> forward_executors_; PreprocessGraph preprocess_; // map from unique of nodes to register in register table diff --git a/torch/csrc/jit/runtime/interpreter/frame.h b/torch/csrc/jit/runtime/interpreter/frame.h index e3de0a02ff7fa..c6873605d0deb 100644 --- a/torch/csrc/jit/runtime/interpreter/frame.h +++ b/torch/csrc/jit/runtime/interpreter/frame.h @@ -26,7 +26,7 @@ struct Frame { size_t base_pointer; // unique to every frame with prim::profile across all threads - c10::optional id; + std::optional id; // RecordFunction object associated with this frame std::unique_ptr record_function; diff --git a/torch/csrc/jit/runtime/jit_exception.cpp b/torch/csrc/jit/runtime/jit_exception.cpp index 809b1b2f5e599..2586f904c9871 100644 --- a/torch/csrc/jit/runtime/jit_exception.cpp +++ b/torch/csrc/jit/runtime/jit_exception.cpp @@ -7,8 +7,8 @@ static thread_local std::string caughtPythonClassName = ""; JITException::JITException( const std::string& msg, - c10::optional python_class_name, - c10::optional original_msg) + std::optional python_class_name, + std::optional original_msg) : std::runtime_error(msg), python_class_name_(std::move(python_class_name)), original_msg_(std::move(original_msg)) {} diff --git a/torch/csrc/jit/runtime/jit_exception.h b/torch/csrc/jit/runtime/jit_exception.h index 728675ed78418..34c3ebd6fca84 100644 --- a/torch/csrc/jit/runtime/jit_exception.h +++ b/torch/csrc/jit/runtime/jit_exception.h @@ -11,17 +11,17 @@ namespace torch::jit { struct TORCH_API JITException : public std::runtime_error { explicit JITException( const std::string& msg, - c10::optional python_class_name = c10::nullopt, - c10::optional original_msg = c10::nullopt); + std::optional 
python_class_name = c10::nullopt, + std::optional original_msg = c10::nullopt); - c10::optional getPythonClassName() const { + std::optional getPythonClassName() const { return python_class_name_; } // the original msg if this is from a python exception. The interpretor has // changed the original message by adding "The following operation failed in // the TorchScript interpreter." in front of it in the handleError function. - c10::optional getOriginalMsg() const { + std::optional getOriginalMsg() const { return original_msg_; } @@ -31,8 +31,8 @@ struct TORCH_API JITException : public std::runtime_error { static void setCaughtPythonClassName(const std::string& pythonClassName); private: - c10::optional python_class_name_; - c10::optional original_msg_; + std::optional python_class_name_; + std::optional original_msg_; }; } // namespace torch::jit diff --git a/torch/csrc/jit/runtime/operator.h b/torch/csrc/jit/runtime/operator.h index bcab476441e29..dbc2638457c05 100644 --- a/torch/csrc/jit/runtime/operator.h +++ b/torch/csrc/jit/runtime/operator.h @@ -67,7 +67,7 @@ struct TORCH_API Operator { }; struct UnparsedFunctionSchema final { std::string schema_string_; - mutable c10::optional alias_analysis_; + mutable std::optional alias_analysis_; }; // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) struct JitOnlyOperator final { @@ -298,16 +298,16 @@ TORCH_API bool aliasAnalysisHasSpecialCaseFor(c10::Symbol sym); // compile-time function for the selective op registration based on schema // string. template -c10::optional OperatorGenerator( +std::optional OperatorGenerator( const char* schema_str, Func&& op, AliasAnalysisKind alias_analysis) { - return c10::optional(Operator( + return std::optional(Operator( std::string(schema_str), std::forward(op), alias_analysis)); } template -c10::optional OperatorGenerator( +std::optional OperatorGenerator( torch::detail::SelectiveStr schema_str, Func&& op, AliasAnalysisKind alias_analysis) { @@ -318,7 +318,7 @@ c10::optional OperatorGenerator( } template -c10::optional OperatorGenerator( +std::optional OperatorGenerator( torch::detail::SelectiveStr schema_str, Func&& op, AliasAnalysisKind alias_analysis) { @@ -326,14 +326,14 @@ c10::optional OperatorGenerator( } template -c10::optional OperatorGenerator( +std::optional OperatorGenerator( const std::string name, const std::string overload_name, const std::vector arguments, const std::vector returns, Func&& op, AliasAnalysisKind alias_analysis) { - return c10::optional(Operator( + return std::optional(Operator( name, overload_name, arguments, diff --git a/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp b/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp index 58d80c48f9c87..48c7a1959ab22 100644 --- a/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp +++ b/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp @@ -118,7 +118,7 @@ static FusionStrategy getInitialStrategy() { } // defer initial value so that we can load in gflags -static c10::optional fusion_strategy = c10::nullopt; +static std::optional fusion_strategy = c10::nullopt; FusionStrategy getFusionStrategy() { std::lock_guard guard(fusion_strategy_lock); @@ -613,7 +613,7 @@ size_t ProfilingGraphExecutorImpl::getInstantiatedBailoutDepth() { const ExecutionPlan& ProfilingGraphExecutorImpl::getOptimizedPlanFor( Stack& stack, - c10::optional remaining_bailout_depth) { + std::optional remaining_bailout_depth) { GRAPH_DEBUG("Running ProfilingGraphExecutorImpl ", this); // TODO: instantiate simple executor when 
getProfilingMode() is false @@ -700,7 +700,7 @@ const ExecutionPlan& ProfilingGraphExecutorImpl::getOptimizedPlanFor( const ExecutionPlan& ProfilingGraphExecutorImpl::getPlanFor( Stack& stack, - c10::optional remaining_bailout_depth) { + std::optional remaining_bailout_depth) { std::lock_guard lock(compile_mutex); // IMPORTANT: This is a hot path of calling a torchscript function. Try not to diff --git a/torch/csrc/jit/runtime/profiling_graph_executor_impl.h b/torch/csrc/jit/runtime/profiling_graph_executor_impl.h index 45da1f030e962..a49ef18e2fa42 100644 --- a/torch/csrc/jit/runtime/profiling_graph_executor_impl.h +++ b/torch/csrc/jit/runtime/profiling_graph_executor_impl.h @@ -18,7 +18,7 @@ struct TORCH_API ProfilingGraphExecutorImpl : public GraphExecutorImplBase { const ExecutionPlan& getPlanFor( Stack& stack, - c10::optional remaining_bailout_depth) override; + std::optional remaining_bailout_depth) override; GraphExecutorState getDebugState() override; ~ProfilingGraphExecutorImpl() override = default; @@ -31,7 +31,7 @@ struct TORCH_API ProfilingGraphExecutorImpl : public GraphExecutorImplBase { private: const ExecutionPlan& getOptimizedPlanFor( Stack& stack, - c10::optional remaining_bailout_depth); + std::optional remaining_bailout_depth); void runProfilingInsensitiveOptimizations(std::shared_ptr& graph); void runProfilingOptimizations( std::shared_ptr& graph, @@ -47,13 +47,13 @@ struct TORCH_API ProfilingGraphExecutorImpl : public GraphExecutorImplBase { void clearTheGraphCompilationIntermediateGraphs(); std::unique_ptr pr_; - c10::optional + std::optional profiling_plan_; // plan to run in order to profiling the code - c10::optional optimized_plan_; + std::optional optimized_plan_; FusionStrategy fusion_strategy_; // this plan is used if getGraphExecutorOptimize is unset - c10::optional fallback_plan_; + std::optional fallback_plan_; // fallback functions are inserted for tensorexpr fusion groups // and by specialize_autogradzero. Whenever, at runtime, input // tensor don't match profiled properties, fallback functions are called @@ -63,7 +63,7 @@ struct TORCH_API ProfilingGraphExecutorImpl : public GraphExecutorImplBase { // They only exist in the optimized graph which is a private property // of the GraphExecutor and only shared with InterpreterState std::vector> fallback_functions_; - c10::optional remaining_bailout_depth_; + std::optional remaining_bailout_depth_; // The time the optimized_plan_ is created. int32_t time_optimized_plan_created_ = 0; // Has the extra memory used by the graph for profiling is released? 
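[Editorial sketch] Among the profiling-executor hunks above, fusion_strategy keeps its "defer initial value so that we can load in gflags" comment: the setting lives in an optional that starts out empty and is filled on first use under a lock. Below is a minimal sketch of that lazy-initialization pattern with invented names; the real code consults the JIT's fusion-strategy flags rather than the stub shown here.

#include <iostream>
#include <mutex>
#include <optional>
#include <string>

namespace {
std::mutex strategy_lock;
std::optional<std::string> cached_strategy;  // nullopt = not decided yet

std::string loadStrategyFromFlags() {
  // Stand-in for consulting flags/environment that may only be parsed
  // after static initialization has finished.
  return "STATIC,2;DYNAMIC,10";
}
} // namespace

std::string getStrategy() {
  std::lock_guard<std::mutex> guard(strategy_lock);
  if (!cached_strategy.has_value()) {
    cached_strategy = loadStrategyFromFlags();  // materialize on first use
  }
  return *cached_strategy;
}

int main() {
  std::cout << getStrategy() << '\n';
}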
diff --git a/torch/csrc/jit/runtime/register_ops_utils.cpp b/torch/csrc/jit/runtime/register_ops_utils.cpp index b926c59e75dee..7335f132dfbf5 100644 --- a/torch/csrc/jit/runtime/register_ops_utils.cpp +++ b/torch/csrc/jit/runtime/register_ops_utils.cpp @@ -403,7 +403,7 @@ void listSetItem(Stack& stack) { at::Generator make_generator_for_device( c10::Device device, - c10::optional seed) { + std::optional seed) { if (device.is_cpu()) { if (seed.has_value()) { return at::detail::createCPUGenerator(seed.value()); diff --git a/torch/csrc/jit/runtime/register_ops_utils.h b/torch/csrc/jit/runtime/register_ops_utils.h index de70cea3a1d50..15e59acb9fe6e 100644 --- a/torch/csrc/jit/runtime/register_ops_utils.h +++ b/torch/csrc/jit/runtime/register_ops_utils.h @@ -879,6 +879,6 @@ struct OperatorGeneratorArgs { TORCH_API at::Generator make_generator_for_device( c10::Device device, - c10::optional seed = c10::nullopt); + std::optional seed = c10::nullopt); } // namespace torch::jit diff --git a/torch/csrc/jit/runtime/register_prim_ops.cpp b/torch/csrc/jit/runtime/register_prim_ops.cpp index ee1c0c9e29ef8..bb9c08465c0ae 100644 --- a/torch/csrc/jit/runtime/register_prim_ops.cpp +++ b/torch/csrc/jit/runtime/register_prim_ops.cpp @@ -34,8 +34,8 @@ namespace { std::string stringSlice( std::string string, - c10::optional start, - c10::optional end, + std::optional start, + std::optional end, int64_t step) { int64_t start_val = start.has_value() ? start.value() : INT64_MAX; int64_t end_val = end.has_value() ? end.value() : INT64_MAX; @@ -1167,7 +1167,7 @@ static const std::vector opGenArgs{ "aten::index.Tensor_hacked_twin(Tensor self, Tensor[] indices) -> Tensor"), [](Stack& stack) { auto indices = pop(stack).to>(); - c10::List> opt_list_indices; + c10::List> opt_list_indices; opt_list_indices.reserve(indices.size()); for (const auto& ten : indices) { opt_list_indices.push_back(ten); @@ -1182,7 +1182,7 @@ static const std::vector opGenArgs{ "aten::_unsafe_index.Tensor_hacked_twin(Tensor self, Tensor[] indices) -> Tensor"), [](Stack& stack) { auto indices = pop(stack).to>(); - c10::List> opt_list_indices; + c10::List> opt_list_indices; opt_list_indices.reserve(indices.size()); for (const auto& ten : indices) { opt_list_indices.push_back(ten); @@ -1200,7 +1200,7 @@ static const std::vector opGenArgs{ auto accumulate = pop(stack).toBool(); auto values = pop(stack).toTensor(); auto indices = pop(stack).to>(); - c10::List> opt_list_indices; + c10::List> opt_list_indices; opt_list_indices.reserve(indices.size()); for (const auto& ten : indices) { opt_list_indices.push_back(ten); @@ -1218,7 +1218,7 @@ static const std::vector opGenArgs{ auto accumulate = pop(stack).toBool(); auto values = pop(stack).toTensor(); auto indices = pop(stack).to>(); - c10::List> opt_list_indices; + c10::List> opt_list_indices; opt_list_indices.reserve(indices.size()); for (const auto& ten : indices) { opt_list_indices.push_back(ten); @@ -1236,7 +1236,7 @@ static const std::vector opGenArgs{ auto accumulate = pop(stack).toBool(); auto values = pop(stack).toTensor(); auto indices = pop(stack).to>(); - c10::List> opt_list_indices; + c10::List> opt_list_indices; opt_list_indices.reserve(indices.size()); for (const auto& ten : indices) { opt_list_indices.push_back(ten); @@ -1254,7 +1254,7 @@ static const std::vector opGenArgs{ auto accumulate = pop(stack).toBool(); auto values = pop(stack).toTensor(); auto indices = pop(stack).to>(); - c10::List> opt_list_indices; + c10::List> opt_list_indices; opt_list_indices.reserve(indices.size()); for 
(const auto& ten : indices) { opt_list_indices.push_back(ten); @@ -1275,9 +1275,9 @@ static const std::vector opGenArgs{ // NOLINTNEXTLINE(cppcoreguidelines-init-variables) bool copy; pop(stack, non_blocking, copy); - c10::optional scalarType = + std::optional scalarType = pop(stack).toOptional(); - c10::optional device = + std::optional device = pop(stack).toOptional(); at::Tensor self = pop(stack).toTensor(); push( @@ -1404,9 +1404,9 @@ static const std::vector opGenArgs{ } }))}; -static std::vector> createOperators( +static std::vector> createOperators( const std::vector& args) { - std::vector> result; + std::vector> result; result.reserve(args.size()); for (const auto& arg : args) { if (arg.schema_str) { @@ -1769,8 +1769,8 @@ static const std::vector stringOpGenArgs{ "aten::slice.str(str string, int? start=None, int? end=None, int step=1) -> str"), [](Stack& stack) { int64_t step = pop(stack).toInt(); - c10::optional end = pop(stack).toOptional(); - c10::optional start = pop(stack).toOptional(); + std::optional end = pop(stack).toOptional(); + std::optional start = pop(stack).toOptional(); std::string string = pop(stack).toStringRef(); push(stack, stringSlice(string, start, end, step)); }, @@ -2397,7 +2397,7 @@ static const std::vector stringOpGenArgs{ for (const auto& v : ivalues) { values.emplace_back(v.toStringRef()); } - c10::optional opt_string = + std::optional opt_string = pop(stack).toOptional(); const std::string& string = opt_string.value_or(""); std::stringstream ss; @@ -2463,8 +2463,8 @@ static const std::vector opGenArgs1{ // NOLINTNEXTLINE(cppcoreguidelines-init-variables) bool copy; pop(stack, self, non_blocking, copy); - c10::optional device = c10::nullopt; - c10::optional scalarType = c10::nullopt; + std::optional device = c10::nullopt; + std::optional scalarType = c10::nullopt; push( stack, to_dispatch(self, device, scalarType, non_blocking, copy)); }, diff --git a/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp b/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp index d48a981666c83..4359b852b6a38 100644 --- a/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp +++ b/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp @@ -427,8 +427,8 @@ at::Tensor interpolate( const IValue& size, const IValue& scale_factors, const std::string& mode, - c10::optional align_corners, - c10::optional recompute_scale_factor) { + std::optional align_corners, + std::optional recompute_scale_factor) { if ((mode == "nearest" || mode == "area")) { if (align_corners != c10::nullopt) { throw std::runtime_error( diff --git a/torch/csrc/jit/runtime/register_special_ops.cpp b/torch/csrc/jit/runtime/register_special_ops.cpp index 5e33d8cf27d39..5b8c70c404ae9 100644 --- a/torch/csrc/jit/runtime/register_special_ops.cpp +++ b/torch/csrc/jit/runtime/register_special_ops.cpp @@ -406,7 +406,7 @@ RegisterOperators reg({ double a; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) double b; - c10::optional generator = + std::optional generator = pop(stack).toOptional(); pop(stack, tensor, a, b); @@ -425,7 +425,7 @@ RegisterOperators reg({ double mean; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) double std; - c10::optional generator = + std::optional generator = pop(stack).toOptional(); pop(stack, tensor, mean, std); diff --git a/torch/csrc/jit/runtime/simple_graph_executor_impl.cpp b/torch/csrc/jit/runtime/simple_graph_executor_impl.cpp index 742915995469e..c1dbbddc6d337 100644 --- a/torch/csrc/jit/runtime/simple_graph_executor_impl.cpp +++ 
b/torch/csrc/jit/runtime/simple_graph_executor_impl.cpp @@ -13,7 +13,7 @@ SimpleGraphExecutorImpl::SimpleGraphExecutorImpl( const ExecutionPlan& SimpleGraphExecutorImpl::getPlanFor( Stack& stack, - c10::optional remaining_bailout_depth) { + std::optional remaining_bailout_depth) { std::lock_guard lock(compile_mutex); // IMPORTANT: This is a hot path of calling a torchscript function. Try not to diff --git a/torch/csrc/jit/runtime/simple_graph_executor_impl.h b/torch/csrc/jit/runtime/simple_graph_executor_impl.h index 34272000f0d1a..e1ebed46ede80 100644 --- a/torch/csrc/jit/runtime/simple_graph_executor_impl.h +++ b/torch/csrc/jit/runtime/simple_graph_executor_impl.h @@ -12,12 +12,12 @@ struct TORCH_API SimpleGraphExecutorImpl : public GraphExecutorImplBase { const ExecutionPlan& getPlanFor( Stack& stack, - c10::optional remaining_bailout_depth) override; + std::optional remaining_bailout_depth) override; GraphExecutorState getDebugState() override; ~SimpleGraphExecutorImpl() override = default; private: - c10::optional execution_plan_; + std::optional execution_plan_; }; } // namespace torch::jit diff --git a/torch/csrc/jit/runtime/static/fusion.cpp b/torch/csrc/jit/runtime/static/fusion.cpp index 5ba3b1a0268f2..ffac37efc9b76 100644 --- a/torch/csrc/jit/runtime/static/fusion.cpp +++ b/torch/csrc/jit/runtime/static/fusion.cpp @@ -168,7 +168,7 @@ static void debugDumpFusionGroup(const std::string& msg, Node* n) { } } -static c10::optional tryMerge( +static std::optional tryMerge( Node* fusion_group, Node* to_merge, AliasDb* aliasDb) { diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp index 9f62d631bce88..193675672f6b8 100644 --- a/torch/csrc/jit/runtime/static/impl.cpp +++ b/torch/csrc/jit/runtime/static/impl.cpp @@ -286,7 +286,7 @@ void PrepareGraphForStaticModule( ForceNonEmptyOutputs(*graph); } -std::pair, c10::optional> PrepareForStaticModule( +std::pair, std::optional> PrepareForStaticModule( const torch::jit::Module& m, bool is_frozen, const StaticModuleOptions& opts, @@ -316,7 +316,7 @@ std::pair, c10::optional> PrepareForStaticModule( return std::make_pair(graph, module); } -std::pair, c10::optional> PrepareForStaticModule( +std::pair, std::optional> PrepareForStaticModule( std::shared_ptr graph, const StaticModuleOptions& opts, std::vector sample_inputs) { @@ -544,7 +544,7 @@ StaticModule::StaticModule( opts) {} StaticModule::StaticModule( - std::pair, c10::optional> + std::pair, std::optional> graph_and_module, const StaticModuleOptions& opts) : opts_(opts), diff --git a/torch/csrc/jit/runtime/static/impl.h b/torch/csrc/jit/runtime/static/impl.h index 48af8ef02afbf..2e840e582a0a1 100644 --- a/torch/csrc/jit/runtime/static/impl.h +++ b/torch/csrc/jit/runtime/static/impl.h @@ -417,7 +417,7 @@ class TORCH_API StaticModule { private: explicit StaticModule( - std::pair, c10::optional> + std::pair, std::optional> graph_and_module, const StaticModuleOptions& opts); @@ -490,7 +490,7 @@ class TORCH_API StaticModule { C10_NODISCARD Node* findNodeWithKindForTesting(const std::string& kind) const; - const c10::optional& schema() const { + const std::optional& schema() const { return schema_; } @@ -539,8 +539,8 @@ class TORCH_API StaticModule { // metadata that is stored in IR nodes as attribute at::intrusive_ptr sr_metadata_; std::shared_ptr graph_; - c10::optional module_; - c10::optional schema_; + std::optional module_; + std::optional schema_; std::unique_ptr cached_runtime_; // Bookkeeping for creating new StaticRuntime instances diff --git 
a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index b4f4c38c2aaf5..b1b8a081c4ce6 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -209,7 +209,7 @@ at::Tensor& to_copy_out( const Tensor& self, bool non_blocking, bool copy_strides, - c10::optional memory_format) { + std::optional memory_format) { if (copy_strides) { at::native::resize_impl_cpu_( out.unsafeGetTensorImpl(), self.sizes(), self.strides()); @@ -259,7 +259,7 @@ static Tensor& linear_out( Tensor& output, const Tensor& input, const Tensor& weight, - const c10::optional& bias_opt) { + const std::optional& bias_opt) { TORCH_CHECK(!input.is_mkldnn()); auto bias = bias_opt.has_value() @@ -1048,7 +1048,7 @@ REGISTER_OPERATOR_FUNCTOR(aten::logit, aten_logit, [](Node* n) -> SROperator { LogAndDumpSchema(n); return nullptr; } - c10::optional clamp = c10::nullopt; + std::optional clamp = c10::nullopt; if (n->inputs()[1]->node()->kind() == prim::Constant) { auto clamp_d = toIValue(n->inputs()[1])->toOptional(); clamp = clamp_d @@ -1353,10 +1353,10 @@ namespace { // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) struct ToArgs { - c10::optional dtype; + std::optional dtype; c10::Layout layout; bool know_to_will_alias = false; - c10::optional memory_format; + std::optional memory_format; }; template @@ -1440,8 +1440,8 @@ C10_ALWAYS_INLINE void to_copy_functor_impl( // handle memory format bool copy_strides = false; - c10::optional memory_format = c10::MemoryFormat::Preserve; - c10::optional my_args; + std::optional memory_format = c10::MemoryFormat::Preserve; + std::optional my_args; if (!args) { my_args = extract_to_args< has_constant_non_tensor_dtype_and_flags, @@ -1905,7 +1905,7 @@ REGISTER_OPERATOR_FUNCTOR(aten::div, aten_div, [](Node* n) -> SROperator { return [te = createDiv()](ProcessedNode* p_node) { const auto& in0_t = p_node->Input(0).toTensor(); - c10::optional rounding_mode = c10::nullopt; + std::optional rounding_mode = c10::nullopt; if (p_node->num_inputs() > 2) { rounding_mode = p_node->Input(2).toOptional(); } @@ -2396,8 +2396,8 @@ REGISTER_OPERATOR_FUNCTOR( // device & pin_memory matter only when CUDA is enabled. 
static bool hasTensorWithOptions( const IValue& ivalue, - c10::optional dtype, - c10::optional layout) { + std::optional dtype, + std::optional layout) { if (!ivalue.isTensor()) { return false; } @@ -2412,9 +2412,9 @@ static bool hasTensorWithOptions( static bool hasTensorWithOptions( const IValue& ivalue, - c10::optional dtype, - c10::optional layout, - c10::optional memory_format) { + std::optional dtype, + std::optional layout, + std::optional memory_format) { return hasTensorWithOptions(ivalue, dtype, layout) && (memory_format == ivalue.toTensor().options().memory_format_opt()); } diff --git a/torch/csrc/jit/runtime/static/ops.h b/torch/csrc/jit/runtime/static/ops.h index 53aa0dc787d1b..362837e7ce78f 100644 --- a/torch/csrc/jit/runtime/static/ops.h +++ b/torch/csrc/jit/runtime/static/ops.h @@ -15,7 +15,7 @@ at::Tensor& to_copy_out( const Tensor& self, bool non_blocking, bool copy_strides, - c10::optional memory_format); + std::optional memory_format); } // namespace at::native namespace torch::jit { diff --git a/torch/csrc/jit/runtime/symbolic_script.cpp b/torch/csrc/jit/runtime/symbolic_script.cpp index ff8513f016daf..6aa65c528a42b 100644 --- a/torch/csrc/jit/runtime/symbolic_script.cpp +++ b/torch/csrc/jit/runtime/symbolic_script.cpp @@ -1614,7 +1614,7 @@ static void loadFunctions() { loadModule(compilation_unit); } -c10::optional gradientInfoForSchema( +std::optional gradientInfoForSchema( const FunctionSchema& schema) { std::lock_guard guard(lock); if (schema_to_graphs.empty()) { diff --git a/torch/csrc/jit/runtime/symbolic_script.h b/torch/csrc/jit/runtime/symbolic_script.h index 64e0d6661baeb..271bf66916f3d 100644 --- a/torch/csrc/jit/runtime/symbolic_script.h +++ b/torch/csrc/jit/runtime/symbolic_script.h @@ -12,7 +12,7 @@ struct GradientPair { std::shared_ptr backward; }; -TORCH_API c10::optional gradientInfoForSchema( +TORCH_API std::optional gradientInfoForSchema( const FunctionSchema& schema); TORCH_API bool hasGradientInfoForSchema(const FunctionSchema& schema); } // namespace torch::jit diff --git a/torch/csrc/jit/runtime/symbolic_shape_registry.cpp b/torch/csrc/jit/runtime/symbolic_shape_registry.cpp index 5e380c1f437a7..ddea031aba73c 100644 --- a/torch/csrc/jit/runtime/symbolic_shape_registry.cpp +++ b/torch/csrc/jit/runtime/symbolic_shape_registry.cpp @@ -377,7 +377,7 @@ void loadFunctions() { } } // anonymous namespace -c10::optional> shapeComputeGraphForSchema( +std::optional> shapeComputeGraphForSchema( const FunctionSchema& schema) { std::lock_guard guard(lock); if (cached_schema_to_graph.empty()) { @@ -394,7 +394,7 @@ c10::optional> shapeComputeGraphForSchema( return c10::nullopt; } -TORCH_API c10::optional boundedGraphsForSchema( +TORCH_API std::optional boundedGraphsForSchema( const FunctionSchema& schema) { std::lock_guard guard(lock); if (cached_bounded_schema_to_graph.empty()) { diff --git a/torch/csrc/jit/runtime/symbolic_shape_registry.h b/torch/csrc/jit/runtime/symbolic_shape_registry.h index 2d09eb27876b7..a14d327aab429 100644 --- a/torch/csrc/jit/runtime/symbolic_shape_registry.h +++ b/torch/csrc/jit/runtime/symbolic_shape_registry.h @@ -54,10 +54,10 @@ TORCH_API void RegisterShapeComputeGraphForSchema( const FunctionSchema& schema, std::shared_ptr g); -TORCH_API c10::optional> shapeComputeGraphForSchema( +TORCH_API std::optional> shapeComputeGraphForSchema( const FunctionSchema& schema); -TORCH_API c10::optional boundedGraphsForSchema( +TORCH_API std::optional boundedGraphsForSchema( const FunctionSchema& schema); TORCH_API std::vector 
RegisteredShapeComputeSchemas(); diff --git a/torch/csrc/jit/serialization/callstack_debug_info_serialization.cpp b/torch/csrc/jit/serialization/callstack_debug_info_serialization.cpp index 7674c5324ce9f..4a326285b2974 100644 --- a/torch/csrc/jit/serialization/callstack_debug_info_serialization.cpp +++ b/torch/csrc/jit/serialization/callstack_debug_info_serialization.cpp @@ -59,7 +59,7 @@ c10::IValue InlinedCallStackSerializer::serialize( } c10::IValue InlinedCallStackSerializer::serialize_module_instance_info( - const c10::optional& m) { + const std::optional& m) { if (!m) { return c10::IValue(); } @@ -168,7 +168,7 @@ InlinedCallStackPtr InlinedCallStackDeserializer::deserialize( return cs_ptr; } -c10::optional InlinedCallStackDeserializer:: +std::optional InlinedCallStackDeserializer:: deserialize_module_instance_info( const c10::IValue& iv, const std::shared_ptr& cu) { diff --git a/torch/csrc/jit/serialization/callstack_debug_info_serialization.h b/torch/csrc/jit/serialization/callstack_debug_info_serialization.h index ac1bdf8d3b1d8..46fd2850d20bd 100644 --- a/torch/csrc/jit/serialization/callstack_debug_info_serialization.h +++ b/torch/csrc/jit/serialization/callstack_debug_info_serialization.h @@ -32,7 +32,7 @@ class InlinedCallStackSerializer { private: // module_info = [ClassType.qualifiedName, instance_name] c10::IValue serialize_module_instance_info( - const c10::optional& m); + const std::optional& m); // This caches serialized inlined callstack ptr, since many // InlinedCallStackPtr can refer to the same one. @@ -64,7 +64,7 @@ class InlinedCallStackDeserializer { const std::shared_ptr& cu); private: - c10::optional deserialize_module_instance_info( + std::optional deserialize_module_instance_info( const c10::IValue& iv, const std::shared_ptr& cu); diff --git a/torch/csrc/jit/serialization/export.cpp b/torch/csrc/jit/serialization/export.cpp index c23e3b52bfb1b..6ef9bdbf4abfa 100644 --- a/torch/csrc/jit/serialization/export.cpp +++ b/torch/csrc/jit/serialization/export.cpp @@ -145,26 +145,6 @@ void validateBlock( "\n\nDefined at:\n" + getNodeStackTraceString(node)) } } else { -#ifdef BUILD_CAFFE2 - // Assuming this is a Caffe2 change as it only modifies an aten op - // for operator_export_type == ONNX_ATEN_FALLBACK, which is a common - // pattern for Caffe2-specific scenarios. 
- if (node->kind() == aten::expand) { - if (operator_export_type == - onnx_torch::OperatorExportTypes::ONNX_ATEN_FALLBACK) { - WithInsertPoint guard(node); - auto* new_node = - b->owningGraph()->insertNode(b->owningGraph()->create( - Symbol(::c10::aten::ATen), - node->inputs(), - node->outputs().size())); - for (size_t i = 0; i < node->outputs().size(); ++i) { - node->output(i)->replaceAllUsesWith(new_node->output(i)); - } - new_node->s_(Symbol::fromQualString("attr::operator"), "expand"); - } - } -#endif if (node->kind() == prim::PackPadded || node->kind() == prim::PadPacked) { if (operator_export_type != onnx_torch::OperatorExportTypes::ONNX_FALLTHROUGH) { @@ -209,7 +189,7 @@ std::string GetFileRootPath(const std::string& rootPath) { } std::string GetExternalFileName( - const c10::optional& external_ref) { + const std::optional& external_ref) { auto tensorName = external_ref.value(); const std::string illegalChars = "\\/:?\"<>|"; for (char& i : tensorName) { @@ -363,7 +343,7 @@ class GraphEncoder { void EncodeTensor( onnx::TensorProto* tensor_proto, const at::Tensor& tensor, - const c10::optional external_ref = {}, + const std::optional external_ref = {}, const bool use_external_data_format = false, const std::string& onnx_file_path = std::string()); @@ -1300,7 +1280,7 @@ void GraphEncoder::EncodeTypeProto( void GraphEncoder::EncodeTensor( onnx::TensorProto* tensor_proto, const at::Tensor& tensor, - const c10::optional external_ref, + const std::optional external_ref, const bool use_external_data_format, const std::string& onnx_file_path) { for (auto d : tensor.sizes()) { diff --git a/torch/csrc/jit/serialization/export.h b/torch/csrc/jit/serialization/export.h index 3a56cfc7788fb..9a7ab2c4fcc87 100644 --- a/torch/csrc/jit/serialization/export.h +++ b/torch/csrc/jit/serialization/export.h @@ -30,6 +30,7 @@ namespace jit { using RawDataExportMap = std::unordered_map; using SymbolDimMap = std::map; +using DimSymbolMap = std::map; using NodeNameMap = std::unordered_map; diff --git a/torch/csrc/jit/serialization/export_bytecode.cpp b/torch/csrc/jit/serialization/export_bytecode.cpp index 9ec2dbcaa2da3..9f194cd0ad31b 100644 --- a/torch/csrc/jit/serialization/export_bytecode.cpp +++ b/torch/csrc/jit/serialization/export_bytecode.cpp @@ -166,7 +166,7 @@ mobile::Code compileGraphToMobileCode( // and is not allowed. For an operator with num_args = -1, it means the // number of arguments is not available for this operator, we don't do any // backward compatibility adaptation at runtime. 
- c10::optional num_args = c10::nullopt; + std::optional num_args = c10::nullopt; auto it = op_to_specified_args.find(unique_name); if (it != op_to_specified_args.end()) { num_args = it->second; diff --git a/torch/csrc/jit/serialization/export_module.cpp b/torch/csrc/jit/serialization/export_module.cpp index cdb878d4062c8..5bd7714c4e8d2 100644 --- a/torch/csrc/jit/serialization/export_module.cpp +++ b/torch/csrc/jit/serialization/export_module.cpp @@ -254,7 +254,7 @@ std::pair getFunctionTuple( // schema const auto& schema = func.getSchema(); - auto type_printer = [&](const c10::Type& t) -> c10::optional { + auto type_printer = [&](const c10::Type& t) -> std::optional { auto namedType = t.cast(); if (namedType && namedType->name()) { return type_name_uniquer_.getUniqueName(namedType).qualifiedName(); @@ -313,7 +313,7 @@ std::pair getFunctionTuple( } auto bytecode_vals = to_tuple({qn, codeTable, schemaTable}); - c10::optional debug_info_vals; + std::optional debug_info_vals; // module debug info // This is just a set of debug handles. // We always save debug handles. @@ -754,7 +754,7 @@ void ScriptModuleSerializer::writeByteCode( namespace { -c10::optional type_printer( +std::optional type_printer( const c10::Type& type, torch::jit::TypeNameUniquer& type_name_uniquer) { if (auto dyn = type.castRaw()) { diff --git a/torch/csrc/jit/serialization/flatbuffer_serializer.cpp b/torch/csrc/jit/serialization/flatbuffer_serializer.cpp index a3dada3c715f0..5a47fe900f3fd 100644 --- a/torch/csrc/jit/serialization/flatbuffer_serializer.cpp +++ b/torch/csrc/jit/serialization/flatbuffer_serializer.cpp @@ -61,7 +61,7 @@ static TypePtr realType(TypePtr type) { } } -auto print_type(const c10::Type& t) -> c10::optional { +auto print_type(const c10::Type& t) -> std::optional { auto namedType = t.cast(); if (namedType && namedType->name()) { return namedType->name().value().qualifiedName(); @@ -298,7 +298,7 @@ flatbuffers::Offset FlatbufferSerializer:: auto register_size = static_cast(code.register_size_); // schema - auto type_printer = [&](const c10::Type& t) -> c10::optional { + auto type_printer = [&](const c10::Type& t) -> std::optional { auto namedType = t.cast(); if (namedType && namedType->name()) { return namedType->name().value().qualifiedName(); diff --git a/torch/csrc/jit/serialization/import.cpp b/torch/csrc/jit/serialization/import.cpp index e724853e70c1c..40d155e61c758 100644 --- a/torch/csrc/jit/serialization/import.cpp +++ b/torch/csrc/jit/serialization/import.cpp @@ -152,7 +152,7 @@ class ScriptModuleDeserializer final { reader_->version()) {} Module deserialize( - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, bool restore_shapes = false); @@ -162,7 +162,7 @@ class ScriptModuleDeserializer final { std::shared_ptr compilation_unit_; std::shared_ptr reader_; std::shared_ptr storage_context_; - c10::optional device_; + std::optional device_; std::vector constants_table_; std::string code_prefix_; std::string pickle_dir_prefix_; @@ -248,7 +248,7 @@ graph(%x, %packed_params, %stride, %padding, %dilation, %groups, %r_scale, %r_ze } Module ScriptModuleDeserializer::deserialize( - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, bool restore_shapes) { // we populate the upgraders map before any load starts @@ -311,7 +311,7 @@ Module ScriptModuleDeserializer::deserialize( Module import_ir_module( std::shared_ptr cu, std::istream& in, - c10::optional device, + std::optional device, bool load_debug_files) { ExtraFilesMap extra_files; return 
import_ir_module( @@ -322,7 +322,7 @@ static Module _load_jit_module_from_bytes( std::shared_ptr data, size_t size, std::shared_ptr cu, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, bool restore_shapes); @@ -330,7 +330,7 @@ Module parse_and_initialize_jit_module( std::shared_ptr data, size_t size, ExtraFilesMap& extra_files, - c10::optional device) { + std::optional device) { populate_upgraders_graph_map(); ExtraFilesMap jit_files; std::vector jit_constants; @@ -349,7 +349,7 @@ Module parse_and_initialize_jit_module( Module load_jit_module_from_file( const std::string& filename, ExtraFilesMap& extra_files, - c10::optional device) { + std::optional device) { auto data = get_file_content(filename.c_str()); return parse_and_initialize_jit_module( std::move(std::get<0>(data)), std::get<1>(data), extra_files, device); @@ -358,7 +358,7 @@ Module load_jit_module_from_file( Module load_jit_module_from_stream( std::istream& in, ExtraFilesMap& extra_files, - c10::optional device) { + std::optional device) { auto data = get_stream_content(in); return parse_and_initialize_jit_module( std::move(std::get<0>(data)), std::get<1>(data), extra_files, device); @@ -367,7 +367,7 @@ Module load_jit_module_from_stream( Module import_ir_module( std::shared_ptr cu, std::istream& in, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, bool load_debug_files, bool restore_shapes) { @@ -390,7 +390,7 @@ Module import_ir_module( std::shared_ptr cu, std::shared_ptr reader, std::shared_ptr storage_context, - c10::optional device, + std::optional device, std::string ts_id) { ScriptModuleDeserializer deserializer( std::move(cu), @@ -405,7 +405,7 @@ Module import_ir_module( Module import_ir_module( std::shared_ptr cu, const std::string& filename, - c10::optional device, + std::optional device, bool load_debug_files) { ExtraFilesMap extra_files; return import_ir_module( @@ -415,7 +415,7 @@ Module import_ir_module( Module import_ir_module( std::shared_ptr cu, const std::string& filename, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, bool load_debug_files, bool restore_shapes) { @@ -435,7 +435,7 @@ Module import_ir_module( Module import_ir_module( std::shared_ptr cu, std::unique_ptr rai, - c10::optional device, + std::optional device, bool load_debug_files) { ExtraFilesMap extra_files; return import_ir_module( @@ -445,7 +445,7 @@ Module import_ir_module( Module import_ir_module( std::shared_ptr cu, std::unique_ptr rai, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, bool load_debug_files) { std::shared_ptr rai_shared = std::move(rai); @@ -456,7 +456,7 @@ Module import_ir_module( Module import_ir_module( std::shared_ptr cu, std::shared_ptr rai, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, bool load_debug_files) { auto reader = std::make_shared(std::move(rai)); @@ -467,7 +467,7 @@ Module import_ir_module( Module load( std::istream& in, - c10::optional device, + std::optional device, bool load_debug_files) { auto cu = std::make_shared(); return import_ir_module(std::move(cu), in, device, load_debug_files); @@ -475,7 +475,7 @@ Module load( Module load( std::istream& in, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, bool load_debug_files) { auto cu = std::make_shared(); @@ -485,7 +485,7 @@ Module load( Module load( const std::string& filename, - c10::optional device, + std::optional device, bool load_debug_files) { auto cu = std::make_shared(); return 
import_ir_module(std::move(cu), filename, device, load_debug_files); @@ -493,7 +493,7 @@ Module load( Module load( const std::string& filename, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, bool load_debug_files) { auto cu = std::make_shared(); @@ -503,7 +503,7 @@ Module load( Module load( std::shared_ptr rai, - c10::optional device, + std::optional device, bool load_debug_files) { auto cu = std::make_shared(); ExtraFilesMap extra_files; @@ -513,7 +513,7 @@ Module load( Module load( std::shared_ptr rai, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, bool load_debug_files) { auto cu = std::make_shared(); @@ -525,7 +525,7 @@ Module _load_jit_module_from_bytes( std::shared_ptr data, size_t size, std::shared_ptr cu, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, bool restore_shapes) { TORCH_CHECK(size >= kFileFormatHeaderSize, "Unrecognized data format"); diff --git a/torch/csrc/jit/serialization/import.h b/torch/csrc/jit/serialization/import.h index c8379f38810f7..b090a1c80a3cd 100644 --- a/torch/csrc/jit/serialization/import.h +++ b/torch/csrc/jit/serialization/import.h @@ -21,25 +21,25 @@ class DeserializationStorageContext; TORCH_API Module import_ir_module( std::shared_ptr cu, const std::string& filename, - c10::optional device = c10::nullopt, + std::optional device = c10::nullopt, bool load_debug_files = true); TORCH_API Module import_ir_module( std::shared_ptr cu, std::istream& in, - c10::optional device = c10::nullopt, + std::optional device = c10::nullopt, bool load_debug_files = true); TORCH_API Module import_ir_module( std::shared_ptr cu, std::unique_ptr rai, - c10::optional device = c10::nullopt, + std::optional device = c10::nullopt, bool load_debug_files = true); TORCH_API Module import_ir_module( std::shared_ptr cu, const std::string& filename, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, bool load_debug_files = true, bool restore_shapes = false); @@ -49,13 +49,13 @@ TORCH_API Module import_ir_module( std::shared_ptr cu, std::shared_ptr reader, std::shared_ptr storage_context, - c10::optional device, + std::optional device, std::string ts_id /* torchscript identifier inside package */); TORCH_API Module import_ir_module( std::shared_ptr cu, std::istream& in, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, bool load_debug_files = true, bool restore_shapes = false); @@ -63,14 +63,14 @@ TORCH_API Module import_ir_module( TORCH_API Module import_ir_module( std::shared_ptr cu, std::unique_ptr rai, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, bool load_debug_files = true); TORCH_API Module import_ir_module( std::shared_ptr cu, std::shared_ptr rai, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, bool load_debug_files = true); @@ -80,12 +80,12 @@ TORCH_API Module import_ir_module( /// `torch::jit::ExportModule` in C++. TORCH_API Module load( std::istream& in, - c10::optional device = c10::nullopt, + std::optional device = c10::nullopt, bool load_debug_files = true); TORCH_API Module load( std::istream& in, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, bool load_debug_files = true); @@ -96,12 +96,12 @@ TORCH_API Module load( /// Python or `torch::jit::ExportModule` in C++. 
TORCH_API Module load( const std::string& filename, - c10::optional device = c10::nullopt, + std::optional device = c10::nullopt, bool load_debug_files = true); TORCH_API Module load( const std::string& filename, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, bool load_debug_files = true); @@ -112,12 +112,12 @@ TORCH_API Module load( /// Python or `torch::jit::ExportModule` in C++. TORCH_API Module load( std::shared_ptr rai, - c10::optional device = c10::nullopt, + std::optional device = c10::nullopt, bool load_debug_files = true); TORCH_API Module load( std::shared_ptr rai, - c10::optional device, + std::optional device, ExtraFilesMap& extra_files, bool load_debug_files = true); @@ -131,23 +131,23 @@ TORCH_API Module parse_and_initialize_jit_module( std::shared_ptr data, size_t size, ExtraFilesMap& extra_files, - c10::optional device = c10::nullopt); + std::optional device = c10::nullopt); TORCH_API Module load_jit_module_from_file( const std::string& filename, ExtraFilesMap& extra_files, - c10::optional device = c10::nullopt); + std::optional device = c10::nullopt); TORCH_API Module load_jit_module_from_stream( std::istream& in, ExtraFilesMap& extra_files, - c10::optional device = c10::nullopt); + std::optional device = c10::nullopt); TORCH_API Module parse_and_initialize_jit_module( std::shared_ptr data, size_t size, ExtraFilesMap& extra_files, - c10::optional device); + std::optional device); TORCH_API c10::intrusive_ptr ObjLoaderFunc( const at::StrongTypePtr& type, diff --git a/torch/csrc/jit/serialization/import_legacy.cpp b/torch/csrc/jit/serialization/import_legacy.cpp index 85ec2675a9c23..d7c592d18c72f 100644 --- a/torch/csrc/jit/serialization/import_legacy.cpp +++ b/torch/csrc/jit/serialization/import_legacy.cpp @@ -41,7 +41,7 @@ class ScriptModuleDeserializer final { ScriptModuleDeserializer( std::shared_ptr cu, std::shared_ptr reader, - const c10::optional& device) + const std::optional& device) : compilation_unit_(std::move(cu)), reader_(std::move(reader)), device_(device), @@ -77,7 +77,7 @@ class ScriptModuleDeserializer final { std::shared_ptr compilation_unit_; std::shared_ptr reader_; - c10::optional device_; + std::optional device_; // Legacy only tensor can be a constant. 
std::vector constant_table_; std::vector tensor_table_; @@ -377,7 +377,7 @@ Module ScriptModuleDeserializer::LEGACY_convertModule( Module LEGACY_deserialize( std::shared_ptr cu, std::shared_ptr reader, - const c10::optional& device) { + const std::optional& device) { ScriptModuleDeserializer deserializer( std::move(cu), std::move(reader), device); return deserializer.LEGACY_deserialize(); diff --git a/torch/csrc/jit/serialization/import_legacy.h b/torch/csrc/jit/serialization/import_legacy.h index a261828109596..2e206eae09bcf 100644 --- a/torch/csrc/jit/serialization/import_legacy.h +++ b/torch/csrc/jit/serialization/import_legacy.h @@ -17,7 +17,7 @@ struct CompilationUnit; Module LEGACY_deserialize( std::shared_ptr cu, std::shared_ptr reader, - const c10::optional& device); + const std::optional& device); } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/serialization/import_read.cpp b/torch/csrc/jit/serialization/import_read.cpp index 533fed491773f..eeaa79c856627 100644 --- a/torch/csrc/jit/serialization/import_read.cpp +++ b/torch/csrc/jit/serialization/import_read.cpp @@ -7,9 +7,9 @@ IValue readArchiveAndTensors( const std::string& archive_name, const std::string& pickle_prefix, const std::string& tensor_prefix, - c10::optional type_resolver, - c10::optional obj_loader, - c10::optional device, + std::optional type_resolver, + std::optional obj_loader, + std::optional device, caffe2::serialize::PyTorchStreamReader& stream_reader, c10::TypePtr (*type_parser)(const std::string&), std::shared_ptr storage_context) { diff --git a/torch/csrc/jit/serialization/import_read.h b/torch/csrc/jit/serialization/import_read.h index ab89f93880c34..ae78f1979f10a 100644 --- a/torch/csrc/jit/serialization/import_read.h +++ b/torch/csrc/jit/serialization/import_read.h @@ -16,9 +16,9 @@ TORCH_API IValue readArchiveAndTensors( const std::string& archive_name, const std::string& pickle_prefix, const std::string& tensor_prefix, - c10::optional type_resolver, - c10::optional obj_loader, - c10::optional device, + std::optional type_resolver, + std::optional obj_loader, + std::optional device, caffe2::serialize::PyTorchStreamReader& stream_reader, c10::TypePtr (*type_parser)(const std::string&) = Unpickler::defaultTypeParser, diff --git a/torch/csrc/jit/serialization/import_source.cpp b/torch/csrc/jit/serialization/import_source.cpp index 53d0d9fd47359..f67c2a22e9eb1 100644 --- a/torch/csrc/jit/serialization/import_source.cpp +++ b/torch/csrc/jit/serialization/import_source.cpp @@ -304,7 +304,7 @@ void SourceImporterImpl::importNamedType( } } -c10::optional SourceImporterImpl:: +std::optional SourceImporterImpl:: attributeAssignmentSpecialHandlingHack( const QualifiedName& qualified_classname, const Assign& assign) { @@ -703,7 +703,7 @@ void SourceImporterImpl::importNamedTuple( const auto assign = Assign(statement); auto name = Var(Assign(statement).lhs()).name().name(); - c10::optional default_val; + std::optional default_val; if (assign.rhs().present()) { std::vector parsed = type_parser.evaluateDefaults( assign.rhs().range(), {assign.rhs().get()}, {assign.type().get()}); diff --git a/torch/csrc/jit/serialization/import_source.h b/torch/csrc/jit/serialization/import_source.h index 9a720a81bcbb2..9b364f379b409 100644 --- a/torch/csrc/jit/serialization/import_source.h +++ b/torch/csrc/jit/serialization/import_source.h @@ -45,7 +45,7 @@ struct SourceImporterImpl : public Resolver, private: void importFunction(const std::string& qualifier, const Def& def); void importNamedType(const std::string& 
qualifier, const ClassDef& class_def); - c10::optional attributeAssignmentSpecialHandlingHack( + std::optional attributeAssignmentSpecialHandlingHack( const QualifiedName& qualified_classname, const Assign& assign); void importClass( @@ -66,7 +66,7 @@ struct SourceImporterImpl : public Resolver, std::shared_ptr cu_; std::unordered_map> env_; SourceLoader source_loader_; - c10::optional version_ = c10::nullopt; + std::optional version_ = c10::nullopt; std::unordered_set loaded_sources_; // named types and functions loaded from a file but not yet defined because // their type has not been requested yet. diff --git a/torch/csrc/jit/serialization/pickler.cpp b/torch/csrc/jit/serialization/pickler.cpp index 6e1b399e40fd4..173ab5c13e5da 100644 --- a/torch/csrc/jit/serialization/pickler.cpp +++ b/torch/csrc/jit/serialization/pickler.cpp @@ -601,7 +601,7 @@ void Pickler::startTypeTag() { } } namespace { -c10::optional type_printer(const c10::Type& type) { +std::optional type_printer(const c10::Type& type) { if (auto dyn = type.castRaw()) { return dyn->fallback()->annotation_str(type_printer); } diff --git a/torch/csrc/jit/serialization/pickler.h b/torch/csrc/jit/serialization/pickler.h index 4f553b6f7ca8a..39726d00b0998 100644 --- a/torch/csrc/jit/serialization/pickler.h +++ b/torch/csrc/jit/serialization/pickler.h @@ -311,14 +311,14 @@ inline std::unordered_set& GetBackendMetaAllowlist() { // Dynamically obtain serialization function pairs // that require the corresponding backend. inline std::array< - c10::optional>, + std::optional>, at::COMPILE_TIME_MAX_DEVICE_TYPES>& GetBackendMetaSerialization() { // The array to save function pointer for BackendMeta serialization. // key is the DeviceType, value is std::pair obj. // value.first represent get function and value.seconde represent set function static std::array< - c10::optional>, + std::optional>, at::COMPILE_TIME_MAX_DEVICE_TYPES> BackendMetaSerialization; return BackendMetaSerialization; @@ -348,7 +348,7 @@ TORCH_API inline void TensorBackendMetaRegistry( t, " has been registered."); BackendMetaSerialization[device_type] = - c10::optional>( + std::optional>( std::make_pair(get_fptr, set_fptr)); } diff --git a/torch/csrc/jit/serialization/python_print.cpp b/torch/csrc/jit/serialization/python_print.cpp index cac31c6ce5868..f1b0865032c39 100644 --- a/torch/csrc/jit/serialization/python_print.cpp +++ b/torch/csrc/jit/serialization/python_print.cpp @@ -1714,7 +1714,7 @@ static std::vector traverseIValueAndGetObjects(IValue ivalue) { return result; } -static c10::optional printType( +static std::optional printType( const c10::Type& type, torch::jit::TypeNameUniquer& type_name_uniquer) { if (auto dyn = type.castRaw()) { diff --git a/torch/csrc/jit/serialization/source_range_serialization.cpp b/torch/csrc/jit/serialization/source_range_serialization.cpp index d3c4eaf7bf491..118becd20dc7c 100644 --- a/torch/csrc/jit/serialization/source_range_serialization.cpp +++ b/torch/csrc/jit/serialization/source_range_serialization.cpp @@ -68,7 +68,7 @@ std::shared_ptr SourceRangeDeserializer::deserialize_source( const auto& textIndex = tup_elems[0].toIntList(); int64_t fnameIndex = tup_elems[1].toInt(); int64_t starting_line_no_ = tup_elems[2].toInt(); - c10::optional filename = c10::nullopt; + std::optional filename = c10::nullopt; TORCH_CHECK( (uint64_t)fnameIndex < text_table_.size(), @@ -88,7 +88,7 @@ std::shared_ptr SourceRangeDeserializer::deserialize_source( source = std::make_shared(str_cord, filename, starting_line_no_); } else { std::string text_ = 
tup_elems[0].toStringRef(); - c10::optional filename_ = + std::optional filename_ = tup_elems[1].toOptional(); int64_t starting_line_no_ = tup_elems[2].toInt(); source = std::make_shared( @@ -229,7 +229,7 @@ void ConcreteSourceRangeUnpickler::unpickle() { } } -c10::optional ConcreteSourceRangeUnpickler:: +std::optional ConcreteSourceRangeUnpickler:: findSourceRangeThatGenerated(const SourceRange& range) { unpickle(); diff --git a/torch/csrc/jit/serialization/source_range_serialization.h b/torch/csrc/jit/serialization/source_range_serialization.h index bbfd533cd1789..044e9655a9ea1 100644 --- a/torch/csrc/jit/serialization/source_range_serialization.h +++ b/torch/csrc/jit/serialization/source_range_serialization.h @@ -55,7 +55,7 @@ class SourceRangeDeserializer { class SourceRangeUnpickler { public: - virtual c10::optional findSourceRangeThatGenerated( + virtual std::optional findSourceRangeThatGenerated( const SourceRange& range) = 0; virtual ~SourceRangeUnpickler() = default; diff --git a/torch/csrc/jit/serialization/source_range_serialization_impl.h b/torch/csrc/jit/serialization/source_range_serialization_impl.h index 2b7cd5a14ba92..9b00956ccd048 100644 --- a/torch/csrc/jit/serialization/source_range_serialization_impl.h +++ b/torch/csrc/jit/serialization/source_range_serialization_impl.h @@ -12,7 +12,7 @@ class ConcreteSourceRangeUnpickler : public SourceRangeUnpickler { public: ConcreteSourceRangeUnpickler(at::DataPtr&& data, size_t size); - c10::optional findSourceRangeThatGenerated( + std::optional findSourceRangeThatGenerated( const SourceRange& range) override; private: diff --git a/torch/csrc/jit/serialization/unpickler.cpp b/torch/csrc/jit/serialization/unpickler.cpp index 26fa21575368d..ee5793b14856a 100644 --- a/torch/csrc/jit/serialization/unpickler.cpp +++ b/torch/csrc/jit/serialization/unpickler.cpp @@ -822,7 +822,7 @@ void Unpickler::readGlobal( // like the other branches here because no REDUCE or BUILD will // be called on this value. Instead, we just put it on the stack // and return early - c10::optional scalar_type; + std::optional scalar_type; #define CHECK_SCALAR(_, name) \ if (class_name == #name "Storage") { \ scalar_type = c10::k##name; \ @@ -834,7 +834,7 @@ void Unpickler::readGlobal( return; } - c10::optional qscheme; + std::optional qscheme; for (int i = 0; i < at::COMPILE_TIME_NUM_QSCHEMES; ++i) { if (class_name == toString(static_cast(i))) { qscheme = static_cast(i); diff --git a/torch/csrc/jit/serialization/unpickler.h b/torch/csrc/jit/serialization/unpickler.h index bc980bf90522b..eed216455f3e2 100644 --- a/torch/csrc/jit/serialization/unpickler.h +++ b/torch/csrc/jit/serialization/unpickler.h @@ -68,7 +68,7 @@ class TORCH_API Unpickler { TypeResolver type_resolver, ObjLoader obj_loader, std::function read_record, - c10::optional device, + std::optional device, bool use_storage_device = false, TypeParserT type_parser = defaultTypeParser, std::shared_ptr storage_context = nullptr) @@ -178,7 +178,7 @@ class TORCH_API Unpickler { IValue empty_tuple_; std::function read_record_; - c10::optional device_; + std::optional device_; // When set to true, Unpickler will ignore the pickled device and use the // device of the DataPtr returned by the read_record_ function. The default // value of this flag is false. 
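The import/load entry points changed above keep their existing call patterns after the rename; only the spelling of the optional device argument moves from c10::optional<c10::Device> to std::optional<c10::Device>, as the import.h declarations earlier in this patch show. A minimal usage sketch, assuming libtorch's torch/script.h is available and using a hypothetical archive path "model.pt":

#include <optional>
#include <torch/script.h>  // assumption: provides torch::jit::load, torch::jit::Module, c10::Device

int main() {
  // No device override: tensors keep the devices recorded in the archive.
  torch::jit::Module m = torch::jit::load("model.pt");

  // Explicit override: an engaged std::optional<c10::Device> (previously spelled
  // c10::optional<c10::Device>) asks the deserializer to place tensors on that device.
  std::optional<c10::Device> device = c10::Device(c10::kCPU);
  torch::jit::Module m_cpu = torch::jit::load("model.pt", device);
  return 0;
}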
diff --git a/torch/csrc/jit/tensorexpr/codegen.cpp b/torch/csrc/jit/tensorexpr/codegen.cpp index 53754aab7c0d6..e1464d0efc3ec 100644 --- a/torch/csrc/jit/tensorexpr/codegen.cpp +++ b/torch/csrc/jit/tensorexpr/codegen.cpp @@ -95,7 +95,7 @@ void CodeGen::call_with_numel(void** args, int64_t numel) { false, "This codegen backend does not implement call_with_numel"); } -static c10::optional bufSize(BufPtr buf) { +static std::optional bufSize(BufPtr buf) { size_t size = elementSize(buf->dtype().scalar_type()) * buf->dtype().lanes(); for (auto& d : buf->dims()) { if (!d->isConstant()) { diff --git a/torch/csrc/jit/tensorexpr/codegen.h b/torch/csrc/jit/tensorexpr/codegen.h index fdcf3425e3abc..42db25c26ea49 100644 --- a/torch/csrc/jit/tensorexpr/codegen.h +++ b/torch/csrc/jit/tensorexpr/codegen.h @@ -85,10 +85,10 @@ class TORCH_API CodeGen { virtual at::Tensor empty_strided( c10::IntArrayRef size, c10::IntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt) { + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt) { return at::empty_strided( size, stride, dtype_opt, layout_opt, device_opt, pin_memory_opt); } diff --git a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp index 07626232399e4..602bc49302c53 100644 --- a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp @@ -1275,10 +1275,10 @@ void CudaCodeGen::call(const std::vector& args) { at::Tensor CudaCodeGen::empty_strided( c10::IntArrayRef size, c10::IntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt) { + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt) { c10::DeviceGuard device_guard(device_opt.value()); return at::native::empty_strided_cuda( size, stride, dtype_opt, layout_opt, device_opt, pin_memory_opt); diff --git a/torch/csrc/jit/tensorexpr/cuda_codegen.h b/torch/csrc/jit/tensorexpr/cuda_codegen.h index 22de1ce32d00f..74f3d4ec7835b 100644 --- a/torch/csrc/jit/tensorexpr/cuda_codegen.h +++ b/torch/csrc/jit/tensorexpr/cuda_codegen.h @@ -235,10 +235,10 @@ class TORCH_CUDA_CU_API CudaCodeGen : public CodeGen { at::Tensor empty_strided( c10::IntArrayRef size, c10::IntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt) override; + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt) override; const std::vector& gpu_block_extents() const { return cuda_analysis_->gpu_block_extents(); diff --git a/torch/csrc/jit/tensorexpr/eval.cpp b/torch/csrc/jit/tensorexpr/eval.cpp index e5a59ae33ef26..be1057e21c3c7 100644 --- a/torch/csrc/jit/tensorexpr/eval.cpp +++ b/torch/csrc/jit/tensorexpr/eval.cpp @@ -1300,7 +1300,7 @@ InterpValue SimpleIREvaluator::value() const { return impl_->value(); } -c10::optional evalInt(ExprPtr e) { +std::optional evalInt(ExprPtr e) { try { return ExprEval(cast(ExprHandle(e))) .value(); diff --git a/torch/csrc/jit/tensorexpr/eval.h b/torch/csrc/jit/tensorexpr/eval.h index 64ac1edf8f188..9bbea1bd28a43 100644 --- a/torch/csrc/jit/tensorexpr/eval.h +++ b/torch/csrc/jit/tensorexpr/eval.h @@ -307,7 +307,7 @@ class ExprEval { // Evaluates the given expression and returns an int64_t value if the result of // the given expression is 
int64_t. -c10::optional evalInt(ExprPtr e); +std::optional evalInt(ExprPtr e); // Substitutes the given vars with their corresponding expressions in the input // expression. diff --git a/torch/csrc/jit/tensorexpr/expr.cpp b/torch/csrc/jit/tensorexpr/expr.cpp index cffc5e45dbf46..bf3cc13ccb39f 100644 --- a/torch/csrc/jit/tensorexpr/expr.cpp +++ b/torch/csrc/jit/tensorexpr/expr.cpp @@ -415,7 +415,7 @@ Buf::Buf( std::vector dims, Dtype dtype, ExprPtr initializer, - c10::optional> strides, + std::optional> strides, ExprPtr qscale, ExprPtr qzero) : ExprNodeBase(dtype, kPrimitive), @@ -452,11 +452,11 @@ BufHandle Buf::make( const std::string& name_hint, const std::vector& dims, Dtype dtype, - c10::optional initializer, - c10::optional> strides, - c10::optional qscale, - c10::optional qzero) { - c10::optional> opt_strides; + std::optional initializer, + std::optional> strides, + std::optional qscale, + std::optional qzero) { + std::optional> opt_strides; if (strides) { opt_strides = ExprHandleVectorToExprVector(*strides); } diff --git a/torch/csrc/jit/tensorexpr/expr.h b/torch/csrc/jit/tensorexpr/expr.h index 1a0cc57875d19..8c8de89975750 100644 --- a/torch/csrc/jit/tensorexpr/expr.h +++ b/torch/csrc/jit/tensorexpr/expr.h @@ -207,10 +207,10 @@ class TORCH_API Buf : public ExprNode { const std::string& name_hint, const std::vector& dims, Dtype dtype, - c10::optional initializer = c10::nullopt, - c10::optional> strides = c10::nullopt, - c10::optional qscale = c10::nullopt, - c10::optional qzero = c10::nullopt); + std::optional initializer = c10::nullopt, + std::optional> strides = c10::nullopt, + std::optional qscale = c10::nullopt, + std::optional qzero = c10::nullopt); // TODO: unique_name VarPtr base_handle() const { @@ -232,7 +232,7 @@ class TORCH_API Buf : public ExprNode { const std::vector& dims, Dtype dtype, ExprPtr initializer = nullptr, - c10::optional> strides = c10::nullopt, + std::optional> strides = c10::nullopt, ExprPtr qscale = nullptr, ExprPtr qzero = nullptr) : Buf(alloc(name_hint, kHandle), @@ -248,7 +248,7 @@ class TORCH_API Buf : public ExprNode { std::vector dims, Dtype dtype, ExprPtr initializer = nullptr, - c10::optional> strides = c10::nullopt, + std::optional> strides = c10::nullopt, ExprPtr qscale = nullptr, ExprPtr qzero = nullptr); diff --git a/torch/csrc/jit/tensorexpr/external_functions.cpp b/torch/csrc/jit/tensorexpr/external_functions.cpp index c593ab80e811c..a3146ccfaff55 100644 --- a/torch/csrc/jit/tensorexpr/external_functions.cpp +++ b/torch/csrc/jit/tensorexpr/external_functions.cpp @@ -80,7 +80,7 @@ std::vector constructTensors( int64_t* buf_dims, int64_t* buf_strides, int8_t* buf_dtypes, - c10::optional>> qdataArg) { + std::optional>> qdataArg) { std::vector buf_data_vec; std::vector> buf_dims_vec; std::vector> buf_strides_vec; @@ -123,7 +123,7 @@ std::vector constructTensors( } } else { // handle quantized - std::vector> qdata(bufs_num, c10::nullopt); + std::vector> qdata(bufs_num, c10::nullopt); for (const auto& qd : *qdataArg) { qdata[qd.first] = qd.second; } @@ -172,7 +172,7 @@ static std::vector constructTensors( int64_t* buf_strides, int8_t* buf_dtypes, std::vector> qdata) { - c10::optional>> opt = std::move(qdata); + std::optional>> opt = std::move(qdata); return constructTensors( bufs_num, buf_data, buf_ranks, buf_dims, buf_strides, buf_dtypes, opt); } @@ -184,7 +184,7 @@ std::vector constructTensors2( int64_t* buf_dims, int64_t* buf_strides, int8_t* buf_dtypes, - c10::optional>> qdataArg, + std::optional>> qdataArg, size_t bufs_out_num) { std::vector 
buf_data_vec; std::vector> buf_dims_vec; @@ -233,7 +233,7 @@ std::vector constructTensors2( } } else { // handle quantized - std::vector> qdata(bufs_in_num, c10::nullopt); + std::vector> qdata(bufs_in_num, c10::nullopt); for (const auto& qd : *qdataArg) { qdata[qd.first - bufs_out_num] = qd.second; } @@ -283,7 +283,7 @@ static std::vector constructTensors2( int8_t* buf_dtypes, std::vector> qdata, size_t bufs_out_num = 0u) { - c10::optional>> opt = std::move(qdata); + std::optional>> opt = std::move(qdata); return constructTensors2( bufs_in_num, buf_data, @@ -331,15 +331,15 @@ static at::Tensor quantized_mul_scalar(const at::Tensor& x, double scalar) { static at::Tensor quantized_cat( const c10::List& qxs, int64_t dim, - c10::optional scale, - c10::optional zero) { + std::optional scale, + std::optional zero) { const auto op = c10::Dispatcher::singleton() .findSchemaOrThrow("quantized::cat", "") .typed const&, int64_t, - c10::optional, - c10::optional)>(); + std::optional, + std::optional)>(); return op.redispatch( c10::DispatchKeySet({c10::DispatchKey::QuantizedCPU}), qxs, @@ -972,7 +972,7 @@ void nnc_aten_upsample_nearest2d( const int64_t x_qzero = extra_args[1]; const int64_t x_qdtype = extra_args[2]; const auto is_quantized = x_qdtype != -1; - c10::optional>> qdata; + std::optional>> qdata; if (is_quantized) { qdata = { {1u, @@ -992,9 +992,9 @@ void nnc_aten_upsample_nearest2d( auto r = at::upsample_nearest2d( x, (output_size_h != -1) - ? c10::optional({output_size_h, output_size_w}) + ? std::optional({output_size_h, output_size_w}) : c10::nullopt, - (scale_factor_h != -1.f) ? c10::optional>( + (scale_factor_h != -1.f) ? std::optional>( {scale_factor_h, scale_factor_w}) : c10::nullopt); memcpy(buf_data[0], r.const_data_ptr(), r.element_size() * r.numel()); @@ -1015,7 +1015,7 @@ void nnc_aten_upsample_nearest2d_out( const int64_t x_qzero = extra_args[1]; const int64_t x_qdtype = extra_args[2]; const auto is_quantized = x_qdtype != -1; - c10::optional>> qdata; + std::optional>> qdata; if (is_quantized) { qdata = { {1u, @@ -1042,9 +1042,9 @@ void nnc_aten_upsample_nearest2d_out( auto r = at::upsample_nearest2d( x, (output_size_h != -1) - ? c10::optional({output_size_h, output_size_w}) + ? std::optional({output_size_h, output_size_w}) : c10::nullopt, - (scale_factor_h != -1.f) ? c10::optional>( + (scale_factor_h != -1.f) ? 
std::optional>( {scale_factor_h, scale_factor_w}) : c10::nullopt); buf_data[0] = r.data_ptr(); diff --git a/torch/csrc/jit/tensorexpr/external_functions.h b/torch/csrc/jit/tensorexpr/external_functions.h index 627d67c934d59..1fd90a3f056b8 100644 --- a/torch/csrc/jit/tensorexpr/external_functions.h +++ b/torch/csrc/jit/tensorexpr/external_functions.h @@ -74,7 +74,7 @@ std::vector constructTensors( int64_t* buf_dims, int64_t* buf_strides, int8_t* buf_dtypes, - c10::optional>> qdataArg = + std::optional>> qdataArg = c10::nullopt); std::vector constructTensors2( @@ -84,7 +84,7 @@ std::vector constructTensors2( int64_t* buf_dims, int64_t* buf_strides, int8_t* buf_dtypes, - c10::optional>> qdataArg = + std::optional>> qdataArg = c10::nullopt, size_t bufs_out_num = 0); diff --git a/torch/csrc/jit/tensorexpr/graph_opt.cpp b/torch/csrc/jit/tensorexpr/graph_opt.cpp index c8f06fea063fd..01511b2b4d8c5 100644 --- a/torch/csrc/jit/tensorexpr/graph_opt.cpp +++ b/torch/csrc/jit/tensorexpr/graph_opt.cpp @@ -184,7 +184,7 @@ bool OptimizeCat(const std::shared_ptr& graph) { void annotateInputShapes( const std::shared_ptr& graph, - const std::vector>& example_inputs) { + const std::vector>& example_inputs) { TORCH_INTERNAL_ASSERT( graph->inputs().size() == example_inputs.size(), buildErrorMessage("Given inputs do not match the fuser graph inputs.")); @@ -304,8 +304,8 @@ bool isGraphCompilable(const std::shared_ptr& graph) { static void fixupTypeInfoForValue( Value* v, - c10::optional scalar_type, - c10::optional device) { + std::optional scalar_type, + std::optional device) { Node* n = v->node(); auto const& t = v->type(); if (t->kind() != TypeKind::TensorType) { @@ -339,8 +339,8 @@ static void fixupTypeInfoForValue( v->setType(new_tt); } -static c10::optional inferScalarType(Node* n) { - c10::optional scalar_type; +static std::optional inferScalarType(Node* n) { + std::optional scalar_type; for (auto v : n->inputs()) { auto const& t = v->type(); if (t->kind() == TypeKind::TensorType) { @@ -358,8 +358,8 @@ static c10::optional inferScalarType(Node* n) { return scalar_type; } -static c10::optional inferDevice(Node* n) { - c10::optional device; +static std::optional inferDevice(Node* n) { + std::optional device; for (auto v : n->inputs()) { auto const& t = v->type(); if (t->kind() == TypeKind::TensorType) { @@ -394,8 +394,8 @@ void fixupMissingShapeInfo(const std::shared_ptr& graph) { } for (auto n : graph->nodes()) { - c10::optional scalar_type = inferScalarType(n); - c10::optional device = inferDevice(n); + std::optional scalar_type = inferScalarType(n); + std::optional device = inferDevice(n); for (auto v : n->outputs()) { fixupTypeInfoForValue(v, scalar_type, device); diff --git a/torch/csrc/jit/tensorexpr/graph_opt.h b/torch/csrc/jit/tensorexpr/graph_opt.h index 1180d0ac438b9..5bd2ec8600931 100644 --- a/torch/csrc/jit/tensorexpr/graph_opt.h +++ b/torch/csrc/jit/tensorexpr/graph_opt.h @@ -60,7 +60,7 @@ bool OptimizeCat(const std::shared_ptr& graph); TORCH_API void annotateInputShapes( const std::shared_ptr& graph, - const std::vector>& example_inputs); + const std::vector>& example_inputs); TORCH_API std::shared_ptr removeUnusedSelfArgument( const std::shared_ptr& graph); TORCH_API std::shared_ptr removeGraphOutput( diff --git a/torch/csrc/jit/tensorexpr/ir.h b/torch/csrc/jit/tensorexpr/ir.h index 1ab21c83ef183..f35bafb332eaf 100644 --- a/torch/csrc/jit/tensorexpr/ir.h +++ b/torch/csrc/jit/tensorexpr/ir.h @@ -361,7 +361,7 @@ ExprPtr immLike(const ExprHandle& e, T v) { return immLike(e.node(), v); } -inline 
c10::optional intValue(const ExprPtr& e) { +inline std::optional intValue(const ExprPtr& e) { #define TYPE_CASE(Type, Name) \ if (auto v = to(e)) { \ return v->value(); \ @@ -371,7 +371,7 @@ inline c10::optional intValue(const ExprPtr& e) { return c10::nullopt; } -inline c10::optional intValue(const ExprHandle& e) { +inline std::optional intValue(const ExprHandle& e) { return intValue(e.node()); } diff --git a/torch/csrc/jit/tensorexpr/ir_simplifier.cpp b/torch/csrc/jit/tensorexpr/ir_simplifier.cpp index 4ce640bb8a739..afb7aefdda652 100644 --- a/torch/csrc/jit/tensorexpr/ir_simplifier.cpp +++ b/torch/csrc/jit/tensorexpr/ir_simplifier.cpp @@ -1867,7 +1867,7 @@ class ModRound { ExprPtr mod_divisor; }; -static c10::optional isModRound(TermPtr e) { +static std::optional isModRound(TermPtr e) { DivPtr div{nullptr}; ModPtr mod{nullptr}; ExprPtr denom{nullptr}; diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index a360762f5bf9c..50578a0414572 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -128,9 +128,9 @@ bool& getOptConditionals() { return opt_conditionals; } -c10::optional pickDeviceType( +std::optional pickDeviceType( const at::ArrayRef& inputs) { - c10::optional device = c10::nullopt; + std::optional device = c10::nullopt; for (auto const& input : inputs) { auto tt = input->type()->cast(); if (tt && tt->device()) { @@ -143,9 +143,9 @@ c10::optional pickDeviceType( return device; } -static c10::optional pickDeviceType( +static std::optional pickDeviceType( const std::shared_ptr& graph) { - c10::optional device = c10::nullopt; + std::optional device = c10::nullopt; for (auto const& node : graph->nodes()) { for (auto const& input : node->inputs()) { if (auto tt = input->type()->cast()) { @@ -179,7 +179,7 @@ static c10::optional pickDeviceType( // If v is a Tensor with concretely-known sizes and dtype, return them, else // nullopt. -static c10::optional getTensorInfoJit(torch::jit::Value* v) { +static std::optional getTensorInfoJit(torch::jit::Value* v) { auto const& it = v->type()->cast(); c10::ScalarType dtype = c10::ScalarType::Float; @@ -527,7 +527,7 @@ std::vector TensorExprKernel::sizesForValue( throw malformed_input(msg); } -static c10::optional findDtypeForValue(const torch::jit::Value* v) { +static std::optional findDtypeForValue(const torch::jit::Value* v) { if (v->type()->kind() == TypeKind::TensorType) { auto tt = v->type()->cast(); if (tt->scalarType()) { @@ -707,7 +707,7 @@ static void fuseAllLoops(StmtPtr st) { } // Compute the trip count of a loop if it is a constant. 
-static c10::optional tripCount(ForPtr loop) { +static std::optional tripCount(ForPtr loop) { auto tc = IRSimplifier::simplify( cast(ExprHandle(loop->stop()) - ExprHandle(loop->start()))); if (auto val = to(tc.node())) { @@ -958,7 +958,7 @@ std::string TensorExprKernel::getCodeGenName(BackendType backendType) { } template -static bool isValidPrimProperty(const c10::optional& a, T b) { +static bool isValidPrimProperty(const std::optional& a, T b) { return !a.has_value() || *a == b; } diff --git a/torch/csrc/jit/tensorexpr/kernel.h b/torch/csrc/jit/tensorexpr/kernel.h index 45658beb750e9..d7c737d8f8f2c 100644 --- a/torch/csrc/jit/tensorexpr/kernel.h +++ b/torch/csrc/jit/tensorexpr/kernel.h @@ -274,10 +274,10 @@ class TORCH_API TensorExprKernel { const std::vector& interm_bufs); struct UnpackedTensorOptions { - c10::optional dtype; - c10::optional layout; - c10::optional device; - c10::optional pinned_memory; + std::optional dtype; + std::optional layout; + std::optional device; + std::optional pinned_memory; UnpackedTensorOptions(const c10::TensorOptions& opts) : dtype(c10::optTypeMetaToScalarType(opts.dtype_opt())), @@ -370,7 +370,7 @@ TORCH_API bool setFallbackAllowed(bool value); TORCH_API bool& getCatWoConditionals(); TORCH_API bool& getOptConditionals(); -TORCH_API c10::optional pickDeviceType( +TORCH_API std::optional pickDeviceType( const at::ArrayRef& inputs); bool isContiguous( diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp index fd7f0818996c9..dec03637847e2 100644 --- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp @@ -84,16 +84,16 @@ C10_DEFINE_bool( namespace torch::jit::tensorexpr { -c10::optional& LLVMTargetTriple() { - static c10::optional triple = c10::nullopt; +std::optional& LLVMTargetTriple() { + static std::optional triple = c10::nullopt; return triple; } -c10::optional& LLVMTargetCPU() { - static c10::optional cpu = c10::nullopt; +std::optional& LLVMTargetCPU() { + static std::optional cpu = c10::nullopt; return cpu; } -c10::optional& LLVMTargetAttrs() { - static c10::optional attrs = c10::nullopt; +std::optional& LLVMTargetAttrs() { + static std::optional attrs = c10::nullopt; return attrs; } bool& LLVMAOTWorkflow() { @@ -306,9 +306,9 @@ class LLVMCodeGenImpl : public IRVisitor { at::Device device, Dtype dtype, std::string kernel_func_name, - c10::optional triple, - c10::optional cpu, - c10::optional attrs); + std::optional triple, + std::optional cpu, + std::optional attrs); ~LLVMCodeGenImpl() = default; llvm::JITTargetAddress getKernelAddress() const; @@ -397,9 +397,9 @@ LLVMCodeGen::LLVMCodeGen( at::Device device, const std::string& kernel_func_name, Dtype dtype, - c10::optional triple, - c10::optional cpu, - c10::optional attrs) + std::optional triple, + std::optional cpu, + std::optional attrs) : CodeGen(stmt, args, device, kernel_func_name) { impl_ = std::make_unique( this->stmt(), @@ -446,10 +446,10 @@ void LLVMCodeGen::call(const std::vector& args) { at::Tensor LLVMCodeGen::empty_strided( c10::IntArrayRef size, c10::IntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt) { + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt) { return at::native::empty_strided_cpu( size, stride, dtype_opt, layout_opt, device_opt, pin_memory_opt); } @@ -489,9 +489,9 @@ LLVMCodeGenImpl::LLVMCodeGenImpl( at::Device device, Dtype dtype, 
std::string kernel_func_name, - c10::optional triple, - c10::optional cpu, - c10::optional attrs) + std::optional triple, + std::optional cpu, + std::optional attrs) : context_(std::make_unique()), irb_(getContext()), kernel_func_name_(std::move(kernel_func_name)), diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.h b/torch/csrc/jit/tensorexpr/llvm_codegen.h index 7ab506fa8fe1e..74271fa879f3d 100644 --- a/torch/csrc/jit/tensorexpr/llvm_codegen.h +++ b/torch/csrc/jit/tensorexpr/llvm_codegen.h @@ -27,9 +27,9 @@ class TORCH_API LLVMCodeGen : public CodeGen { at::Device device = at::kCPU, const std::string& kernel_func_name = "func", Dtype dtype = kInt, - c10::optional triple = c10::nullopt, - c10::optional cpu = c10::nullopt, - c10::optional attrs = c10::nullopt); + std::optional triple = c10::nullopt, + std::optional cpu = c10::nullopt, + std::optional attrs = c10::nullopt); explicit LLVMCodeGen(StmtPtr stmt); LLVMCodeGen() = delete; @@ -48,10 +48,10 @@ class TORCH_API LLVMCodeGen : public CodeGen { at::Tensor empty_strided( c10::IntArrayRef size, c10::IntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt) override; + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt) override; template T value() { @@ -126,14 +126,14 @@ struct TORCH_API LLVMCodeGenBuilder { at::Device device_ = at::kCPU; std::string kernelFuncName_ = "func"; Dtype dtype_ = kInt; - c10::optional triple_ = c10::nullopt; - c10::optional cpu_ = c10::nullopt; - c10::optional attrs_ = c10::nullopt; + std::optional triple_ = c10::nullopt; + std::optional cpu_ = c10::nullopt; + std::optional attrs_ = c10::nullopt; }; -TORCH_API c10::optional& LLVMTargetTriple(); -TORCH_API c10::optional& LLVMTargetCPU(); -TORCH_API c10::optional& LLVMTargetAttrs(); +TORCH_API std::optional& LLVMTargetTriple(); +TORCH_API std::optional& LLVMTargetCPU(); +TORCH_API std::optional& LLVMTargetAttrs(); TORCH_API bool& LLVMAOTWorkflow(); } // namespace tensorexpr diff --git a/torch/csrc/jit/tensorexpr/llvm_jit.cpp b/torch/csrc/jit/tensorexpr/llvm_jit.cpp index 71f4fed3db3e7..37a4b8db6bb27 100644 --- a/torch/csrc/jit/tensorexpr/llvm_jit.cpp +++ b/torch/csrc/jit/tensorexpr/llvm_jit.cpp @@ -67,8 +67,8 @@ static llvm::SubtargetFeatures getHostSubtargetFeatures() { // Create a JTMB using the host's triple. CPU and attrs default to the host // unless they are supplied. static llvm::orc::JITTargetMachineBuilder makeJTMBFromHost( - c10::optional cpu, - c10::optional attrs) { + std::optional cpu, + std::optional attrs) { llvm::orc::JITTargetMachineBuilder JTMB( (llvm::Triple(llvm::sys::getProcessTriple()))); JTMB.setCPU(cpu.value_or(llvm::sys::getHostCPUName().str())); @@ -85,8 +85,8 @@ static llvm::orc::JITTargetMachineBuilder makeJTMBFromHost( // Create a JTMB using a given triple. Do not set cpu or attrs if not supplied. static llvm::orc::JITTargetMachineBuilder makeJTMBFromTriple( const std::string& triple, - c10::optional cpu, - c10::optional attrs) { + std::optional cpu, + std::optional attrs) { llvm::orc::JITTargetMachineBuilder JTMB((llvm::Triple(triple))); if (cpu) { JTMB.setCPU(*cpu); @@ -100,9 +100,9 @@ static llvm::orc::JITTargetMachineBuilder makeJTMBFromTriple( } static llvm::orc::JITTargetMachineBuilder makeTargetMachineBuilder( - c10::optional triple, - c10::optional cpu, - c10::optional attrs) { + std::optional triple, + std::optional cpu, + std::optional attrs) { auto JTMB = triple ? 
makeJTMBFromTriple(*triple, cpu, attrs) : makeJTMBFromHost(cpu, attrs); #if LLVM_VERSION_MAJOR >= 18 @@ -160,9 +160,9 @@ class TORCH_API PytorchLLVMJITImpl { public: PytorchLLVMJITImpl( - c10::optional triple, - c10::optional cpu, - c10::optional attrs) + std::optional triple, + std::optional cpu, + std::optional attrs) : TM(assertSuccess(makeTargetMachineBuilder(triple, cpu, attrs) .createTargetMachine())), LLJ(assertSuccess( @@ -241,9 +241,9 @@ class TORCH_API PytorchLLVMJITImpl { public: PytorchLLVMJITImpl( - c10::optional triple, - c10::optional cpu, - c10::optional attrs) + std::optional triple, + std::optional cpu, + std::optional attrs) : Resolver(createLegacyLookupResolver( ES, [this](const std::string& Name) -> JITSymbol { @@ -320,9 +320,9 @@ class TORCH_API PytorchLLVMJITImpl { #endif PytorchLLVMJIT::PytorchLLVMJIT( - c10::optional triple, - c10::optional cpu, - c10::optional attrs) + std::optional triple, + std::optional cpu, + std::optional attrs) : impl_(std::make_unique(triple, cpu, attrs)) {} PytorchLLVMJIT::~PytorchLLVMJIT() = default; diff --git a/torch/csrc/jit/tensorexpr/llvm_jit.h b/torch/csrc/jit/tensorexpr/llvm_jit.h index 4aca55a9abf47..98238e0043885 100644 --- a/torch/csrc/jit/tensorexpr/llvm_jit.h +++ b/torch/csrc/jit/tensorexpr/llvm_jit.h @@ -51,9 +51,9 @@ class PytorchLLVMJITImpl; class TORCH_API PytorchLLVMJIT { public: PytorchLLVMJIT( - c10::optional triple, - c10::optional cpu, - c10::optional attrs); + std::optional triple, + std::optional cpu, + std::optional attrs); ~PytorchLLVMJIT(); void addModule(std::unique_ptr M, std::unique_ptr C); diff --git a/torch/csrc/jit/tensorexpr/lowerings.cpp b/torch/csrc/jit/tensorexpr/lowerings.cpp index 79f0c59e59b39..1518e06376c14 100644 --- a/torch/csrc/jit/tensorexpr/lowerings.cpp +++ b/torch/csrc/jit/tensorexpr/lowerings.cpp @@ -55,7 +55,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { auto sub_lambda = [](const ExprHandle& lhs, const ExprHandle& rhs) { // NB: sub isn't supported on boolean, no need to promote to integer. 
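The lowering hunks in this file all make the same mechanical change: only the optional wrapper in the lowering signature switches from c10::optional to std::optional, while existing c10::nullopt initializers and the kFloat fallback for a missing output type are left untouched (presumably because the c10 names alias the std equivalents at this point). Below is a minimal standalone sketch of that calling pattern; describeLowering and its std::string/int placeholder types are hypothetical stand-ins for the real ArgValue/ExprHandle/ScalarType signatures, not code from this patch.

#include <iostream>
#include <optional>
#include <string>
#include <vector>

// Hypothetical stand-in for an NNC-style lowering: the output dtype is an
// optional parameter that falls back to "float" when the caller passes
// nullopt, mirroring the `Dtype dtype = kFloat; if (outputType) ...` pattern
// seen in the lowerings above.
std::string describeLowering(
    const std::vector<int>& outputShape,
    const std::optional<std::string>& outputType) {
  return "rank " + std::to_string(outputShape.size()) + ", dtype " +
      outputType.value_or("float");
}

int main() {
  std::cout << describeLowering({2, 3}, std::nullopt) << "\n"; // falls back to float
  std::cout << describeLowering({2, 3}, "double") << "\n";     // explicit dtype
  return 0;
}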
@@ -86,7 +86,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_mul", @@ -108,7 +108,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, \ const std::vector& outputShape, \ const std::vector& outputStrides, \ - const c10::optional& outputType, \ + const std::optional& outputType, \ at::Device device) { \ return computeScalar( \ "aten_#op_name", \ @@ -131,7 +131,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeScalar( "aten_div", @@ -155,7 +155,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, \ const std::vector& outputShape, \ const std::vector& outputStrides, \ - const c10::optional& outputType, \ + const std::optional& outputType, \ at::Device device) { \ return computeScalar( \ "aten_#op_name", \ @@ -179,7 +179,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, \ const std::vector& outputShape, \ const std::vector& outputStrides, \ - const c10::optional& outputType, \ + const std::optional& outputType, \ at::Device device) { \ return computeScalar( \ "aten_#op_name", \ @@ -204,7 +204,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, \ const std::vector& outputShape, \ const std::vector& outputStrides, \ - const c10::optional& outputType, \ + const std::optional& outputType, \ at::Device device) { \ return computeScalar( \ "aten_#op_name", \ @@ -225,7 +225,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_div", @@ -245,7 +245,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_and", @@ -264,7 +264,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_or", @@ -283,7 +283,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_xor", @@ -302,7 +302,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_lshift", @@ -321,7 +321,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_rshift", @@ -340,7 +340,7 @@ int nnc_lowerings_lazy_registration() { 
[](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_eq", @@ -359,7 +359,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_ne", @@ -378,7 +378,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_ge", @@ -397,7 +397,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_gt", @@ -416,7 +416,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_le", @@ -435,7 +435,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_lt", @@ -453,7 +453,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_min", @@ -471,7 +471,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_max", @@ -490,7 +490,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeThreeOperand( "aten_masked_fill", @@ -513,7 +513,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { bool noMin = false; bool noMax = false; @@ -561,7 +561,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeFourOperand( "aten_addcmul", @@ -580,7 +580,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { // check if the activation is quantized const BufHandle& x = std::get(inputs[0]); @@ -604,7 +604,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const 
std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_silu", @@ -620,7 +620,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_reciprocal", @@ -636,7 +636,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_neg", @@ -652,7 +652,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_isnan", @@ -673,7 +673,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { auto A = std::get(inputs[0]); if (A.node()->qscale()) { @@ -697,7 +697,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_leaky_relu", @@ -719,7 +719,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_relu6", @@ -739,7 +739,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { const auto& kApproximate = std::get(inputs[1]); std::vector operands = {inputs.front()}; @@ -787,7 +787,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_log", @@ -805,7 +805,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_log10", @@ -823,7 +823,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_log1p", @@ -841,7 +841,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_log2", @@ -859,7 +859,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const 
c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_exp", @@ -877,7 +877,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_expm1", @@ -895,7 +895,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_erf", @@ -913,7 +913,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_erfc", @@ -931,7 +931,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_cos", @@ -949,7 +949,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_sin", @@ -967,7 +967,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_tan", @@ -985,7 +985,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { const BufHandle& rhs = std::get(inputs[1]); auto dtype = rhs.dtype(); @@ -1005,7 +1005,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_pow", @@ -1050,7 +1050,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_fmod", @@ -1069,7 +1069,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeThreeOperand( "aten_lerp", @@ -1089,7 +1089,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { auto imodImpl = [](const ExprHandle& lhs, const ExprHandle& rhs) { return Mod::make(lhs, rhs); @@ -1137,7 +1137,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + 
const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_acos", @@ -1155,7 +1155,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_asin", @@ -1173,7 +1173,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_cosh", @@ -1191,7 +1191,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_sinh", @@ -1209,7 +1209,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_atan", @@ -1227,7 +1227,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_atan2", @@ -1247,7 +1247,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_tanh", @@ -1265,7 +1265,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeThreeOperand( "aten_hardtanh", @@ -1286,7 +1286,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeThreeOperand( "aten_softplus", @@ -1314,7 +1314,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_mish", @@ -1333,7 +1333,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeFourOperand( "aten_elu", @@ -1366,7 +1366,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_hardsigmoid", @@ -1387,7 +1387,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return 
computeOneOperand( "aten_hardswish", @@ -1410,7 +1410,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTwoOperand( "aten_hardshrink", @@ -1433,7 +1433,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_sqrt", @@ -1451,7 +1451,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_rsqrt", @@ -1469,7 +1469,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_abs", @@ -1488,7 +1488,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeSign(inputs, outputShape); }); RegisterNNCLoweringsFunction aten_ceil( @@ -1496,7 +1496,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_ceil", @@ -1512,7 +1512,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_floor", @@ -1528,7 +1528,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_round", @@ -1544,7 +1544,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_trunc", @@ -1560,7 +1560,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_cast_float", @@ -1582,7 +1582,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { // see handling of aten::to in tensorexpr_fuser.cpp for why we only // need to handle the first input @@ -1604,7 +1604,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& 
outputType, at::Device device) { return computeThreeOperand( "aten_threshold", @@ -1628,7 +1628,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeConditionWithTwoOperand( "aten_where", @@ -1646,7 +1646,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_frac", @@ -1666,7 +1666,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "aten_lgamma", @@ -1684,7 +1684,7 @@ int nnc_lowerings_lazy_registration() { // {"aten::rand_like"}, // [](const std::vector& inputs, // const std::vector& outputShape, - // const c10::optional& outputType, + // const std::optional& outputType, // at::Device device) { // return computeOneOperand( // "aten_rand_like", @@ -1701,7 +1701,7 @@ int nnc_lowerings_lazy_registration() { // {"aten::slice"}, // [](const std::vector& inputs, // const std::vector& outputShape, - // const c10::optional& outputType, + // const std::optional& outputType, // at::Device device) { // return Compute( // "aten_slice", @@ -1723,7 +1723,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return Compute( "aten_unsqueeze", @@ -1757,7 +1757,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeTranspose( {inputs[0], (int64_t)1, (int64_t)0}, @@ -1774,7 +1774,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { auto A = std::get(inputs[0]); // Trivial case of 0-dim tensors: just a copy of the input @@ -1848,7 +1848,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeSoftmax(inputs, outputShape, outputStrides, false); }); @@ -1858,7 +1858,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeSoftmax(inputs, outputShape, outputStrides, true); }); @@ -1892,7 +1892,7 @@ int nnc_lowerings_lazy_registration() { [](const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { auto add_lambda = [](const ExprHandle& lhs, const ExprHandle& rhs) { return boolToInteger(lhs) + boolToInteger(rhs); diff --git a/torch/csrc/jit/tensorexpr/lowerings.h 
b/torch/csrc/jit/tensorexpr/lowerings.h index 6d8b2c433ae37..da22899ba28ce 100644 --- a/torch/csrc/jit/tensorexpr/lowerings.h +++ b/torch/csrc/jit/tensorexpr/lowerings.h @@ -32,7 +32,7 @@ using NNCLoweringFunction = std::function&, const std::vector&, const std::vector&, - const c10::optional&, + const std::optional&, at::Device)>; TORCH_API FunctionSchemaMap& getNNCLoweringRegistry(); diff --git a/torch/csrc/jit/tensorexpr/operators/conv2d.cpp b/torch/csrc/jit/tensorexpr/operators/conv2d.cpp index 3f29dad4c13f3..bdf313f0ad051 100644 --- a/torch/csrc/jit/tensorexpr/operators/conv2d.cpp +++ b/torch/csrc/jit/tensorexpr/operators/conv2d.cpp @@ -353,7 +353,7 @@ Tensor computeConv2d( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { Dtype dtype = kFloat; if (outputType) { @@ -401,7 +401,7 @@ Tensor computeConv1d( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { Dtype dtype = kFloat; if (outputType) { @@ -435,7 +435,7 @@ Tensor computePrepackedConv2dClampRun( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { Dtype dtype = kFloat; if (outputType) { @@ -454,7 +454,7 @@ Tensor computePrepackedLinearClampRun( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { Dtype dtype = kFloat; if (outputType) { @@ -473,7 +473,7 @@ Tensor computeMkldnnPrepackedConvRun( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { Dtype dtype = kFloat; if (outputType) { diff --git a/torch/csrc/jit/tensorexpr/operators/conv2d.h b/torch/csrc/jit/tensorexpr/operators/conv2d.h index 65902960192ab..f842a1350a551 100644 --- a/torch/csrc/jit/tensorexpr/operators/conv2d.h +++ b/torch/csrc/jit/tensorexpr/operators/conv2d.h @@ -74,31 +74,31 @@ Tensor computeConv2d( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); Tensor computeConv1d( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); Tensor computePrepackedConv2dClampRun( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); Tensor computePrepackedLinearClampRun( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); Tensor computeMkldnnPrepackedConvRun( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); } // namespace tensorexpr } // namespace jit diff --git a/torch/csrc/jit/tensorexpr/operators/matmul.cpp b/torch/csrc/jit/tensorexpr/operators/matmul.cpp index 
38b420a7aca1c..92c6c14519325 100644 --- a/torch/csrc/jit/tensorexpr/operators/matmul.cpp +++ b/torch/csrc/jit/tensorexpr/operators/matmul.cpp @@ -9,7 +9,7 @@ Tensor computeMatmul( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { Dtype dtype = kFloat; if (outputType) { @@ -56,7 +56,7 @@ Tensor computeAddMM( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { Dtype dtype = kFloat; if (outputType) { diff --git a/torch/csrc/jit/tensorexpr/operators/matmul.h b/torch/csrc/jit/tensorexpr/operators/matmul.h index 70f3f4bf7bf03..40ef3cfd9b619 100644 --- a/torch/csrc/jit/tensorexpr/operators/matmul.h +++ b/torch/csrc/jit/tensorexpr/operators/matmul.h @@ -10,13 +10,13 @@ Tensor computeMatmul( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); Tensor computeAddMM( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); } // namespace tensorexpr diff --git a/torch/csrc/jit/tensorexpr/operators/misc.cpp b/torch/csrc/jit/tensorexpr/operators/misc.cpp index c282787485ea4..70991f6db1f4c 100644 --- a/torch/csrc/jit/tensorexpr/operators/misc.cpp +++ b/torch/csrc/jit/tensorexpr/operators/misc.cpp @@ -136,7 +136,7 @@ ExprHandle promoteIntegerToDefaultType(const ExprHandle& e) { ExprHandle demoteOutput( const ExprHandle& e, - const c10::optional type) { + const std::optional type) { if (!type.has_value()) { return e; } @@ -160,7 +160,7 @@ ExprHandle demoteOutput( return e; } -c10::optional getTensorInfo(BufHandle b) { +std::optional getTensorInfo(BufHandle b) { std::vector dims; for (auto dim : b.dims()) { auto val = intValue(dim.node()); @@ -321,7 +321,7 @@ Tensor computeChunk( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return Compute( "prim_constantchunk", @@ -355,7 +355,7 @@ Tensor computeTranspose( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { auto A = std::get(inputs[0]); // Trivial case of 0-dim and 1-dim tensors: transpose is just a copy @@ -382,7 +382,7 @@ Tensor computeExpand( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { auto A = std::get(inputs[0]); return Compute( @@ -396,7 +396,7 @@ Tensor computeReshape( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { auto A = std::get(inputs[0]); if (A.ndim() == 0) { @@ -464,7 +464,7 @@ Tensor computeFlatten( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { std::vector outputShapeVec; for (const auto dim : c10::irange(outputShape.size())) { @@ -622,7 +622,7 @@ Tensor 
computeCat( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { if (device == at::kCPU && getCatWoConditionals()) { return computeCatWoConditionals(inputs, outputShape, outputStrides); @@ -685,7 +685,7 @@ Tensor computeEmbedding( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { Dtype dtype = kFloat; if (outputType) { diff --git a/torch/csrc/jit/tensorexpr/operators/misc.h b/torch/csrc/jit/tensorexpr/operators/misc.h index 5650b35147b17..50f53b0b50d07 100644 --- a/torch/csrc/jit/tensorexpr/operators/misc.h +++ b/torch/csrc/jit/tensorexpr/operators/misc.h @@ -12,7 +12,7 @@ struct TensorInfo { std::vector dims; c10::ScalarType dtype; }; -c10::optional getTensorInfo(BufHandle b); +std::optional getTensorInfo(BufHandle b); int64_t normalizeAndCheckIndex(int64_t idx, int64_t list_size); @@ -26,7 +26,7 @@ ExprHandle promoteIntegerToDefaultType(const ExprHandle& e); ExprHandle promoteHalfToFloat(const ExprHandle& e); ExprHandle demoteOutput( const ExprHandle& e, - const c10::optional type); + const std::optional type); std::vector broadcastShapes( std::vector> shapes); @@ -51,31 +51,31 @@ Tensor computeChunk( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); Tensor computeTranspose( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); Tensor computeExpand( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); Tensor computeReshape( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); Tensor computeFlatten( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); Tensor computeCatWoConditionals( const std::vector& inputs, @@ -84,13 +84,13 @@ Tensor computeCat( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); Tensor computeEmbedding( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); } // namespace tensorexpr diff --git a/torch/csrc/jit/tensorexpr/operators/norm.cpp b/torch/csrc/jit/tensorexpr/operators/norm.cpp index 335cfae05f4d4..c87a931d1fc43 100644 --- a/torch/csrc/jit/tensorexpr/operators/norm.cpp +++ b/torch/csrc/jit/tensorexpr/operators/norm.cpp @@ -9,7 +9,7 @@ Tensor computeBatchNorm( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { bool hasWeight = true; bool hasBias = true; diff --git a/torch/csrc/jit/tensorexpr/operators/norm.h b/torch/csrc/jit/tensorexpr/operators/norm.h index 
7c8cc43387b01..dbe6140cca8b4 100644 --- a/torch/csrc/jit/tensorexpr/operators/norm.h +++ b/torch/csrc/jit/tensorexpr/operators/norm.h @@ -10,7 +10,7 @@ Tensor computeBatchNorm( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); } // namespace tensorexpr diff --git a/torch/csrc/jit/tensorexpr/operators/pointwise.cpp b/torch/csrc/jit/tensorexpr/operators/pointwise.cpp index 57c63fcd92391..19aad4d015e27 100644 --- a/torch/csrc/jit/tensorexpr/operators/pointwise.cpp +++ b/torch/csrc/jit/tensorexpr/operators/pointwise.cpp @@ -10,7 +10,7 @@ using namespace torch::jit::tensorexpr; Tensor computeSign( const std::vector& inputValues, const std::vector& outputShape, - c10::optional> outputStrides) { + std::optional> outputStrides) { return Compute( "aten_sign", outputShape, outputStrides, [&](ParameterList& axes) { std::vector indices(axes.begin(), axes.end()); @@ -28,7 +28,7 @@ Tensor computeOneOperand( const std::vector& inputValues, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, const std::function& innerExpr, const int checkParamTypes) { return Compute( @@ -51,7 +51,7 @@ Tensor computeTwoOperand( const std::vector& inputValues, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, const std::function& innerExpr) { return Compute( @@ -76,7 +76,7 @@ Tensor computeTwoOperandWithAlpha( const std::vector& inputValues, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, const std::function& innerExpr) { return Compute( @@ -102,7 +102,7 @@ Tensor computeConditionWithTwoOperand( const std::vector& inputValues, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, const std::function< ExprHandle(const ExprHandle&, const ExprHandle&, const ExprHandle&)>& innerExpr) { @@ -131,7 +131,7 @@ Tensor computeThreeOperand( const std::vector& inputValues, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, const std::function< ExprHandle(const ExprHandle&, const ExprHandle&, const ExprHandle&)>& innerExpr, @@ -161,7 +161,7 @@ Tensor computeFourOperand( const std::vector& inputValues, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, const std::function& inputValues, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { return computeOneOperand( "copy", @@ -207,7 +207,7 @@ Tensor computeScalar( const std::vector& inputValues, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, const std::function& innerExpr) { auto dt = Dtype(*outputType); diff --git a/torch/csrc/jit/tensorexpr/operators/pointwise.h b/torch/csrc/jit/tensorexpr/operators/pointwise.h index 8de218dbb0383..0ce10424b3d30 100644 --- a/torch/csrc/jit/tensorexpr/operators/pointwise.h +++ b/torch/csrc/jit/tensorexpr/operators/pointwise.h @@ -9,14 +9,14 @@ namespace tensorexpr { TORCH_API Tensor computeSign( const std::vector& inputs, const 
std::vector& outputShape, - c10::optional> outputStrides = c10::nullopt); + std::optional> outputStrides = c10::nullopt); Tensor computeOneOperand( const std::string& name, const std::vector& inputValues, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, const std::function& innerExpr, const int checkParamTypes = kAllTypes); Tensor computeTwoOperand( @@ -24,7 +24,7 @@ Tensor computeTwoOperand( const std::vector& inputValues, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, const std::function& innerExpr); Tensor computeTwoOperandWithAlpha( @@ -32,7 +32,7 @@ Tensor computeTwoOperandWithAlpha( const std::vector& inputValues, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, const std::function& innerExpr); Tensor computeConditionWithTwoOperand( @@ -40,7 +40,7 @@ Tensor computeConditionWithTwoOperand( const std::vector& inputValues, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, const std::function< ExprHandle(const ExprHandle&, const ExprHandle&, const ExprHandle&)>& innerExpr); @@ -49,7 +49,7 @@ Tensor computeThreeOperand( const std::vector& inputValues, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, const std::function< ExprHandle(const ExprHandle&, const ExprHandle&, const ExprHandle&)>& innerExpr, @@ -59,7 +59,7 @@ Tensor computeFourOperand( const std::vector& inputValues, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, const std::function& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); Tensor computeScalar( @@ -77,7 +77,7 @@ Tensor computeScalar( const std::vector& inputValues, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, const std::function& innerExpr); diff --git a/torch/csrc/jit/tensorexpr/operators/quantization.cpp b/torch/csrc/jit/tensorexpr/operators/quantization.cpp index da6d43cbb7aa9..66c0688538a1d 100644 --- a/torch/csrc/jit/tensorexpr/operators/quantization.cpp +++ b/torch/csrc/jit/tensorexpr/operators/quantization.cpp @@ -141,7 +141,7 @@ Tensor computeQuantizePerTensor( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional&, + const std::optional&, at::Device) { std::vector vars; std::vector indices; @@ -181,7 +181,7 @@ Tensor computeQuantizedAdd( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device) { const BufHandle& QA = std::get(inputs[0]); const BufHandle& QB = std::get(inputs[1]); @@ -225,7 +225,7 @@ Tensor computeQuantizePerTensorExternalCall( const std::vector& outputShape, const std::vector& outputStrides, // NOLINTNEXTLINE - const c10::optional& outputType, + const std::optional& outputType, at::Device) { const BufHandle& x = std::get(inputs[0]); const auto qscale = std::get(inputs[1]); @@ -257,7 +257,7 @@ Tensor computeDequantizeExternalCall( const std::vector& 
inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device) { Dtype dtype = kFloat; if (outputType) { @@ -282,7 +282,7 @@ Tensor computeQuantizedConv2dPrepack( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device) { Dtype dtype = kFloat; if (outputType) { @@ -332,7 +332,7 @@ Tensor computeQuantizedConv1d( const std::vector& outputShape, const std::vector& outputStrides, // NOLINTNEXTLINE - const c10::optional& outputType, + const std::optional& outputType, // NOLINTNEXTLINE at::Device device) { const BufHandle& qx = std::get(inputs[0]); @@ -364,7 +364,7 @@ Tensor computeQuantizedConv2d( const std::vector& outputShape, const std::vector& outputStrides, // NOLINTNEXTLINE - const c10::optional& outputType, + const std::optional& outputType, // NOLINTNEXTLINE at::Device device) { const BufHandle& qx = std::get(inputs[0]); @@ -396,7 +396,7 @@ Tensor computeQuantizedConv2dRelu( const std::vector& outputShape, const std::vector& outputStrides, // NOLINTNEXTLINE - const c10::optional& outputType, + const std::optional& outputType, // NOLINTNEXTLINE at::Device device) { const BufHandle& qx = std::get(inputs[0]); @@ -428,7 +428,7 @@ Tensor computeQuantizedLinear( const std::vector& outputShape, const std::vector& outputStrides, // NOLINTNEXTLINE - const c10::optional& outputType, + const std::optional& outputType, // NOLINTNEXTLINE at::Device device) { const BufHandle& qx = std::get(inputs[0]); @@ -460,7 +460,7 @@ Tensor computeQuantizedLinearRelu( const std::vector& outputShape, const std::vector& outputStrides, // NOLINTNEXTLINE - const c10::optional& outputType, + const std::optional& outputType, // NOLINTNEXTLINE at::Device device) { const BufHandle& qx = std::get(inputs[0]); @@ -492,7 +492,7 @@ Tensor computeQuantizedAddExternalCall( const std::vector& outputShape, const std::vector& outputStrides, // NOLINTNEXTLINE - const c10::optional& outputType, + const std::optional& outputType, // NOLINTNEXTLINE at::Device device) { const BufHandle& qa = std::get(inputs[0]); @@ -536,7 +536,7 @@ Tensor computeQuantizedMul( const std::vector& outputShape, const std::vector& outputStrides, // NOLINTNEXTLINE - const c10::optional& outputType, + const std::optional& outputType, // NOLINTNEXTLINE at::Device device) { const BufHandle& qa = std::get(inputs[0]); @@ -567,7 +567,7 @@ Tensor computeQuantizedMulScalar( const std::vector& outputShape, const std::vector& outputStrides, // NOLINTNEXTLINE - const c10::optional& outputType, + const std::optional& outputType, // NOLINTNEXTLINE at::Device device) { const BufHandle& qa = std::get(inputs[0]); @@ -594,7 +594,7 @@ Tensor computeQuantizedRelu( const std::vector& outputShape, const std::vector& outputStrides, // NOLINTNEXTLINE - const c10::optional& outputType, + const std::optional& outputType, // NOLINTNEXTLINE at::Device device) { const BufHandle& qa = std::get(inputs[0]); @@ -625,7 +625,7 @@ Tensor computeQuantizedCat( const std::vector& outputShape, const std::vector& outputStrides, // NOLINTNEXTLINE - const c10::optional& outputType, + const std::optional& outputType, // NOLINTNEXTLINE at::Device device) { // NOLINTNEXTLINE(performance-unnecessary-copy-initialization) @@ -663,7 +663,7 @@ Tensor computeDequantize( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + 
const std::optional& outputType, at::Device) { Dtype dtype = kFloat; if (outputType) { @@ -695,7 +695,7 @@ Tensor computeUpsampleNearest2d( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device) { auto A = std::get(inputs[0]); const auto& output_height = outputShape[2]; @@ -742,7 +742,7 @@ Tensor computeUpsampleNearest2dExternalCall( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device) { Dtype dtype = kFloat; if (outputType) { @@ -802,7 +802,7 @@ Tensor computeQuantizedSigmoidExternalCall( const std::vector& outputShape, const std::vector& outputStrides, // NOLINTNEXTLINE - const c10::optional& outputType, + const std::optional& outputType, at::Device) { const BufHandle& qx = std::get(inputs[0]); diff --git a/torch/csrc/jit/tensorexpr/operators/quantization.h b/torch/csrc/jit/tensorexpr/operators/quantization.h index 019b2349b1840..d48c9e3273ba0 100644 --- a/torch/csrc/jit/tensorexpr/operators/quantization.h +++ b/torch/csrc/jit/tensorexpr/operators/quantization.h @@ -20,140 +20,140 @@ TORCH_API Tensor computeQuantizePerTensor( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeQuantizePerTensorExternalCall( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeQuantizedConv1d( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeQuantizedConv2dPrepack( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeQuantizedConv1d( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeQuantizedConv2d( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeQuantizedConv2dRelu( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeQuantizedLinear( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeQuantizedLinearRelu( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeQuantizedAdd( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); Tensor computeQuantizedAddExternalCall( 
const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeQuantizedMul( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeQuantizedMulScalar( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeQuantizedCat( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeQuantizedRelu( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeDequantize( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeDequantizeExternalCall( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeUpsampleNearest2d( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeUpsampleNearest2dExternalCall( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeQuantizedSigmoidExternalCall( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device); } // namespace tensorexpr } // namespace jit diff --git a/torch/csrc/jit/tensorexpr/operators/reduction.cpp b/torch/csrc/jit/tensorexpr/operators/reduction.cpp index dfd6e2d01adf5..b5f53560c9be3 100644 --- a/torch/csrc/jit/tensorexpr/operators/reduction.cpp +++ b/torch/csrc/jit/tensorexpr/operators/reduction.cpp @@ -23,7 +23,7 @@ Tensor computeSum( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { std::vector axes; bool keepdim = false; @@ -108,7 +108,7 @@ Tensor computeMean( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { Dtype dtype = kFloat; if (outputType) { @@ -140,7 +140,7 @@ Tensor computeMax( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) { Dtype dtype = kFloat; if (outputType) { @@ -164,7 +164,7 @@ Tensor computeAdaptiveAvgPool2d( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device) 
{ Dtype dtype = kFloat; if (outputType) { diff --git a/torch/csrc/jit/tensorexpr/operators/reduction.h b/torch/csrc/jit/tensorexpr/operators/reduction.h index 6265c4d265858..7d25e14a171ce 100644 --- a/torch/csrc/jit/tensorexpr/operators/reduction.h +++ b/torch/csrc/jit/tensorexpr/operators/reduction.h @@ -10,25 +10,25 @@ TORCH_API Tensor computeSum( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeMean( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); TORCH_API Tensor computeAdaptiveAvgPool2d( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); Tensor computeMax( const std::vector& inputs, const std::vector& outputShape, const std::vector& outputStrides, - const c10::optional& outputType, + const std::optional& outputType, at::Device device); } // namespace tensorexpr diff --git a/torch/csrc/jit/tensorexpr/tensor.cpp b/torch/csrc/jit/tensorexpr/tensor.cpp index 746a9a8cd1f0b..5bc734bb80b83 100644 --- a/torch/csrc/jit/tensorexpr/tensor.cpp +++ b/torch/csrc/jit/tensorexpr/tensor.cpp @@ -99,7 +99,7 @@ StmtPtr Tensor::constructStmt( Tensor Compute( const std::string& name, const std::vector& dims, - c10::optional> strides, + std::optional> strides, const std::function&)>& body_func) { std::vector args = create_index_vars(dims); ExprHandle body = body_func(args); @@ -116,7 +116,7 @@ Tensor Compute( Tensor Compute( const std::string& name, const std::vector& dims, - c10::optional> strides, + std::optional> strides, const std::function& body_func) { if (dims.size() != 1) { throw malformed_input("mismatch between body and arg size (1)"); @@ -137,7 +137,7 @@ Tensor Compute( Tensor Compute( const std::string& name, const std::vector& dims, - c10::optional> strides, + std::optional> strides, const std::function& body_func) { if (dims.size() != 2) { @@ -159,7 +159,7 @@ Tensor Compute( Tensor Compute( const std::string& name, const std::vector& dims, - c10::optional> strides, + std::optional> strides, const std::function< ExprHandle(const VarHandle&, const VarHandle&, const VarHandle&)>& body_func) { @@ -183,7 +183,7 @@ Tensor Compute( Tensor Compute( const std::string& name, const std::vector& dims, - c10::optional> strides, + std::optional> strides, const std::function& dims, - c10::optional> strides, + std::optional> strides, const Reducer& reducer, const BufHandle& buffer, const std::vector& reduce_dims) { @@ -235,7 +235,7 @@ Tensor Reduce( Tensor Reduce( const std::string& name, const std::vector& dims, - c10::optional> strides, + std::optional> strides, const Reducer& reducer, Tensor tensor, const std::vector& reduce_dims) { diff --git a/torch/csrc/jit/tensorexpr/tensor.h b/torch/csrc/jit/tensorexpr/tensor.h index 698de07f2be54..7b589d0974b37 100644 --- a/torch/csrc/jit/tensorexpr/tensor.h +++ b/torch/csrc/jit/tensorexpr/tensor.h @@ -75,7 +75,7 @@ class TORCH_API Tensor { TORCH_API Tensor Compute( const std::string& func_name, const std::vector& dims, - c10::optional> strides, + std::optional> strides, const std::function& body_func); TORCH_API Tensor Compute( const std::string& func_name, @@ -84,7 +84,7 @@ TORCH_API Tensor Compute( TORCH_API Tensor Compute( const std::string& 
func_name, const std::vector& dims, - c10::optional> strides, + std::optional> strides, const std::function& body_func); TORCH_API Tensor Compute( @@ -95,7 +95,7 @@ TORCH_API Tensor Compute( TORCH_API Tensor Compute( const std::string& func_name, const std::vector& dims, - c10::optional> strides, + std::optional> strides, const std::function< ExprHandle(const VarHandle&, const VarHandle&, const VarHandle&)>& body_func); @@ -108,7 +108,7 @@ TORCH_API Tensor Compute( TORCH_API Tensor Compute( const std::string& func_name, const std::vector& dims, - c10::optional> strides, + std::optional> strides, const std::function& dims, - c10::optional> strides, + std::optional> strides, const std::function&)>& body_func); TORCH_API Tensor Compute( const std::string& func_name, @@ -148,7 +148,7 @@ template Tensor Reduce( const std::string& func_name, const std::vector& dims, - c10::optional> strides, + std::optional> strides, const Reducer& reducer, const InitFunc& init_func, const BodyFunc& body_func, @@ -217,7 +217,7 @@ template Tensor Reduce( const std::string& func_name, const std::vector& dims, - c10::optional> strides, + std::optional> strides, const Reducer& reducer, const BodyFunc& body_func, const std::vector& reduce_dims) { @@ -246,7 +246,7 @@ template Tensor Reduce( const std::string& func_name, const std::vector& dims, - c10::optional> strides, + std::optional> strides, const Reducer& reducer, const BodyFunc&& body_func, const std::vector& reduce_dims) { @@ -265,7 +265,7 @@ Tensor Reduce( TORCH_API Tensor Reduce( const std::string& name, const std::vector& dims, - c10::optional> strides, + std::optional> strides, const Reducer& reducer, const BufHandle& buffer, const std::vector& reduce_dims); @@ -281,7 +281,7 @@ TORCH_API Tensor Reduce( TORCH_API Tensor Reduce( const std::string& func_name, const std::vector& dims, - c10::optional> strides, + std::optional> strides, const Reducer& reducer, Tensor tensor, const std::vector& reduce_dims); diff --git a/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp b/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp index f6e0b270c92ca..204326dc03e21 100644 --- a/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp +++ b/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp @@ -936,13 +936,13 @@ void initTensorExprBindings(PyObject* module) { &tensorexpr::replaceListOutputWithTuple); te.def("trim_graph", &tensorexpr::trimGraph); #ifdef TORCH_ENABLE_LLVM - te.def("set_llvm_target_triple", [](const c10::optional& val) { + te.def("set_llvm_target_triple", [](const std::optional& val) { tensorexpr::LLVMTargetTriple() = val; }); - te.def("set_llvm_target_cpu", [](const c10::optional& val) { + te.def("set_llvm_target_cpu", [](const std::optional& val) { tensorexpr::LLVMTargetCPU() = val; }); - te.def("set_llvm_target_attrs", [](const c10::optional& val) { + te.def("set_llvm_target_attrs", [](const std::optional& val) { tensorexpr::LLVMTargetAttrs() = val; }); te.def("set_llvm_aot_workflow", [](bool val) { diff --git a/torch/csrc/jit/testing/file_check.cpp b/torch/csrc/jit/testing/file_check.cpp index e1f87fccf7266..ec0011f40d775 100644 --- a/torch/csrc/jit/testing/file_check.cpp +++ b/torch/csrc/jit/testing/file_check.cpp @@ -43,17 +43,17 @@ struct Check { Check( CheckType type, std::string str, - c10::optional count = c10::nullopt) + std::optional count = c10::nullopt) : type_(type), count_(count), search_str_(std::move(str)) {} Check( CheckType type, c10::string_view str, - c10::optional count = c10::nullopt) + std::optional count = c10::nullopt) : Check(type, std::string(str.begin(), 
str.end()), count) {} CheckType type_; - c10::optional count_; + std::optional count_; const std::string search_str_; friend std::ostream& operator<<(std::ostream& out, const Check& c); @@ -234,7 +234,7 @@ struct FileCheckImpl { TORCH_API void addCheck( CheckType type, const std::string& s, - c10::optional count = c10::nullopt) { + std::optional count = c10::nullopt) { addCheck(Check(type, s, count)); } @@ -264,7 +264,7 @@ struct FileCheckImpl { } size_t end_check_string = suffix_pos + check_suffix.size(); CheckType type = check_pair.first; - c10::optional count = c10::nullopt; + std::optional count = c10::nullopt; auto end_line = source->text_str().find("\n", end_check_string); bool exactly = false; if (type == CHECK_COUNT) { diff --git a/torch/csrc/lazy/backend/backend_device.cpp b/torch/csrc/lazy/backend/backend_device.cpp index eaf3d6b28c07c..6d146ca0881ce 100644 --- a/torch/csrc/lazy/backend/backend_device.cpp +++ b/torch/csrc/lazy/backend/backend_device.cpp @@ -54,7 +54,7 @@ c10::Device backendDeviceToAtenDevice(const BackendDevice& device) { return c10::Device(at::kLazy, device.ordinal()); } -c10::optional GetBackendDevice(at::ITensorListRef tensors) { +std::optional GetBackendDevice(at::ITensorListRef tensors) { for (auto& tensor : tensors) { if (auto lt = TryGetLtcTensor(tensor)) { return lt->GetDevice(); @@ -63,26 +63,26 @@ c10::optional GetBackendDevice(at::ITensorListRef tensors) { return c10::nullopt; } -c10::optional GetBackendDevice(at::TensorList tensors) { +std::optional GetBackendDevice(at::TensorList tensors) { return GetBackendDevice(at::ITensorListRef(tensors)); } -c10::optional GetBackendDevice(const at::Tensor& tensor) { +std::optional GetBackendDevice(const at::Tensor& tensor) { if (auto lt = TryGetLtcTensor(tensor)) { return lt->GetDevice(); } return c10::nullopt; } -c10::optional GetBackendDevice( - const c10::optional& device) { +std::optional GetBackendDevice( + const std::optional& device) { if (device) { return c10::make_optional(atenDeviceToBackendDevice(*device)); } return c10::nullopt; } -c10::optional GetBackendDevice() { +std::optional GetBackendDevice() { return c10::nullopt; } diff --git a/torch/csrc/lazy/backend/backend_device.h b/torch/csrc/lazy/backend/backend_device.h index 4c239d1e4b71c..e80c800a2ecea 100644 --- a/torch/csrc/lazy/backend/backend_device.h +++ b/torch/csrc/lazy/backend/backend_device.h @@ -73,20 +73,20 @@ TORCH_API c10::Device backendDeviceToAtenDevice(const BackendDevice& device); // Tries to extract the backend device out of the lazy tensor. Returns nullopt // if the input is not a lazy tensor. -TORCH_API c10::optional GetBackendDevice( +TORCH_API std::optional GetBackendDevice( const at::ITensorListRef tensors); -TORCH_API c10::optional GetBackendDevice( +TORCH_API std::optional GetBackendDevice( const at::TensorList tensors); -TORCH_API c10::optional GetBackendDevice( +TORCH_API std::optional GetBackendDevice( const at::Tensor& tensor); -TORCH_API c10::optional GetBackendDevice( - const c10::optional& device); +TORCH_API std::optional GetBackendDevice( + const std::optional& device); // For variadic template. -TORCH_API c10::optional GetBackendDevice(); +TORCH_API std::optional GetBackendDevice(); template -c10::optional GetBackendDevice( +std::optional GetBackendDevice( const T& tensor, const Args&... 
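In the FileCheck changes above, Check::count_ becomes a std::optional that defaults to nullopt, distinguishing a plain CHECK (any single match) from CHECK-COUNT-n. A small standalone sketch of that convention; the helper below is illustrative, not the torch::jit::testing API.

    // Illustrative only: nullopt means "no count constraint".
    #include <iostream>
    #include <optional>
    #include <string>

    std::string describe(const std::string& pattern, std::optional<size_t> count) {
      if (!count) {
        return "CHECK: " + pattern;  // plain check
      }
      return "CHECK-COUNT-" + std::to_string(*count) + ": " + pattern;
    }

    int main() {
      std::cout << describe("foo", std::nullopt) << '\n';  // CHECK: foo
      std::cout << describe("bar", 3) << '\n';             // CHECK-COUNT-3: bar
    }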
forward_tensors) { auto optional_device = GetBackendDevice(tensor); diff --git a/torch/csrc/lazy/backend/backend_interface.h b/torch/csrc/lazy/backend/backend_interface.h index f94d3b602e52c..366311921c394 100644 --- a/torch/csrc/lazy/backend/backend_interface.h +++ b/torch/csrc/lazy/backend/backend_interface.h @@ -63,7 +63,7 @@ class TORCH_API BackendImplInterface { virtual at::Tensor MakeTensorFromComputationData( const BackendDataPtr data, - c10::optional logical_scalar_type) const = 0; + std::optional logical_scalar_type) const = 0; /** * Lowering, Compilation, Execution diff --git a/torch/csrc/lazy/core/hash.h b/torch/csrc/lazy/core/hash.h index bb6a779555f22..19f57546c9a43 100644 --- a/torch/csrc/lazy/core/hash.h +++ b/torch/csrc/lazy/core/hash.h @@ -135,6 +135,12 @@ static inline hash_t TensorHash(const at::Tensor& tensor) { return DataHash(ctensor.const_data_ptr>(), size); case at::ScalarType::ComplexDouble: return DataHash(ctensor.const_data_ptr>(), size); + case at::ScalarType::UInt16: + return DataHash(ctensor.const_data_ptr(), size); + case at::ScalarType::UInt32: + return DataHash(ctensor.const_data_ptr(), size); + case at::ScalarType::UInt64: + return DataHash(ctensor.const_data_ptr(), size); default: TORCH_INTERNAL_ASSERT( false, "Unsupported scalar type:", ctensor.scalar_type()); @@ -163,11 +169,11 @@ static inline hash_t Hash(const at::Generator& value) { // repeatedly hash a constant at runtime. static const int64_t kNullOpt = 0x8655d738f3678dda; -// Hashing for c10::optional types contributes to hash +// Hashing for std::optional types contributes to hash // for optionals with null value, important to distinguish // between and cases template -hash_t Hash(const c10::optional& value) { +hash_t Hash(const std::optional& value) { if (value.has_value()) { return Hash(value.value()); } else { @@ -187,7 +193,7 @@ hash_t Hash(const std::vector& values) { // Need a special case for optional? template -hash_t Hash(const c10::optional>& value) { +hash_t Hash(const std::optional>& value) { if (value.has_value()) { return ContainerHash(value.value()); } else { diff --git a/torch/csrc/lazy/core/ir_builder.h b/torch/csrc/lazy/core/ir_builder.h index 3b58d00aace6c..981e166777294 100644 --- a/torch/csrc/lazy/core/ir_builder.h +++ b/torch/csrc/lazy/core/ir_builder.h @@ -61,7 +61,7 @@ struct IrBuilder { virtual NodePtr MakeCast( const Value& input0, const at::ScalarType& dtype, - const c10::optional& stype = c10::nullopt) const = 0; + const std::optional& stype = c10::nullopt) const = 0; virtual NodePtr MakeTensorList(const OpList& inputs) const = 0; virtual NodePtr MakeGeneric( const OpKind& op, @@ -96,7 +96,7 @@ static inline NodePtr MakeExpand( static inline NodePtr MakeCast( const Value& input0, const at::ScalarType& dtype, - const c10::optional& stype = c10::nullopt) { + const std::optional& stype = c10::nullopt) { return getIrBuilder()->MakeCast(input0, dtype, stype); } static inline NodePtr MakeTensorList(const OpList& inputs) { diff --git a/torch/csrc/lazy/core/ir_dump_util.cpp b/torch/csrc/lazy/core/ir_dump_util.cpp index 19cb2ae7b1624..a4fb11761a67c 100644 --- a/torch/csrc/lazy/core/ir_dump_util.cpp +++ b/torch/csrc/lazy/core/ir_dump_util.cpp @@ -28,7 +28,7 @@ std::string::size_type SkipTagSeparator( return node_string.compare(pos, 2, ", ") == 0 ? 
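The Hash overload for std::optional in torch/csrc/lazy/core/hash.h above salts empty optionals with the fixed constant kNullOpt, so hashing an absent value never collides with hashing a present one (e.g. (nullopt, 1) vs. (1)). A standalone sketch of that idea, with std::hash standing in for the lazy hash_t machinery:

    // Sketch, not torch::lazy::Hash: empty optionals contribute a sentinel.
    #include <cstdint>
    #include <functional>
    #include <iostream>
    #include <optional>

    constexpr uint64_t kNullOpt = 0x8655d738f3678dda;  // same sentinel as the diff

    template <typename T>
    uint64_t HashOpt(const std::optional<T>& value) {
      return value.has_value() ? uint64_t(std::hash<T>{}(*value)) : kNullOpt;
    }

    int main() {
      std::cout << HashOpt<int>(std::nullopt) << '\n';
      std::cout << HashOpt<int>(42) << '\n';
    }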
pos + 2 : pos; } -c10::optional ParseAttrTag( +std::optional ParseAttrTag( const std::string& node_string, std::string::size_type pos) { // @lint-ignore-every CLANGTIDY facebook-hte-StdRegexIsAwful @@ -97,7 +97,7 @@ std::unordered_map GetRootsIds( return roots_ids; } -c10::optional GetRootNodeId( +std::optional GetRootNodeId( const Node* node, const std::unordered_map& roots_ids) { auto it = roots_ids.find(node); diff --git a/torch/csrc/lazy/core/lazy_graph_executor.cpp b/torch/csrc/lazy/core/lazy_graph_executor.cpp index afeac5e75e6c3..a2b67c958313a 100644 --- a/torch/csrc/lazy/core/lazy_graph_executor.cpp +++ b/torch/csrc/lazy/core/lazy_graph_executor.cpp @@ -610,7 +610,7 @@ LazyGraphExecutor::SyncTensorCollection LazyGraphExecutor::CollectSyncTensors( } else if (config.force_ltc_data) { // The tensor only has at::Tensor data. We need to queue it for a // device upload. - c10::optional tensor_data = tensors[i]->CurrentTensorData(); + std::optional tensor_data = tensors[i]->CurrentTensorData(); TORCH_CHECK(tensor_data); at_tensors.push_back(*tensor_data); devices.push_back(tensors[i]->GetDevice()); @@ -996,7 +996,7 @@ std::vector LazyGraphExecutor::FetchTensors( ++literals_index; ++sync_index; } else { - c10::optional tensor_data = + std::optional tensor_data = (*tensors)[i]->CurrentTensorData(); if (tensor_data) { results.push_back(*tensor_data); diff --git a/torch/csrc/lazy/core/shape.cpp b/torch/csrc/lazy/core/shape.cpp index 200dd8fac7895..939e2745ed393 100644 --- a/torch/csrc/lazy/core/shape.cpp +++ b/torch/csrc/lazy/core/shape.cpp @@ -13,7 +13,7 @@ namespace lazy { Shape::Shape( at::ScalarType scalar_type, c10::ArrayRef sizes, - c10::optional> is_symbolic) + std::optional> is_symbolic) : scalar_type_(scalar_type), sizes_(sizes.begin(), sizes.end()), is_symbolic_(std::move(is_symbolic)) {} @@ -49,7 +49,7 @@ hash_t Shape::hash(bool bakeInSizes) const { } Shape Shape::with_symbolic_dims( - c10::optional> symbolic_dims) const { + std::optional> symbolic_dims) const { Shape copy = *this; copy.is_symbolic_ = symbolic_dims; return copy; @@ -75,7 +75,7 @@ static c10::SymbolicShape get_symbolic_shape(at::Tensor& tensor) { TORCH_INTERNAL_ASSERT( sizes.size() == is_symbolic->size(), "Dims of two values are not consistent"); - std::vector> symbolic_dims; + std::vector> symbolic_dims; for (size_t i = 0; i < sizes.size(); i++) { if (is_symbolic->at(i)) { symbolic_dims.emplace_back(c10::nullopt); diff --git a/torch/csrc/lazy/core/shape.h b/torch/csrc/lazy/core/shape.h index 1c6b4d5bb3d81..63566619fd149 100644 --- a/torch/csrc/lazy/core/shape.h +++ b/torch/csrc/lazy/core/shape.h @@ -19,7 +19,7 @@ class TORCH_API Shape { Shape( at::ScalarType scalar_type, c10::ArrayRef sizes, - c10::optional> is_symbolic = c10::nullopt); + std::optional> is_symbolic = c10::nullopt); std::string to_string() const; @@ -43,13 +43,13 @@ class TORCH_API Shape { sizes_.at(dim) = size; } - const c10::optional>& is_symbolic() const { + const std::optional>& is_symbolic() const { return is_symbolic_; } // Makes a copy with symbolic dims applied Shape with_symbolic_dims( - c10::optional> symbolic_dims) const; + std::optional> symbolic_dims) const; size_t numel() const; hash_t hash(bool bakeInSizes) const; @@ -64,7 +64,7 @@ class TORCH_API Shape { // Stores which dimmensions are symbolic // If nullopt, either it hasn't been initialized or the symbolic // dimmensions are not calculatable - c10::optional> is_symbolic_ = c10::nullopt; + std::optional> is_symbolic_ = c10::nullopt; }; TORCH_API std::ostream& 
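Shape::with_symbolic_dims above returns a copy of the shape with only the optional symbolic-dimension mask replaced, leaving the original untouched. A simplified stand-in with plain C++ types (not torch::lazy::Shape):

    // Copy-with-modification sketch for an optional member.
    #include <iostream>
    #include <optional>
    #include <vector>

    struct Shape {
      std::vector<long> sizes;
      std::optional<std::vector<bool>> is_symbolic;

      Shape with_symbolic_dims(std::optional<std::vector<bool>> symbolic_dims) const {
        Shape copy = *this;
        copy.is_symbolic = std::move(symbolic_dims);
        return copy;
      }
    };

    int main() {
      Shape s{{2, 3}, std::nullopt};
      Shape t = s.with_symbolic_dims(std::vector<bool>{true, false});
      std::cout << (t.is_symbolic ? "mask set" : "mask unset") << '\n';  // mask set
      std::cout << (s.is_symbolic ? "mask set" : "mask unset") << '\n';  // mask unset
    }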
operator<<(std::ostream& out, const Shape& shape); diff --git a/torch/csrc/lazy/core/tensor.cpp b/torch/csrc/lazy/core/tensor.cpp index 541a0f6f5a070..ba0571f87df4d 100644 --- a/torch/csrc/lazy/core/tensor.cpp +++ b/torch/csrc/lazy/core/tensor.cpp @@ -197,7 +197,7 @@ Value LazyTensor::GetIrValue() const { AssignIrValue(CreateTensorNode(handle, /*read_only=*/false)); return data()->ir_value; } - c10::optional tensor_data = CurrentTensorData(); + std::optional tensor_data = CurrentTensorData(); TORCH_CHECK(tensor_data); AssignIrValue(GetIrValueForTensor(*tensor_data, GetDevice())); return data()->ir_value; @@ -211,7 +211,7 @@ void LazyTensor::SetTensorData(at::Tensor tensor_data) { data()->tensor_data = std::move(tensor_data); } -c10::optional LazyTensor::CurrentTensorData() const { +std::optional LazyTensor::CurrentTensorData() const { return data()->tensor_data; } @@ -236,7 +236,7 @@ Value LazyTensor::GetIrValueForTensor( at::Tensor LazyTensor::ToTensor(bool detached) { at::Tensor tensor; - c10::optional tensor_data = CurrentTensorData(); + std::optional tensor_data = CurrentTensorData(); if (!tensor_data) { LazyGraphExecutor::Get()->DeviceBarrier(GetDevice()); // The GetDataHandle() call will trigger an ApplyPendingGraph() if an IR @@ -373,7 +373,7 @@ std::vector GetLtcTensors(c10::ArrayRef tensors) { } LazyTensorPtr GetOrCreateLtcTensor( - const c10::optional& tensor, + const std::optional& tensor, const BackendDevice& device) { return GetOrCreateLtcTensor(tensor.value_or(at::Tensor()), device); } diff --git a/torch/csrc/lazy/core/tensor.h b/torch/csrc/lazy/core/tensor.h index 3a15c91c03452..afc52376c5545 100644 --- a/torch/csrc/lazy/core/tensor.h +++ b/torch/csrc/lazy/core/tensor.h @@ -47,7 +47,7 @@ class TORCH_API LazyTensor : public c10::intrusive_ptr_target { BackendDataPtr handle; Value ir_value; - c10::optional tensor_data; + std::optional tensor_data; const BackendDevice device; const int64_t unique_id = 0; size_t generation = 1; @@ -124,7 +124,7 @@ class TORCH_API LazyTensor : public c10::intrusive_ptr_target { void SetIrValue(Value ir_value); void SetInPlaceIrValue(Value ir_value); - c10::optional CurrentTensorData() const; + std::optional CurrentTensorData() const; std::vector MakeOutputTensors(NodePtr node) const; @@ -191,7 +191,7 @@ TORCH_API std::vector GetLtcTensors( // If tensor is a lazy tensor type, returns the LazyTensor embedded within it, // otherwise creates a new lazy tensor type with tensor as data. TORCH_API LazyTensorPtr GetOrCreateLtcTensor( - const c10::optional& tensor, + const std::optional& tensor, const BackendDevice& device); TORCH_API LazyTensorPtr GetLtcTensorOrCreateForWrappedNumber( diff --git a/torch/csrc/lazy/core/tensor_impl.h b/torch/csrc/lazy/core/tensor_impl.h index 6eca2212c08ed..a35c02a7aeac4 100644 --- a/torch/csrc/lazy/core/tensor_impl.h +++ b/torch/csrc/lazy/core/tensor_impl.h @@ -54,7 +54,7 @@ class TORCH_API LTCTensorImpl final : public c10::TensorImpl { void setup_size_properties(); LazyTensorPtr tensor_; - mutable c10::optional> sym_sizes_; + mutable std::optional> sym_sizes_; size_t generation_{0}; }; diff --git a/torch/csrc/lazy/core/tensor_util.h b/torch/csrc/lazy/core/tensor_util.h index e4e6a1b7f0c26..121235ef9d8c0 100644 --- a/torch/csrc/lazy/core/tensor_util.h +++ b/torch/csrc/lazy/core/tensor_util.h @@ -43,7 +43,7 @@ inline at::Tensor CopyTensor( } template -T OptionalOr(const c10::optional& value, T defval) { +T OptionalOr(const std::optional& value, T defval) { return value ? 
static_cast(*value) : defval; } diff --git a/torch/csrc/lazy/core/unique.h b/torch/csrc/lazy/core/unique.h index 0b156a29eb906..fc09c8d71d7d8 100644 --- a/torch/csrc/lazy/core/unique.h +++ b/torch/csrc/lazy/core/unique.h @@ -49,7 +49,7 @@ class Unique { } private: - c10::optional value_; + std::optional value_; }; } // namespace lazy diff --git a/torch/csrc/lazy/core/util.h b/torch/csrc/lazy/core/util.h index a3d35783ae969..e535e5365f227 100644 --- a/torch/csrc/lazy/core/util.h +++ b/torch/csrc/lazy/core/util.h @@ -89,7 +89,7 @@ class MaybeRef { } private: - c10::optional storage_; + std::optional storage_; const T& ref_; }; @@ -109,7 +109,7 @@ std::vector ToVector(const S& input) { } template -c10::optional> ToOptionalVector( +std::optional> ToOptionalVector( c10::OptionalArrayRef arrayRef) { if (arrayRef) { return arrayRef->vec(); diff --git a/torch/csrc/lazy/python/python_util.cpp b/torch/csrc/lazy/python/python_util.cpp index 703d43ca65059..90d9797e3fd35 100644 --- a/torch/csrc/lazy/python/python_util.cpp +++ b/torch/csrc/lazy/python/python_util.cpp @@ -11,7 +11,7 @@ namespace torch { namespace lazy { -c10::optional GetPythonFrameTop() { +std::optional GetPythonFrameTop() { if (!Py_IsInitialized()) { return c10::nullopt; } diff --git a/torch/csrc/lazy/python/python_util.h b/torch/csrc/lazy/python/python_util.h index 8040a023de518..456aafa880971 100644 --- a/torch/csrc/lazy/python/python_util.h +++ b/torch/csrc/lazy/python/python_util.h @@ -7,7 +7,7 @@ namespace torch { namespace lazy { -c10::optional TORCH_PYTHON_API GetPythonFrameTop(); +std::optional TORCH_PYTHON_API GetPythonFrameTop(); std::vector TORCH_PYTHON_API GetPythonFrames(); diff --git a/torch/csrc/lazy/ts_backend/ir_builder.h b/torch/csrc/lazy/ts_backend/ir_builder.h index 1f32a3521ba8a..c538292374434 100644 --- a/torch/csrc/lazy/ts_backend/ir_builder.h +++ b/torch/csrc/lazy/ts_backend/ir_builder.h @@ -33,7 +33,7 @@ struct TorchScriptIrBuilder : IrBuilder { NodePtr MakeCast( const Value& input0, const at::ScalarType& dtype, - const c10::optional& stype = + const std::optional& stype = c10::nullopt) const override { return ReuseOrMakeNode(input0, dtype, stype); } diff --git a/torch/csrc/lazy/ts_backend/ops/to_copy.h b/torch/csrc/lazy/ts_backend/ops/to_copy.h index 4b96b1c389f78..3a5f47411dfdd 100644 --- a/torch/csrc/lazy/ts_backend/ops/to_copy.h +++ b/torch/csrc/lazy/ts_backend/ops/to_copy.h @@ -18,12 +18,12 @@ class ToCopy : public torch::lazy::TsNode { ToCopy( const torch::lazy::Value& self, - const c10::optional& dtype, - const c10::optional& layout, - const c10::optional& device, - const c10::optional& pin_memory, + const std::optional& dtype, + const std::optional& layout, + const std::optional& device, + const std::optional& pin_memory, const bool& non_blocking, - const c10::optional& memory_format, + const std::optional& memory_format, std::vector&& shapes) : torch::lazy::TsNode( ClassOpKind(), @@ -47,12 +47,12 @@ class ToCopy : public torch::lazy::TsNode { bool CanBeReused( const torch::lazy::Value& self, - const c10::optional& dtype, - const c10::optional& layout, - const c10::optional& device, - const c10::optional& pin_memory, + const std::optional& dtype, + const std::optional& layout, + const std::optional& device, + const std::optional& pin_memory, const bool& non_blocking, - const c10::optional& memory_format) const { + const std::optional& memory_format) const { size_t i = 0; return ( operand(i++) == self && this->dtype == dtype && @@ -115,12 +115,12 @@ class ToCopy : public torch::lazy::TsNode { return 
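OptionalOr in tensor_util.h above returns the contained value cast to the result type, or a default when the optional is empty. The template parameters were lost in this rendering, so the self-contained version below assumes separate source and result types; treat it as a sketch rather than the exact header.

    // Hedged sketch of OptionalOr: value_or with a static_cast.
    #include <cstdint>
    #include <iostream>
    #include <optional>

    template <typename T, typename S>
    T OptionalOr(const std::optional<S>& value, T defval) {
      return value ? static_cast<T>(*value) : defval;
    }

    int main() {
      std::optional<int64_t> dim;                      // unset
      std::cout << OptionalOr<int>(dim, -1) << '\n';   // -1
      dim = 2;
      std::cout << OptionalOr<int>(dim, -1) << '\n';   // 2
    }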
_to_copy_out; } - c10::optional dtype; - c10::optional layout; - c10::optional device; - c10::optional pin_memory; + std::optional dtype; + std::optional layout; + std::optional device; + std::optional pin_memory; bool non_blocking; - c10::optional memory_format; + std::optional memory_format; }; } // namespace lazy diff --git a/torch/csrc/lazy/ts_backend/ts_backend_impl.cpp b/torch/csrc/lazy/ts_backend/ts_backend_impl.cpp index 927e2ba62c2de..b0a2d7568aef8 100644 --- a/torch/csrc/lazy/ts_backend/ts_backend_impl.cpp +++ b/torch/csrc/lazy/ts_backend/ts_backend_impl.cpp @@ -81,7 +81,7 @@ class TSBackendImpl : public torch::lazy::BackendImplInterface { at::Tensor MakeTensorFromComputationData( const torch::lazy::BackendDataPtr data, - c10::optional logical_scalar_type) const override { + std::optional logical_scalar_type) const override { const auto ts_data = std::static_pointer_cast(data); return ts_data->data(); } diff --git a/torch/csrc/lazy/ts_backend/ts_backend_impl.h b/torch/csrc/lazy/ts_backend/ts_backend_impl.h index d238e8263e577..0607c3efb5386 100644 --- a/torch/csrc/lazy/ts_backend/ts_backend_impl.h +++ b/torch/csrc/lazy/ts_backend/ts_backend_impl.h @@ -38,7 +38,7 @@ class TORCH_API TSData : public torch::lazy::BackendData { return data_; } - c10::optional scalar; + std::optional scalar; private: at::Tensor data_; diff --git a/torch/csrc/lazy/ts_backend/ts_eager_fallback.cpp b/torch/csrc/lazy/ts_backend/ts_eager_fallback.cpp index e59a665d7bc29..42acc2c5df10a 100644 --- a/torch/csrc/lazy/ts_backend/ts_eager_fallback.cpp +++ b/torch/csrc/lazy/ts_backend/ts_eager_fallback.cpp @@ -69,16 +69,16 @@ std::vector to_eager( return eager_tensors; } -std::vector> to_eager( - const std::vector>& tensors, +std::vector> to_eager( + const std::vector>& tensors, c10::DeviceType device_type) { // We can't just call _to_eager() on the entire list of Tensors because it // will break on undefined tensors. Separate out undefined tensors first. - std::vector> eager_tensors(tensors.size()); + std::vector> eager_tensors(tensors.size()); std::vector valid_tensors; std::vector to_translate(tensors.size()); for (size_t i = 0; i < tensors.size(); ++i) { - const c10::optional& tensor = tensors[i]; + const std::optional& tensor = tensors[i]; // Explicitly handling undefined tensors here instead of letting `_to_eager` // handle it. Otherwise, we'd need to require all backends with their own // implementation of _to_eager to properly handle undefined tensors. @@ -112,10 +112,10 @@ c10::DispatchKey dispatch_key(c10::DeviceType device_type) { } } -c10::optional compute_target_device( +std::optional compute_target_device( std::vector& t_args, std::vector> tlist_args, - std::vector>> opt_tlist_args) { + std::vector>> opt_tlist_args) { // Decide what device to move the output tensor(s) to. // The current convention is that we use the first tensor arg to pick the // device Barring that, we take the first tensor from a TensorList arg. @@ -217,7 +217,7 @@ void ts_eager_fallback( std::vector tensor_args_indices; std::vector> tensorlist_args; - std::vector>> opt_tensorlist_args; + std::vector>> opt_tensorlist_args; // Step 1: Convert all non-eager tensor inputs into eager tensors and put them // on the stack at the correct indices. 
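The to_eager() overload for optional tensors above cannot run the batch conversion over undefined entries, so it gathers the engaged values, converts them in one batch, and scatters the results back into their original slots. A minimal sketch of that gather/convert/scatter pattern with placeholder types (ints instead of tensors):

    #include <iostream>
    #include <optional>
    #include <vector>

    std::vector<int> batch_convert(const std::vector<int>& xs) {
      std::vector<int> out;
      out.reserve(xs.size());
      for (int x : xs) out.push_back(x * 10);  // stands in for the eager copy
      return out;
    }

    std::vector<std::optional<int>> to_eager(const std::vector<std::optional<int>>& in) {
      std::vector<std::optional<int>> out(in.size());
      std::vector<int> valid;
      std::vector<size_t> slots;
      for (size_t i = 0; i < in.size(); ++i) {
        if (in[i]) {            // skip "undefined" entries entirely
          valid.push_back(*in[i]);
          slots.push_back(i);
        }
      }
      auto converted = batch_convert(valid);
      for (size_t j = 0; j < slots.size(); ++j) out[slots[j]] = converted[j];
      return out;
    }

    int main() {
      auto r = to_eager({1, std::nullopt, 3});
      std::cout << r[0].value() << " - " << r[2].value() << '\n';  // 10 - 30
    }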
@@ -236,7 +236,7 @@ void ts_eager_fallback( (*stack)[arguments_begin + idx] = std::move(eager_ivalue); tensorlist_args.push_back(ivalue.toTensorList()); } else if (ivalue.isOptionalTensorList()) { - auto eager_ivalue = c10::IValue(c10::List>( + auto eager_ivalue = c10::IValue(c10::List>( to_eager(ivalue.toOptionalTensorVector(), device_type))); (*stack)[arguments_begin + idx] = std::move(eager_ivalue); opt_tensorlist_args.push_back(ivalue.toOptionalTensorList()); @@ -323,7 +323,7 @@ void ts_eager_fallback( "mutable alias: ", schema_returns[idx]); } else { - c10::optional tgt_device = compute_target_device( + std::optional tgt_device = compute_target_device( tensor_args, tensorlist_args, opt_tensorlist_args); if (alias_info != nullptr && !alias_info->isWrite()) { // immutable alias (view) case: Warn here, since we're copying and diff --git a/torch/csrc/lazy/ts_backend/ts_native_functions.cpp b/torch/csrc/lazy/ts_backend/ts_native_functions.cpp index 456ff4211ac1a..78ae6a6f6e2e5 100644 --- a/torch/csrc/lazy/ts_backend/ts_native_functions.cpp +++ b/torch/csrc/lazy/ts_backend/ts_native_functions.cpp @@ -28,7 +28,7 @@ namespace { at::Tensor CreateLtcTensor( const at::Tensor& tensor, - const c10::optional& device) { + const std::optional& device) { if (tensor.defined() && device) { return torch::lazy::CreateAtenFromLtcTensor( torch::lazy::LazyTensor::Create(tensor, *device)); @@ -36,8 +36,8 @@ at::Tensor CreateLtcTensor( return tensor; } -c10::optional GetLtcDevice( - const c10::optional& device) { +std::optional GetLtcDevice( + const std::optional& device) { if (!device) { return c10::nullopt; } @@ -53,7 +53,7 @@ c10::optional GetLtcDevice( // This should be safe to do, because every operator in the LT is functional. at::Tensor LazyNativeFunctions::clone( const at::Tensor& self, - c10::optional memory_format) { + std::optional memory_format) { auto self_lt = torch::lazy::TryGetLtcTensor(self); return torch::lazy::CreateAtenFromLtcTensor( self_lt->Create(self_lt->GetIrValue(), self_lt->GetDevice())); @@ -138,12 +138,12 @@ at::Tensor LazyNativeFunctions::_copy_from_and_resize( at::Tensor LazyNativeFunctions::_to_copy( const at::Tensor& self, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, bool non_blocking, - c10::optional memory_format) { + std::optional memory_format) { if (force_eager_fallback(at::aten::_to_copy)) { TORCH_INTERNAL_ASSERT( false, @@ -270,11 +270,11 @@ at::Tensor LazyNativeFunctions::_to_copy( at::Tensor LazyNativeFunctions::empty_symint( at::SymIntArrayRef sym_size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory, - c10::optional memory_format) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional memory_format) { // TODO: support this directly auto size = C10_AS_INTARRAYREF_SLOW(sym_size); const auto device_type = torch::lazy::getBackend()->EagerFallbackDeviceType(); @@ -301,10 +301,10 @@ at::Tensor LazyNativeFunctions::empty_symint( at::Tensor LazyNativeFunctions::empty_strided_symint( at::SymIntArrayRef sym_size, at::SymIntArrayRef sym_stride, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { TORCH_LAZY_FN_COUNTER("lazy::"); at::Tensor t = empty_symint(sym_size, 
dtype, layout, device, pin_memory, c10::nullopt); @@ -406,10 +406,10 @@ at::Tensor LazyNativeFunctions::new_empty_strided_symint( const at::Tensor& self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { return at::functionalization:: functionalize_aten_op_symint::call( self, size, stride, dtype, layout, device, pin_memory); @@ -457,8 +457,8 @@ at::Tensor LazyNativeFunctions::_trilinear( } at::Tensor LazyNativeFunctions::linalg_pinv( const at::Tensor& self, - const c10::optional& atol, - const c10::optional& rtol, + const std::optional& atol, + const std::optional& rtol, bool hermitian) { return at::functionalization::functionalize_aten_op::call(self, atol, rtol, hermitian); @@ -525,8 +525,8 @@ at::Tensor LazyNativeFunctions::slice_backward_symint( // backwards formula for native_group_norm std::tuple LazyNativeFunctions::native_group_norm( const at::Tensor& input, - const c10::optional& weight, - const c10::optional& bias, + const std::optional& weight, + const std::optional& bias, int64_t N, int64_t C, int64_t HxW, diff --git a/torch/csrc/onnx/init.cpp b/torch/csrc/onnx/init.cpp index 825ed46e11a50..b8bef342323c5 100644 --- a/torch/csrc/onnx/init.cpp +++ b/torch/csrc/onnx/init.cpp @@ -293,10 +293,6 @@ void initONNXBindings(PyObject* module) { onnx.attr("PRODUCER_VERSION") = py::str(TORCH_VERSION); -#ifdef BUILD_CAFFE2 - onnx.attr("_CAFFE2_ATEN_FALLBACK") = true; -#else onnx.attr("_CAFFE2_ATEN_FALLBACK") = false; -#endif } } // namespace torch::onnx diff --git a/torch/csrc/profiler/collection.h b/torch/csrc/profiler/collection.h index 3a129b3118d86..6822d39c225ac 100644 --- a/torch/csrc/profiler/collection.h +++ b/torch/csrc/profiler/collection.h @@ -61,9 +61,9 @@ struct TORCH_API RawTensorMetadata : RawTensorMetadataBase { RawTensorMetadata& operator=(RawTensorMetadata&&) noexcept = default; explicit RawTensorMetadata(const at::Tensor& t); - // Wrap `weak_self_` in `c10::optional` and split device into components to + // Wrap `weak_self_` in `std::optional` and split device into components to // keep struct default constructable. (which the std::array initializer needs) - c10::optional weak_self_; + std::optional weak_self_; c10::DeviceType device_type_{c10::DeviceType::CPU}; c10::DeviceIndex device_index_{-1}; }; @@ -85,8 +85,8 @@ struct TORCH_API TensorMetadata : public RawTensorMetadataBase { std::vector strides_; // Set during `calculateUniqueTensorIDs`. 
- c10::optional id_; - c10::optional allocation_id_; + std::optional id_; + std::optional allocation_id_; }; using op_input_t = std::variant< @@ -207,8 +207,8 @@ struct ExtraFields : RawAllocation { return {device_type_, device_index_}; } - c10::optional id_; - c10::optional allocation_id_; + std::optional id_; + std::optional allocation_id_; }; template <> @@ -246,7 +246,7 @@ struct NNModuleInfo { struct ParameterInfo { std::string name_; TensorMetadata metadata_; - c10::optional grad_metadata_; + std::optional grad_metadata_; }; PyModuleSelf self_; @@ -261,7 +261,7 @@ struct NNModuleInfo { struct OptimizerInfo { struct ParameterInfo { TensorMetadata metadata_; - c10::optional grad_metadata_; + std::optional grad_metadata_; std::vector> state_; }; @@ -293,8 +293,8 @@ template <> struct ExtraFields : public PyExtraFieldsBase { struct args_t { PyFrameState frame_state_; - c10::optional module_info_; - c10::optional optimizer_info_; + std::optional module_info_; + std::optional optimizer_info_; }; ExtraFields( @@ -308,8 +308,8 @@ struct ExtraFields : public PyExtraFieldsBase { optimizer_{std::move(args.optimizer_info_)} {} PyFrameState callsite_; - c10::optional module_; - c10::optional optimizer_; + std::optional module_; + std::optional optimizer_; }; template <> diff --git a/torch/csrc/profiler/combined_traceback.cpp b/torch/csrc/profiler/combined_traceback.cpp index 1cae103efc77c..c727f58d5284e 100644 --- a/torch/csrc/profiler/combined_traceback.cpp +++ b/torch/csrc/profiler/combined_traceback.cpp @@ -1,4 +1,5 @@ #include +#include namespace torch { @@ -77,7 +78,7 @@ SymbolizedTracebacks symbolize( } // gather symbol names for C++ frames if (!all_cpp_ips.empty()) { - r.all_frames = unwind::symbolize(all_cpp_ips); + r.all_frames = unwind::symbolize(all_cpp_ips, torch::get_symbolize_mode()); } // batch symbolization requests so we dedup frame objects diff --git a/torch/csrc/profiler/data_flow.cpp b/torch/csrc/profiler/data_flow.cpp index e719835d7c2c1..9ea79cdbdb27d 100644 --- a/torch/csrc/profiler/data_flow.cpp +++ b/torch/csrc/profiler/data_flow.cpp @@ -18,8 +18,8 @@ struct RawTensorInfo { bool is_free_; // Used to assign back to the original structs. 
- std::reference_wrapper> allocation_id_ref_; - std::reference_wrapper> id_ref_; + std::reference_wrapper> allocation_id_ref_; + std::reference_wrapper> id_ref_; }; struct RawTensors { @@ -32,7 +32,7 @@ struct RawTensors { t.impl(), t.data_, t.device_, false, t.allocation_id_, t.id_}); } - void operator()(c10::optional& t) { + void operator()(std::optional& t) { if (t.has_value()) { (*this)(*t); } diff --git a/torch/csrc/profiler/orchestration/observer.h b/torch/csrc/profiler/orchestration/observer.h index 4230851607608..b77febb2784ee 100644 --- a/torch/csrc/profiler/orchestration/observer.h +++ b/torch/csrc/profiler/orchestration/observer.h @@ -27,6 +27,7 @@ enum class C10_API_ENUM ProfilerState { CUDA, // CPU + CUDA events NVTX, // only emit NVTX markers ITT, // only emit ITT markers + PRIVATEUSE1, // only emit PRIVATEUSE1 markers KINETO, // use libkineto KINETO_GPU_FALLBACK, // use CUDA events when CUPTI is not available KINETO_PRIVATEUSE1_FALLBACK, // use PrivateUse1 events @@ -39,7 +40,8 @@ enum class C10_API_ENUM ActiveProfilerType { LEGACY, KINETO, NVTX, - ITT + ITT, + PRIVATEUSE1 }; struct TORCH_API ExperimentalConfig { diff --git a/torch/csrc/profiler/python/init.cpp b/torch/csrc/profiler/python/init.cpp index 966bf68d3ee42..9ecfe5824a385 100644 --- a/torch/csrc/profiler/python/init.cpp +++ b/torch/csrc/profiler/python/init.cpp @@ -79,8 +79,7 @@ PyTypeObject THPCapturedTracebackType = { nullptr, /* tp_new */ }; -namespace pybind11 { -namespace detail { +namespace pybind11::detail { template <> struct type_caster> { @@ -107,11 +106,9 @@ struct type_caster> { } }; -} // namespace detail -} // namespace pybind11 +} // namespace pybind11::detail -namespace torch { -namespace profiler { +namespace torch::profiler { /* [NOTE: RecordFunctionFast] * This is an alternate way to call record_function from python. 
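RawTensorInfo in data_flow.cpp above keeps std::reference_wrapper members pointing at optional id fields so they can be "assigned back to the original structs" later. A simplified sketch of that write-back-through-a-reference pattern, with ints in place of the profiler's id types:

    #include <functional>
    #include <iostream>
    #include <optional>
    #include <vector>

    struct Record {
      std::optional<int> id;  // filled in during post-processing
    };

    int main() {
      std::vector<Record> records(3);
      std::vector<std::reference_wrapper<std::optional<int>>> ids;
      for (auto& r : records) ids.emplace_back(r.id);

      int next = 0;
      for (auto& ref : ids) ref.get() = next++;  // assign back via the reference

      std::cout << *records[2].id << '\n';  // 2
    }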
@@ -308,6 +305,7 @@ void initPythonBindings(PyObject* module) { .value("CUDA", ProfilerState::CUDA) .value("NVTX", ProfilerState::NVTX) .value("ITT", ProfilerState::ITT) + .value("PRIVATEUSE1", ProfilerState::PRIVATEUSE1) .value("KINETO", ProfilerState::KINETO) .value("KINETO_GPU_FALLBACK", ProfilerState::KINETO_GPU_FALLBACK) .value( @@ -319,7 +317,8 @@ void initPythonBindings(PyObject* module) { .value("LEGACY", ActiveProfilerType::LEGACY) .value("KINETO", ActiveProfilerType::KINETO) .value("NVTX", ActiveProfilerType::NVTX) - .value("ITT", ActiveProfilerType::ITT); + .value("ITT", ActiveProfilerType::ITT) + .value("PRIVATEUSE1", ActiveProfilerType::PRIVATEUSE1); py::enum_(m, "ProfilerActivity") .value("CPU", ActivityType::CPU) @@ -606,6 +605,33 @@ void initPythonBindings(PyObject* module) { } return py_symbolize(tb_ptrs); }); + // directly convert address pointers to frames, used for testing symbolize + m.def( + "symbolize_addresses", + [](const std::vector& frames, const std::string& mode_s) { + std::vector> frames_out; + torch::unwind::Mode mode = torch::unwind::Mode::addr2line; + if (mode_s == "fast") { + mode = torch::unwind::Mode::fast; + } else if (mode_s == "addr2line") { + mode = torch::unwind::Mode::addr2line; + } else if (mode_s == "dladdr") { + mode = torch::unwind::Mode::dladdr; + } else { + TORCH_CHECK(false, "unexpected mode ", mode_s); + } + std::vector frames_p; + frames_p.reserve(frames.size()); + for (auto f : frames) { + frames_p.push_back((void*)f); // NOLINT + } + auto frame_objects = unwind::symbolize(frames_p, mode); + frames_out.reserve(frame_objects.size()); + for (auto& frame : frame_objects) { + frames_out.emplace_back(frame.filename, frame.lineno, frame.funcname); + } + return frames_out; + }); installCapturedTracebackPython(); // NOLINTNEXTLINE(*-c-arrays*) @@ -639,5 +665,4 @@ void initPythonBindings(PyObject* module) { throw python_error(); } } -} // namespace profiler -} // namespace torch +} // namespace torch::profiler diff --git a/torch/csrc/profiler/standalone/privateuse1_observer.cpp b/torch/csrc/profiler/standalone/privateuse1_observer.cpp new file mode 100644 index 0000000000000..81eb3074fb3ae --- /dev/null +++ b/torch/csrc/profiler/standalone/privateuse1_observer.cpp @@ -0,0 +1,11 @@ +#include + +namespace torch { +namespace profiler { +namespace impl { + +PushPRIVATEUSE1CallbacksStub pushPRIVATEUSE1CallbacksStub; + +} // namespace impl +} // namespace profiler +} // namespace torch diff --git a/torch/csrc/profiler/standalone/privateuse1_observer.h b/torch/csrc/profiler/standalone/privateuse1_observer.h new file mode 100644 index 0000000000000..39259b7444cfb --- /dev/null +++ b/torch/csrc/profiler/standalone/privateuse1_observer.h @@ -0,0 +1,46 @@ +#pragma once +#include + +namespace torch { +namespace profiler { +namespace impl { + +using CallBackFnPtr = void (*)( + const ProfilerConfig& config, + const std::unordered_set& scopes); + +struct PushPRIVATEUSE1CallbacksStub { + PushPRIVATEUSE1CallbacksStub() = default; + PushPRIVATEUSE1CallbacksStub(const PushPRIVATEUSE1CallbacksStub&) = delete; + PushPRIVATEUSE1CallbacksStub& operator=(const PushPRIVATEUSE1CallbacksStub&) = + delete; + + template + void operator()(ArgTypes&&... 
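The new symbolize_addresses binding above maps a mode string onto torch::unwind::Mode, accepting "fast", "addr2line", and "dladdr" and failing hard on anything else. The same dispatch as a standalone helper; the enum here is a stand-in for torch::unwind::Mode and a plain exception replaces TORCH_CHECK.

    #include <iostream>
    #include <stdexcept>
    #include <string>

    enum class Mode { addr2line, fast, dladdr };

    Mode parseMode(const std::string& s) {
      if (s == "fast") return Mode::fast;
      if (s == "addr2line") return Mode::addr2line;
      if (s == "dladdr") return Mode::dladdr;
      throw std::runtime_error("unexpected mode " + s);  // TORCH_CHECK in the binding
    }

    int main() {
      std::cout << static_cast<int>(parseMode("fast")) << '\n';  // 1
    }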
args) { + return (*push_privateuse1_callbacks_fn)(std::forward(args)...); + } + + void set_privateuse1_dispatch_ptr(CallBackFnPtr fn_ptr) { + push_privateuse1_callbacks_fn = fn_ptr; + } + + private: + CallBackFnPtr push_privateuse1_callbacks_fn = nullptr; +}; + +extern TORCH_API struct PushPRIVATEUSE1CallbacksStub + pushPRIVATEUSE1CallbacksStub; + +struct RegisterPRIVATEUSE1Observer { + RegisterPRIVATEUSE1Observer( + PushPRIVATEUSE1CallbacksStub& stub, + CallBackFnPtr value) { + stub.set_privateuse1_dispatch_ptr(value); + } +}; + +#define REGISTER_PRIVATEUSE1_OBSERVER(name, fn) \ + static RegisterPRIVATEUSE1Observer name##__register(name, fn); +} // namespace impl +} // namespace profiler +} // namespace torch diff --git a/torch/csrc/profiler/unwind/action.h b/torch/csrc/profiler/unwind/action.h index e1ed407384fc9..672fffad8c917 100644 --- a/torch/csrc/profiler/unwind/action.h +++ b/torch/csrc/profiler/unwind/action.h @@ -2,6 +2,8 @@ #include #include +namespace torch::unwind { + enum { A_UNDEFINED = 0x0, A_REG_PLUS_DATA = 0x1, // exp = REG[reg] + data0 @@ -53,3 +55,5 @@ struct Action { return out; } }; + +} // namespace torch::unwind diff --git a/torch/csrc/profiler/unwind/communicate.h b/torch/csrc/profiler/unwind/communicate.h index 79c27eaeba7fa..063fe542a3419 100644 --- a/torch/csrc/profiler/unwind/communicate.h +++ b/torch/csrc/profiler/unwind/communicate.h @@ -5,6 +5,7 @@ #include #include +namespace torch::unwind { // helper to open a process with stdin/stdout/stderr streams. struct Communicate { Communicate(const char* command, const char** args) { @@ -63,3 +64,5 @@ struct Communicate { std::unique_ptr out_; std::unique_ptr err_; }; + +} // namespace torch::unwind diff --git a/torch/csrc/profiler/unwind/debug_info.h b/torch/csrc/profiler/unwind/debug_info.h new file mode 100644 index 0000000000000..35c770c24e0c9 --- /dev/null +++ b/torch/csrc/profiler/unwind/debug_info.h @@ -0,0 +1,279 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include + +namespace torch::unwind { + +struct DebugInfo { + DebugInfo(Sections& s) : s_(s) {} + + void parse(uint64_t offset) { + auto L = parseHeader(offset); + parseCompileUnit(L); + } + unwind::optional lineNumberProgramOffset() { + return line_number_program_offset_; + } + uint64_t nextOffset() { + return end_ - s_.debug_info.data; + } + std::vector> ranges() { + if (range_ptr_) { + auto offset = range_ptr_->first; + if (range_ptr_->second == DW_FORM_rnglistx) { + UNWIND_CHECK(rnglists_base_, "rnglistx but not rnglists_base_ set"); + LOG_INFO("index for rnglistx {:x} + {:x}\n", *rnglists_base_, offset); + CheckedLexer L = s_.debug_rnglists.lexer( + *rnglists_base_ + offset * sec_offset_size_); + auto read = readSegmentOffset(L); + offset = *rnglists_base_ + read; + } + return version_ == 4 ? readRanges4(offset) : readRanges5(offset); + } + if (!highpc_) { + return {}; + } + return {{lowpc_, lowpc_ + *highpc_}}; + } + + bool is64bit() { + return is_64bit_; + } + + private: + CheckedLexer parseHeader(uint64_t offset) { + offset_ = offset; + CheckedLexer L = s_.debug_info.lexer(offset_); + std::tie(length_, is_64bit_) = L.readSectionLength(); + sec_offset_size_ = is_64bit_ ? 
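The new privateuse1_observer header above follows a stub-and-register pattern: a global stub object holds a callback pointer, and a small registration struct (wrapped by REGISTER_PRIVATEUSE1_OBSERVER) fills it in at static-initialization time. The sketch below reproduces that shape with placeholder types; it is not the torch::profiler::impl code and the config type is simplified to an int.

    #include <iostream>

    using CallBackFnPtr = void (*)(int config);

    struct PushCallbacksStub {
      void operator()(int config) {
        (*push_fn_)(config);  // assumes a backend registered during static init
      }
      void set(CallBackFnPtr fn) { push_fn_ = fn; }
     private:
      CallBackFnPtr push_fn_ = nullptr;
    };

    PushCallbacksStub pushCallbacksStub;

    struct RegisterObserver {
      RegisterObserver(PushCallbacksStub& stub, CallBackFnPtr fn) { stub.set(fn); }
    };

    // What a PrivateUse1 backend would do via the REGISTER_* macro:
    static void myBackendPushCallbacks(int config) {
      std::cout << "profiler callbacks pushed, config=" << config << '\n';
    }
    static RegisterObserver myBackendObserver(pushCallbacksStub, &myBackendPushCallbacks);

    int main() {
      pushCallbacksStub(42);  // dispatches to myBackendPushCallbacks
    }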
8 : 4; + end_ = (const char*)L.loc() + length_; + version_ = L.read(); + UNWIND_CHECK( + version_ == 5 || version_ == 4, + "unexpected dwarf version {}", + version_); + uint8_t address_size = 0; + if (version_ == 5) { + auto unit_type = L.read(); + UNWIND_CHECK(unit_type == 0x1, "unexpected unit type {}", unit_type); + address_size = L.read(); + debug_abbrev_offset_ = + is_64bit_ ? L.read() : L.read(); + } else { + debug_abbrev_offset_ = + is_64bit_ ? L.read() : L.read(); + address_size = L.read(); + } + LOG_INFO( + "compilation unit at offset {:x} with length {:x} and debug_abbrev_offset {:x}\n", + offset, + length_, + debug_abbrev_offset_); + UNWIND_CHECK( + address_size == 8, + "expected 64-bit dwarf but found address size {}", + address_size); + return L; + } + + uint64_t readSegmentOffset(CheckedLexer& L) { + return s_.readSegmentOffset(L, is_64bit_); + } + + uint64_t readEncoded(CheckedLexer& L, uint64_t encoding) { + switch (encoding) { + case DW_FORM_data8: + case DW_FORM_addr: + return L.read(); + case DW_FORM_data4: + return L.read(); + case DW_FORM_addrx: { + auto idx = L.readULEB128(); + return s_.debug_addr.lexer(address_base_ + sizeof(uint64_t) * idx) + .read(); + } + case DW_FORM_sec_offset: + return readSegmentOffset(L); + case DW_FORM_rnglistx: { + return L.readULEB128(); + } + default: + UNWIND_CHECK(false, "unexpected encoding"); + } + } + + void parseCompileUnit(CheckedLexer& L) { + auto entry = L.readULEB128(); + auto A = findAbbrev(debug_abbrev_offset_, entry); + while (true) { + auto attr = A.readULEB128(); + auto form = A.readULEB128(); + if (attr == 0 && form == 0) { + break; + } + if (form == DW_FORM_implicit_const) { + A.readSLEB128(); + } + if (attr == DW_AT_low_pc) { + lowpc_ = readEncoded(L, form); + LOG_INFO(" lowpc {:x}\n", lowpc_); + } else if (attr == DW_AT_high_pc) { + highpc_ = readEncoded(L, form); + range_ptr_ = std::nullopt; + LOG_INFO(" highpc {:x}\n", *highpc_); + } else if (attr == DW_AT_addr_base) { + UNWIND_CHECK(form == DW_FORM_sec_offset, "unexpected addr_base form"); + address_base_ = readSegmentOffset(L); + LOG_INFO(" address base {:x}\n", address_base_); + } else if (attr == DW_AT_rnglists_base) { + UNWIND_CHECK( + form == DW_FORM_sec_offset, "unexpected rnglists_base form"); + rnglists_base_ = readSegmentOffset(L); + LOG_INFO(" range base {:x}\n", *rnglists_base_); + } else if (form == DW_FORM_string) { + L.readCString(); + } else if (attr == DW_AT_stmt_list) { + UNWIND_CHECK(form == DW_FORM_sec_offset, "unexpected stmt_list form"); + LOG_INFO(" program table offset {:x}\n", *line_number_program_offset_); + line_number_program_offset_ = readSegmentOffset(L); + } else if (form == DW_FORM_exprloc) { + auto sz = L.readULEB128(); + L.skip(int64_t(sz)); + } else if (form == DW_FORM_block1) { + auto sz = L.read(); + L.skip(int64_t(sz)); + } else if (attr == DW_AT_ranges) { + auto range_offset = readEncoded(L, form); + LOG_INFO("setting range_ptr to {:x} {:x}\n", range_offset, form); + range_ptr_.emplace(range_offset, form); + } else if ( + form == DW_FORM_udata || form == DW_FORM_rnglistx || + form == DW_FORM_strx || form == DW_FORM_loclistx || + form == DW_FORM_addrx) { + L.readULEB128(); + } else if (form == DW_FORM_sdata) { + L.readSLEB128(); + } else { + auto sz = formSize(form, sec_offset_size_); + UNWIND_CHECK(sz, "unsupported form in compilation unit {:x}", form); + L.skip(int64_t(*sz)); + } + } + } + + std::vector> readRanges4(uint64_t offset) { + CheckedLexer L = s_.debug_ranges.lexer(offset); + std::vector> ranges; + uint64_t base = 
lowpc_; + while (true) { + auto start = L.read(); + auto end = L.read(); + if (start == 0 && end == 0) { + break; + } + if (start == std::numeric_limits::max()) { + base = end; + } else { + ranges.emplace_back(base + start, base + end); + } + } + return ranges; + } + + std::vector> readRanges5(uint64_t offset) { + CheckedLexer L = s_.debug_rnglists.lexer(offset); + uint64_t base = 0; + LOG_INFO("BEGIN RANGES {:x}\n", offset); + std::vector> ranges; + while (true) { + auto op = L.read(); + switch (op) { + case DW_RLE_end_of_list: + LOG_INFO("END RANGES\n"); + return ranges; + case DW_RLE_base_addressx: { + base = readEncoded(L, DW_FORM_addrx); + LOG_INFO("BASE ADDRX {:x}\n", base); + } break; + case DW_RLE_startx_length: { + auto s = readEncoded(L, DW_FORM_addrx); + auto e = L.readULEB128(); + LOG_INFO("startx_length {:x} {:x}\n", s, e); + ranges.emplace_back(s, s + e); + } break; + case DW_RLE_base_address: + base = L.read(); + LOG_INFO("BASE ADDR {:x}\n", base); + break; + case DW_RLE_offset_pair: { + auto s = L.readULEB128(); + auto e = L.readULEB128(); + LOG_INFO("offset_pair {:x} {:x}\n", s, e); + ranges.emplace_back(base + s, base + e); + } break; + case DW_RLE_start_length: { + auto s = L.read(); + auto e = L.readULEB128(); + LOG_INFO("start_length {:x} {:x}\n", s, e); + ranges.emplace_back(s, s + e); + } break; + default: + UNWIND_CHECK(false, "unknown range op: {}", op); + } + } + } + + CheckedLexer findAbbrev(uint64_t offset, uint64_t entry) { + CheckedLexer L = s_.debug_abbrev.lexer(offset); + while (true) { + auto abbrev_code = L.readULEB128(); + UNWIND_CHECK( + abbrev_code != 0, + "could not find entry {} at offset {:x}", + entry, + offset); + auto tag = L.readULEB128(); + L.read(); // has children + if (abbrev_code == entry) { + UNWIND_CHECK( + tag == DW_TAG_compile_unit, + "first entry was not a compile unit but {}", + tag); + return L; + } + while (true) { + auto attr = L.readULEB128(); + auto form = L.readULEB128(); + if (attr == 0 && form == 0) { + break; + } + if (form == DW_FORM_implicit_const) { + L.readSLEB128(); + } + } + } + } + + Sections& s_; + optional line_number_program_offset_; + uint64_t offset_ = 0; + uint8_t sec_offset_size_ = 0; + uint64_t length_ = 0; + const char* end_ = nullptr; + uint64_t debug_abbrev_offset_ = 0; + bool is_64bit_ = false; + + std::optional> range_ptr_; + uint64_t lowpc_ = 0; + optional highpc_; + uint16_t version_ = 0; + uint64_t address_base_ = 0; + optional rnglists_base_; +}; + +} // namespace torch::unwind diff --git a/torch/csrc/profiler/unwind/dwarf_symbolize_enums.h b/torch/csrc/profiler/unwind/dwarf_symbolize_enums.h new file mode 100644 index 0000000000000..2c229823027d3 --- /dev/null +++ b/torch/csrc/profiler/unwind/dwarf_symbolize_enums.h @@ -0,0 +1,181 @@ +#pragma once +#include +#include +#include + +enum { + DW_TAG_subprogram = 0x2e, + DW_TAG_inlined_subroutine = 0x1d, + DW_TAG_compile_unit = 0x11, + DW_AT_sibling = 0x1, // reference + DW_AT_name = 0x3, // string + DW_AT_stmt_list = 0x10, // lineptr + DW_AT_addr_base = 0x73, // sec_offset + DW_AT_rnglists_base = 0x74, // sec_offset + DW_AT_low_pc = 0x11, // address + DW_AT_high_pc = 0x12, // address + DW_AT_specification = 0x47, // reference + DW_AT_abstract_origin = 0x31, // reference + DW_AT_linkage_name = 0x6e, // string + DW_AT_ranges = 0x55, // rnglist + DW_AT_str_offsets_base = 0x72, // sec_offset + DW_FORM_addr = 0x01, + DW_FORM_block2 = 0x03, + DW_FORM_block4 = 0x04, + DW_FORM_data2 = 0x05, + DW_FORM_data4 = 0x06, + DW_FORM_data8 = 0x07, + DW_FORM_string = 0x08, + 
DW_FORM_block = 0x09, + DW_FORM_block1 = 0x0a, + DW_FORM_data1 = 0x0b, + DW_FORM_flag = 0x0c, + DW_FORM_sdata = 0x0d, + DW_FORM_strp = 0x0e, + DW_FORM_udata = 0x0f, + DW_FORM_ref_addr = 0x10, + DW_FORM_ref1 = 0x11, + DW_FORM_ref2 = 0x12, + DW_FORM_ref4 = 0x13, + DW_FORM_ref8 = 0x14, + DW_FORM_ref_udata = 0x15, + DW_FORM_indirect = 0x16, + DW_FORM_sec_offset = 0x17, + DW_FORM_exprloc = 0x18, + DW_FORM_flag_present = 0x19, + DW_FORM_strx = 0x1a, + DW_FORM_addrx = 0x1b, + DW_FORM_ref_sup4 = 0x1c, + DW_FORM_strp_sup = 0x1d, + DW_FORM_data16 = 0x1e, + DW_FORM_line_strp = 0x1f, + DW_FORM_ref_sig8 = 0x20, + DW_FORM_implicit_const = 0x21, + DW_FORM_loclistx = 0x22, + DW_FORM_rnglistx = 0x23, + DW_FORM_ref_sup8 = 0x24, + DW_FORM_strx1 = 0x25, + DW_FORM_strx2 = 0x26, + DW_FORM_strx3 = 0x27, + DW_FORM_strx4 = 0x28, + DW_FORM_addrx1 = 0x29, + DW_FORM_addrx2 = 0x2a, + DW_FORM_addrx3 = 0x2b, + DW_FORM_addrx4 = 0x2c, + /* GNU Debug Fission extensions. */ + DW_FORM_GNU_addr_index = 0x1f01, + DW_FORM_GNU_str_index = 0x1f02, + DW_FORM_GNU_ref_alt = 0x1f20, /* offset in alternate .debuginfo. */ + DW_FORM_GNU_strp_alt = 0x1f21, /* offset in alternate .debug_str. */ + DW_LNCT_path = 0x1, + DW_LNCT_directory_index = 0x2, + DW_LNS_extended_op = 0x00, + DW_LNE_end_sequence = 0x01, + DW_LNE_set_address = 0x02, + DW_LNS_copy = 0x01, + DW_LNS_advance_pc = 0x02, + DW_LNS_advance_line = 0x03, + DW_LNS_set_file = 0x04, + DW_LNS_const_add_pc = 0x08, + DW_LNS_fixed_advance_pc = 0x09, + DW_RLE_end_of_list = 0x0, + DW_RLE_base_addressx = 0x1, + DW_RLE_startx_endx = 0x2, + DW_RLE_startx_length = 0x3, + DW_RLE_offset_pair = 0x4, + DW_RLE_base_address = 0x5, + DW_RLE_start_end = 0x6, + DW_RLE_start_length = 0x7 +}; + +static torch::unwind::optional formSize( + uint64_t form, + uint8_t sec_offset_size) { + switch (form) { + case DW_FORM_addr: + return sizeof(void*); + case DW_FORM_block2: + case DW_FORM_block4: + return std::nullopt; + case DW_FORM_data2: + return 2; + case DW_FORM_data4: + return 4; + case DW_FORM_data8: + return 8; + case DW_FORM_string: + case DW_FORM_block: + case DW_FORM_block1: + return std::nullopt; + case DW_FORM_data1: + case DW_FORM_flag: + return 1; + case DW_FORM_sdata: + return std::nullopt; + case DW_FORM_strp: + return sec_offset_size; + case DW_FORM_udata: + return std::nullopt; + case DW_FORM_ref_addr: + return sec_offset_size; + case DW_FORM_ref1: + return 1; + case DW_FORM_ref2: + return 2; + case DW_FORM_ref4: + return 4; + case DW_FORM_ref8: + return 8; + case DW_FORM_ref_udata: + case DW_FORM_indirect: + return std::nullopt; + case DW_FORM_sec_offset: + return sec_offset_size; + case DW_FORM_exprloc: + return std::nullopt; + case DW_FORM_flag_present: + return 0; + case DW_FORM_strx: + case DW_FORM_addrx: + return std::nullopt; + case DW_FORM_ref_sup4: + return 4; + case DW_FORM_strp_sup: + return sec_offset_size; + case DW_FORM_data16: + return 16; + case DW_FORM_line_strp: + return sec_offset_size; + case DW_FORM_ref_sig8: + return 8; + case DW_FORM_implicit_const: + return 0; + case DW_FORM_loclistx: + case DW_FORM_rnglistx: + return std::nullopt; + case DW_FORM_ref_sup8: + return 8; + case DW_FORM_strx1: + return 1; + case DW_FORM_strx2: + return 2; + case DW_FORM_strx3: + return 3; + case DW_FORM_strx4: + return 4; + case DW_FORM_addrx1: + return 1; + case DW_FORM_addrx2: + return 2; + case DW_FORM_addrx3: + return 3; + case DW_FORM_addrx4: + return 4; + case DW_FORM_GNU_addr_index: + case DW_FORM_GNU_str_index: + case DW_FORM_GNU_ref_alt: + case DW_FORM_GNU_strp_alt: + default: + 
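A formSize-style table is used to skip attribute values whose DWARF form has a fixed size, while variable-length forms (ULEB/SLEB, blocks, strings) return nullopt and must be parsed instead. Illustrative use only, covering a tiny subset of the form codes listed above:

    #include <cstdint>
    #include <iostream>
    #include <optional>

    std::optional<size_t> formSize(uint64_t form, uint8_t sec_offset_size) {
      switch (form) {
        case 0x05: return 2;                // DW_FORM_data2
        case 0x06: return 4;                // DW_FORM_data4
        case 0x0b: return 1;                // DW_FORM_data1
        case 0x17: return sec_offset_size;  // DW_FORM_sec_offset
        default:   return std::nullopt;     // variable-length: parse, don't skip
      }
    }

    int main() {
      uint64_t skipped = 0;
      for (uint64_t form : {0x05ULL, 0x17ULL, 0x0bULL}) {
        if (auto sz = formSize(form, 8)) {
          skipped += *sz;  // a real lexer would do L.skip(*sz)
        }
      }
      std::cout << "would skip " << skipped << " bytes\n";  // 11
    }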
return std::nullopt; + } +} diff --git a/torch/csrc/profiler/unwind/eh_frame_hdr.h b/torch/csrc/profiler/unwind/eh_frame_hdr.h index 9800166675093..c69c066dae68f 100644 --- a/torch/csrc/profiler/unwind/eh_frame_hdr.h +++ b/torch/csrc/profiler/unwind/eh_frame_hdr.h @@ -7,6 +7,7 @@ // Overview of the format described in // https://refspecs.linuxfoundation.org/LSB_1.3.0/gLSB/gLSB/ehframehdr.html +namespace torch::unwind { struct EHFrameHdr { EHFrameHdr(void* base) : base_(base) { @@ -93,3 +94,5 @@ struct EHFrameHdr { int64_t fde_count_; uint32_t table_size_; }; + +} // namespace torch::unwind diff --git a/torch/csrc/profiler/unwind/fast_symbolizer.h b/torch/csrc/profiler/unwind/fast_symbolizer.h new file mode 100644 index 0000000000000..2c79ed81f5076 --- /dev/null +++ b/torch/csrc/profiler/unwind/fast_symbolizer.h @@ -0,0 +1,108 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace torch::unwind { + +#define UNWIND_WARN(w, ...) \ + do { \ + w.emplace_back(fmt::format(__VA_ARGS__)); \ + LOG_INFO("WARNING: {}\n", w.back()); \ + } while (0); + +struct FastSymbolizer { + FastSymbolizer() = default; + Frame symbolize(const std::string& library, uint64_t offset) { + LOG_INFO("symbolizing {} + 0x{:x}\n", library, offset); + Frame frame; + frame.funcname = "??"; + frame.filename = library; + frame.lineno = offset; + auto s = getOrCreateSections(library); + if (auto e = s->findSubprogramName(offset)) { + frame.funcname = *e; + } else { + UNWIND_WARN( + warnings_, + "failed to find subprogram name for {} 0x{:x}", + library, + offset); + } + if (auto e = findLine(s, offset)) { + frame.filename = e->first; + frame.lineno = e->second; + } else { + UNWIND_WARN( + warnings_, "failed to find file/line for {} 0x{:x}", library, offset); + } + return frame; + } + const std::vector& warnings() { + return warnings_; + } + + private: + void parseDebugInfo(Sections* s) { + uint64_t offset = 0; + while (offset < s->debug_info.size) { + DebugInfo info(*s); + info.parse(offset); + if (auto lnp_offset = info.lineNumberProgramOffset()) { + for (auto r : info.ranges()) { + s->addDebugInfoRange(r.first, r.second, line_number_programs_.size()); + } + line_number_programs_.emplace_back( + std::make_unique(*s, *lnp_offset)); + } + offset = info.nextOffset(); + } + } + Sections* getOrCreateSections(const std::string& library) { + auto it = libraries_.find(library); + if (it == libraries_.end()) { + it = libraries_.insert({library, std::make_unique()}).first; + try { + Sections* s = it->second.get(); + s->parse(library.c_str()); + parseDebugInfo(s); + } catch (UnwindError& err) { + UNWIND_WARN( + warnings_, "failed to parse library {}: {}", library, err.what()); + } + } + return it->second.get(); + } + optional> findLine( + Sections* s, + uint64_t offset) { + if (auto idx = s->findDebugInfoOffset(offset)) { + auto r = line_number_programs_.at(*idx).get(); + try { + r->parse(); + } catch (UnwindError& err) { + UNWIND_WARN( + warnings_, + "failed to read line number program [{:x}] {}", + r->offset(), + err.what()); + } + if (auto e = r->find(offset)) { + return std::make_pair(r->filename(e->file), e->line); + } + } + return std::nullopt; + } + std::unordered_map> libraries_; + std::vector> line_number_programs_; + std::vector warnings_; +}; + +} // namespace torch::unwind diff --git a/torch/csrc/profiler/unwind/fde.h b/torch/csrc/profiler/unwind/fde.h index 5e8cc0baee18f..ea8b4ca94eaea 100644 --- a/torch/csrc/profiler/unwind/fde.h +++ 
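For context, a hypothetical caller of the FastSymbolizer added above: the interface (symbolize(library, offset) returning a Frame with funcname/filename/lineno, plus warnings()) is taken from the new header, but the include path assumes the in-tree location, the library path and offset are made up, and building this standalone outside the torch tree is not guaranteed.

    #include <torch/csrc/profiler/unwind/fast_symbolizer.h>

    #include <cstdint>
    #include <iostream>

    int main() {
      torch::unwind::FastSymbolizer symbolizer;
      // Placeholder (module, offset); the profiler derives these from unwound
      // instruction pointers.
      auto frame = symbolizer.symbolize("/usr/lib/libexample.so", 0x1234);
      std::cout << frame.funcname << " (" << frame.filename << ":" << frame.lineno << ")\n";
      for (const auto& warning : symbolizer.warnings()) {
        std::cout << "warning: " << warning << '\n';
      }
    }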
b/torch/csrc/profiler/unwind/fde.h @@ -7,6 +7,8 @@ #include #include +namespace torch::unwind { + struct TableState { Action cfa; std::array registers; @@ -398,3 +400,5 @@ struct FDE { return strstr(augmentation_string_, s) != nullptr; } }; + +} // namespace torch::unwind diff --git a/torch/csrc/profiler/unwind/lexer.h b/torch/csrc/profiler/unwind/lexer.h index 0c1d33abe4e9e..117df6b9b0286 100644 --- a/torch/csrc/profiler/unwind/lexer.h +++ b/torch/csrc/profiler/unwind/lexer.h @@ -1,19 +1,31 @@ #pragma once -#include -#include +#include +#include +#include #include #include -struct Lexer { - Lexer(void* data, void* base = nullptr) - : next_((const char*)data), base_((int64_t)base) {} +namespace torch::unwind { + +template +struct LexerImpl { + LexerImpl(void* data, void* base = nullptr, void* end = nullptr) + : next_((const char*)data), + base_((int64_t)base), + end_((const char*)end) {} template T read() { T result; + auto end = next_ + sizeof(T); + UNWIND_CHECK( + !checked || end <= end_, + "read out of bounds {} >= {}", + (void*)end, + (void*)end_); memcpy(&result, next_, sizeof(T)); - next_ += sizeof(T); + next_ = end; return result; } @@ -21,7 +33,7 @@ struct Lexer { int64_t readSLEB128() { int64_t Value = 0; unsigned Shift = 0; - uint8_t Byte; + uint8_t Byte = 0; do { Byte = read(); uint64_t Slice = Byte & 0x7f; @@ -29,12 +41,12 @@ struct Lexer { (Shift == 63 && Slice != 0 && Slice != 0x7f)) { throw UnwindError("sleb128 too big for int64"); } - Value |= Slice << Shift; + Value |= int64_t(Slice << Shift); Shift += 7; } while (Byte >= 128); // Sign extend negative numbers if needed. if (Shift < 64 && (Byte & 0x40)) { - Value |= (-1ULL) << Shift; + Value |= int64_t((-1ULL) << Shift); } return Value; } @@ -42,7 +54,7 @@ struct Lexer { uint64_t readULEB128() { uint64_t Value = 0; unsigned Shift = 0; - uint8_t p; + uint8_t p = 0; do { p = read(); uint64_t Slice = p & 0x7f; @@ -56,8 +68,17 @@ struct Lexer { } const char* readCString() { auto result = next_; - next_ += strlen(next_) + 1; - return result; + if (!checked) { + next_ += strlen(next_) + 1; + return result; + } + while (next_ < end_) { + if (*next_++ == '\0') { + return result; + } + } + UNWIND_CHECK( + false, "string is out of bounds {} >= {}", (void*)next_, (void*)end_); } int64_t readEncoded(uint8_t enc) { int64_t r = 0; @@ -81,20 +102,27 @@ struct Lexer { } return readEncoded(enc); } + int64_t read4or8Length() { + return readSectionLength().first; + } + + std::pair readSectionLength() { int64_t length = read(); if (length == 0xFFFFFFFF) { - length = read(); + return std::make_pair(read(), true); } - return length; + return std::make_pair(length, false); } + void* loc() const { return (void*)next_; } - Lexer& skip(int64_t bytes) { + LexerImpl& skip(int64_t bytes) { next_ += bytes; return *this; } + int64_t readEncodedValue(uint8_t enc) { switch (enc & 0xF) { case DW_EH_PE_udata2: @@ -121,4 +149,11 @@ struct Lexer { private: const char* next_; int64_t base_; + const char* end_; }; + +// using Lexer = LexerImpl; +using CheckedLexer = LexerImpl; +using Lexer = LexerImpl; + +} // namespace torch::unwind diff --git a/torch/csrc/profiler/unwind/line_number_program.h b/torch/csrc/profiler/unwind/line_number_program.h new file mode 100644 index 0000000000000..4a1ea281e27d0 --- /dev/null +++ b/torch/csrc/profiler/unwind/line_number_program.h @@ -0,0 +1,325 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace torch::unwind { + +struct LineNumberProgram { + LineNumberProgram(Sections& s, uint64_t 
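readULEB128/readSLEB128 in the lexer above implement standard LEB128 decoding: 7 payload bits per byte, high bit set means another byte follows. A self-contained unsigned decoder using the classic DWARF example value:

    #include <cstdint>
    #include <iostream>
    #include <vector>

    uint64_t readULEB128(const uint8_t*& p) {
      uint64_t value = 0;
      unsigned shift = 0;
      uint8_t byte = 0;
      do {
        byte = *p++;
        value |= uint64_t(byte & 0x7f) << shift;  // low 7 bits are payload
        shift += 7;
      } while (byte & 0x80);                      // high bit: continue
      return value;
    }

    int main() {
      std::vector<uint8_t> bytes = {0xe5, 0x8e, 0x26};  // encodes 624485
      const uint8_t* p = bytes.data();
      std::cout << readULEB128(p) << '\n';              // 624485
    }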
offset) : s_(s), offset_(offset) {} + + uint64_t offset() { + return offset_; + } + void parse() { + if (parsed_) { + return; + } + parsed_ = true; + CheckedLexer L = s_.debug_line.lexer(offset_); + std::tie(length_, is_64bit_) = L.readSectionLength(); + program_end_ = (char*)L.loc() + length_; + auto version = L.read(); + UNWIND_CHECK( + version == 5 || version == 4, + "expected version 4 or 5 but found {}", + version); + if (version == 5) { + auto address_size = L.read(); + UNWIND_CHECK( + address_size == 8, + "expected 64-bit dwarf but found address size {}", + address_size); + segment_selector_size_ = L.read(); + } + header_length_ = is_64bit_ ? L.read() : L.read(); + program_ = L; + program_.skip(int64_t(header_length_)); + minimum_instruction_length_ = L.read(); + maximum_operations_per_instruction_ = L.read(); + default_is_stmt_ = L.read(); + line_base_ = L.read(); + line_range_ = L.read(); + opcode_base_ = L.read(); + UNWIND_CHECK(line_range_ != 0, "line_range_ must be non-zero"); + standard_opcode_lengths_.resize(opcode_base_); + for (size_t i = 1; i < opcode_base_; i++) { + standard_opcode_lengths_[i] = L.read(); + } + // fmt::print("{:x} {:x} {} {} {} {} {}\n", offset_, header_length_, + // minimum_instruction_length_, maximum_operations_per_instruction_, + // line_base_, line_range_, opcode_base_); + uint8_t directory_entry_format_count = L.read(); + + if (version == 5) { + struct Member { + uint64_t content_type; + uint64_t form; + }; + std::vector directory_members; + for (size_t i = 0; i < directory_entry_format_count; i++) { + directory_members.push_back({L.readULEB128(), L.readULEB128()}); + } + uint64_t directories_count = L.readULEB128(); + for (size_t i = 0; i < directories_count; i++) { + for (auto& member : directory_members) { + switch (member.content_type) { + case DW_LNCT_path: { + include_directories_.emplace_back( + s_.readString(L, member.form, is_64bit_, 0)); + } break; + default: { + skipForm(L, member.form); + } break; + } + } + } + + for (auto i : c10::irange(directories_count)) { + (void)i; + LOG_INFO("{} {}\n", i, include_directories_[i]); + } + auto file_name_entry_format_count = L.read(); + std::vector file_members; + for (size_t i = 0; i < file_name_entry_format_count; i++) { + file_members.push_back({L.readULEB128(), L.readULEB128()}); + } + auto files_count = L.readULEB128(); + for (size_t i = 0; i < files_count; i++) { + for (auto& member : file_members) { + switch (member.content_type) { + case DW_LNCT_path: { + file_names_.emplace_back( + s_.readString(L, member.form, is_64bit_, 0)); + } break; + case DW_LNCT_directory_index: { + file_directory_index_.emplace_back(readData(L, member.form)); + UNWIND_CHECK( + file_directory_index_.back() < include_directories_.size(), + "directory index out of range"); + } break; + default: { + skipForm(L, member.form); + } break; + } + } + } + for (auto i : c10::irange(files_count)) { + (void)i; + LOG_INFO("{} {} {}\n", i, file_names_[i], file_directory_index_[i]); + } + } else { + include_directories_.emplace_back(""); // implicit cwd + while (true) { + auto str = L.readCString(); + if (*str == '\0') { + break; + } + include_directories_.emplace_back(str); + } + file_names_.emplace_back(""); + file_directory_index_.emplace_back(0); + while (true) { + auto str = L.readCString(); + if (*str == '\0') { + break; + } + auto directory_index = L.readULEB128(); + L.readULEB128(); // mod_time + L.readULEB128(); // file_length + file_names_.emplace_back(str); + file_directory_index_.push_back(directory_index); + } + } + 
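For context on the readULEB128()/readSLEB128() calls that dominate the header parsing above: DWARF stores most integers as LEB128, a little-endian base-128 varint in which the high bit of each byte marks continuation and signed values are sign-extended from the last group. A minimal Python sketch of the decoding (illustrative only; the byte strings in the asserts are textbook LEB128 examples, not values taken from this diff):

```python
def read_uleb128(buf: bytes, pos: int = 0):
    """Decode an unsigned LEB128 value; returns (value, next_position)."""
    value = 0
    shift = 0
    while True:
        byte = buf[pos]
        pos += 1
        value |= (byte & 0x7F) << shift
        shift += 7
        if byte < 0x80:  # high bit clear terminates the sequence
            return value, pos


def read_sleb128(buf: bytes, pos: int = 0):
    """Decode a signed LEB128 value; returns (value, next_position)."""
    value = 0
    shift = 0
    while True:
        byte = buf[pos]
        pos += 1
        value |= (byte & 0x7F) << shift
        shift += 7
        if byte < 0x80:
            if shift < 64 and (byte & 0x40):  # sign-extend negative values
                value |= -(1 << shift)
            return value, pos


assert read_uleb128(b"\xe5\x8e\x26") == (624485, 3)
assert read_sleb128(b"\x7f") == (-1, 1)
```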
UNWIND_CHECK( + maximum_operations_per_instruction_ == 1, + "maximum_operations_per_instruction_ must be 1"); + UNWIND_CHECK( + minimum_instruction_length_ == 1, + "minimum_instruction_length_ must be 1"); + readProgram(); + } + struct Entry { + uint32_t file = 1; + int64_t line = 1; + }; + unwind::optional find(uint64_t address) { + auto e = program_index_.find(address); + if (!e) { + return std::nullopt; + } + return all_programs_.at(*e).find(address); + } + std::string filename(uint64_t index) { + return fmt::format( + "{}/{}", + include_directories_.at(file_directory_index_.at(index)), + file_names_.at(index)); + } + + private: + void skipForm(CheckedLexer& L, uint64_t form) { + auto sz = formSize(form, is_64bit_ ? 8 : 4); + UNWIND_CHECK(sz, "unsupported form {}", form); + L.skip(int64_t(*sz)); + } + + uint64_t readData(CheckedLexer& L, uint64_t encoding) { + switch (encoding) { + case DW_FORM_data1: + return L.read(); + case DW_FORM_data2: + return L.read(); + case DW_FORM_data4: + return L.read(); + case DW_FORM_data8: + return L.read(); + case DW_FORM_udata: + return L.readULEB128(); + default: + UNWIND_CHECK(false, "unsupported data encoding {}", encoding); + } + } + + void produceEntry() { + if (shadow_) { + return; + } + if (ranges_.size() == 1) { + start_address_ = address_; + } + PRINT_LINE_TABLE( + "{:x}\t{}\t{}\n", address_, filename(entry_.file), entry_.line); + UNWIND_CHECK( + entry_.file < file_names_.size(), + "file index {} > {} entries", + entry_.file, + file_names_.size()); + ranges_.add(address_, entry_, true); + } + void endSequence() { + if (shadow_) { + return; + } + PRINT_LINE_TABLE( + "{:x}\tEND\n", address_, filename(entry_.file), entry_.line); + program_index_.add(start_address_, all_programs_.size(), false); + program_index_.add(address_, std::nullopt, false); + all_programs_.emplace_back(std::move(ranges_)); + ranges_ = RangeTable(); + } + void readProgram() { + while (program_.loc() < program_end_) { + PRINT_INST("{:x}: ", (char*)program_.loc() - (s_.debug_line.data)); + uint8_t op = program_.read(); + if (op >= opcode_base_) { + auto op2 = int64_t(op - opcode_base_); + address_ += op2 / line_range_; + entry_.line += line_base_ + (op2 % line_range_); + PRINT_INST( + "address += {}, line += {}\n", + op2 / line_range_, + line_base_ + (op2 % line_range_)); + produceEntry(); + } else { + switch (op) { + case DW_LNS_extended_op: { + auto len = program_.readULEB128(); + auto extended_op = program_.read(); + switch (extended_op) { + case DW_LNE_end_sequence: { + PRINT_INST("end_sequence\n"); + endSequence(); + entry_ = Entry{}; + } break; + case DW_LNE_set_address: { + address_ = program_.read(); + if (!shadow_) { + PRINT_INST( + "set address {:x} {:x} {:x}\n", + address_, + min_address_, + max_address_); + } + shadow_ = address_ == 0; + } break; + default: { + PRINT_INST("skip extended op {}\n", extended_op); + program_.skip(int64_t(len - 1)); + } break; + } + } break; + case DW_LNS_copy: { + PRINT_INST("copy\n"); + produceEntry(); + } break; + case DW_LNS_advance_pc: { + PRINT_INST("advance pc\n"); + address_ += program_.readULEB128(); + } break; + case DW_LNS_advance_line: { + entry_.line += program_.readSLEB128(); + PRINT_INST("advance line {}\n", entry_.line); + + } break; + case DW_LNS_set_file: { + PRINT_INST("set file\n"); + entry_.file = program_.readULEB128(); + } break; + case DW_LNS_const_add_pc: { + PRINT_INST("const add pc\n"); + address_ += (255 - opcode_base_) / line_range_; + } break; + case DW_LNS_fixed_advance_pc: { + PRINT_INST("fixed advance 
pc\n"); + address_ += program_.read(); + } break; + default: { + PRINT_INST("other {}\n", op); + auto n = standard_opcode_lengths_[op]; + for (int i = 0; i < n; ++i) { + program_.readULEB128(); + } + } break; + } + } + } + PRINT_INST( + "{:x}: end {:x}\n", + ((char*)program_.loc() - s_.debug_line.data), + program_end_ - s_.debug_line.data); + } + + uint64_t address_ = 0; + bool shadow_ = false; + bool parsed_ = false; + Entry entry_ = {}; + std::vector include_directories_; + std::vector file_names_; + std::vector file_directory_index_; + uint8_t segment_selector_size_ = 0; + uint8_t minimum_instruction_length_ = 0; + uint8_t maximum_operations_per_instruction_ = 0; + int8_t line_base_ = 0; + uint8_t line_range_ = 0; + uint8_t opcode_base_ = 0; + bool default_is_stmt_ = false; + CheckedLexer program_ = {nullptr}; + char* program_end_ = nullptr; + uint64_t header_length_ = 0; + uint64_t length_ = 0; + bool is_64bit_ = false; + std::vector standard_opcode_lengths_; + Sections& s_; + uint64_t offset_; + uint64_t start_address_ = 0; + RangeTable program_index_; + std::vector> all_programs_; + RangeTable ranges_; +}; + +} // namespace torch::unwind diff --git a/torch/csrc/profiler/unwind/mem_file.h b/torch/csrc/profiler/unwind/mem_file.h new file mode 100644 index 0000000000000..e82ffeb2cde98 --- /dev/null +++ b/torch/csrc/profiler/unwind/mem_file.h @@ -0,0 +1,150 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace torch::unwind { + +struct Section { + char* data = nullptr; + size_t size = 0; + const char* string(size_t offset) { + return lexer(offset).readCString(); + } + CheckedLexer lexer(size_t offset) { + return CheckedLexer(data + offset, data, data + size); + } +}; + +/// Memory maps a file into the address space read-only, and manages the +/// lifetime of the mapping. Here are a few use cases: +/// 1. Used in the loader to read in initial image, and to inspect +// ELF files for dependencies before callling dlopen. +/// +/// 2. Used in unity to load the elf file. 
+struct MemFile { + explicit MemFile(const char* filename_) + : fd_(open(filename_, O_RDONLY)), + mem_(nullptr), + n_bytes_(0), + name_(filename_) { + UNWIND_CHECK( + fd_ != -1, "failed to open {}: {}", filename_, strerror(errno)); + // NOLINTNEXTLINE + struct stat s; + if (-1 == fstat(fd_, &s)) { + close(fd_); // destructors don't run during exceptions + UNWIND_CHECK(false, "failed to stat {}: {}", filename_, strerror(errno)); + } + n_bytes_ = s.st_size; + UNWIND_CHECK( + n_bytes_ > sizeof(Elf64_Ehdr), "empty shared library: {}", filename_); + mem_ = (char*)mmap(nullptr, n_bytes_, PROT_READ, MAP_SHARED, fd_, 0); + if (MAP_FAILED == mem_) { + close(fd_); + UNWIND_CHECK(false, "failed to mmap {}: {}", filename_, strerror(errno)); + } + ehdr_ = (Elf64_Ehdr*)mem_; +#define ELF_CHECK(cond) UNWIND_CHECK(cond, "not an ELF file: {}", filename_) + ELF_CHECK(ehdr_->e_ident[EI_MAG0] == ELFMAG0); + ELF_CHECK(ehdr_->e_ident[EI_MAG1] == ELFMAG1); + ELF_CHECK(ehdr_->e_ident[EI_MAG2] == ELFMAG2); + ELF_CHECK(ehdr_->e_ident[EI_MAG3] == ELFMAG3); + ELF_CHECK(ehdr_->e_ident[EI_CLASS] == ELFCLASS64); + ELF_CHECK(ehdr_->e_ident[EI_VERSION] == EV_CURRENT); + ELF_CHECK(ehdr_->e_version == EV_CURRENT); + ELF_CHECK(ehdr_->e_machine == EM_X86_64); +#undef ELF_CHECK + UNWIND_CHECK( + ehdr_->e_shoff + sizeof(Elf64_Shdr) * ehdr_->e_shnum <= n_bytes_, + "invalid section header table {} {} {}", + ehdr_->e_shoff + sizeof(Elf64_Shdr) * ehdr_->e_shnum, + n_bytes_, + ehdr_->e_shnum); + shdr_ = (Elf64_Shdr*)(mem_ + ehdr_->e_shoff); + UNWIND_CHECK( + ehdr_->e_shstrndx < ehdr_->e_shnum, "invalid strtab section offset"); + auto& strtab_hdr = shdr_[ehdr_->e_shstrndx]; + strtab_ = getSection(strtab_hdr); + } + + MemFile(const MemFile&) = delete; + MemFile& operator=(const MemFile&) = delete; + [[nodiscard]] const char* data() const { + return (const char*)mem_; + } + + /// Returns whether or not the file descriptor + /// of the underlying file is valid. 
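The constructor above is essentially "mmap the shared object read-only and refuse anything that is not a well-formed x86-64 ELF64 image". The same sanity checks can be sketched in a few lines of Python; the fixed offsets below come from the standard Elf64_Ehdr layout, and a little-endian target is assumed to match the EM_X86_64 restriction:

```python
import mmap
import struct

def elf_section_count(path: str) -> int:
    """Map an ELF64 file read-only and return e_shnum after the same sanity
    checks MemFile performs (magic, 64-bit class, section table in bounds)."""
    with open(path, "rb") as f, mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as m:
        if m[:4] != b"\x7fELF" or m[4] != 2:            # ELFMAG + ELFCLASS64
            raise ValueError(f"not a 64-bit ELF file: {path}")
        (e_shoff,) = struct.unpack_from("<Q", m, 0x28)  # section header table offset
        (e_shnum,) = struct.unpack_from("<H", m, 0x3C)  # number of section headers
        if e_shoff + 64 * e_shnum > len(m):             # sizeof(Elf64_Shdr) == 64
            raise ValueError("section header table out of range")
        return e_shnum
```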
+ int valid() { + return fcntl(fd_, F_GETFD) != -1 || errno != EBADF; + } + + ~MemFile() { + if (mem_) { + munmap((void*)mem_, n_bytes_); + } + if (fd_) { + close(fd_); + } + } + + /// Returns the size of the underlying file defined by the `MemFile` + size_t size() { + return n_bytes_; + } + [[nodiscard]] int fd() const { + return fd_; + } + + Section getSection(const Elf64_Shdr& shdr) { + UNWIND_CHECK(shdr.sh_offset + shdr.sh_size <= n_bytes_, "invalid section"); + return Section{mem_ + shdr.sh_offset, shdr.sh_size}; + } + + Section getSection(const char* name, bool optional) { + for (int i = 0; i < ehdr_->e_shnum; i++) { + if (strcmp(strtab_.string(shdr_[i].sh_name), name) == 0) { + return getSection(shdr_[i]); + } + } + UNWIND_CHECK(optional, "{} has no section {}", name_, name); + return Section{nullptr, 0}; + } + + Section strtab() { + return strtab_; + } + + private: + template + T* load(size_t offset) { + UNWIND_CHECK(offset < n_bytes_, "out of range"); + return (T*)(mem_ + offset); + } + int fd_; + char* mem_; + size_t n_bytes_; + std::string name_; + Elf64_Ehdr* ehdr_; + Elf64_Shdr* shdr_; + Section strtab_ = {nullptr, 0}; +}; + +} // namespace torch::unwind diff --git a/torch/csrc/profiler/unwind/range_table.h b/torch/csrc/profiler/unwind/range_table.h new file mode 100644 index 0000000000000..08cb4f492fb6c --- /dev/null +++ b/torch/csrc/profiler/unwind/range_table.h @@ -0,0 +1,74 @@ +#pragma once +#include +#include +#include +#include +#include +#include + +namespace torch::unwind { +template +struct RangeTable { + RangeTable() { + // guarentee that lower_bound[-1] is always valid + addresses_.push_back(0); + payloads_.emplace_back(std::nullopt); + } + void add(uint64_t address, unwind::optional payload, bool sorted) { + if (addresses_.back() > address) { + UNWIND_CHECK(!sorted, "expected addresses to be sorted"); + sorted_ = false; + } + addresses_.push_back(address); + payloads_.emplace_back(std::move(payload)); + } + unwind::optional find(uint64_t address) { + maybeSort(); + auto it = std::upper_bound(addresses_.begin(), addresses_.end(), address); + return payloads_.at(it - addresses_.begin() - 1); + } + void dump() { + for (size_t i = 0; i < addresses_.size(); i++) { + fmt::print("{} {:x}: {}\n", i, addresses_[i], payloads_[i] ? 
"" : "END"); + } + } + size_t size() const { + return addresses_.size(); + } + uint64_t back() { + maybeSort(); + return addresses_.back(); + } + + private: + void maybeSort() { + if (sorted_) { + return; + } + std::vector indices; + indices.reserve(addresses_.size()); + for (size_t i = 0; i < addresses_.size(); i++) { + indices.push_back(i); + } + std::sort(indices.begin(), indices.end(), [&](uint64_t a, uint64_t b) { + return addresses_[a] < addresses_[b] || + (addresses_[a] == addresses_[b] && + bool(payloads_[a]) < bool(payloads_[b])); + }); + std::vector addresses; + std::vector> payloads; + addresses.reserve(addresses_.size()); + payloads.reserve(addresses_.size()); + for (auto i : indices) { + addresses.push_back(addresses_[i]); + payloads.push_back(payloads_[i]); + } + addresses_ = std::move(addresses); + payloads_ = std::move(payloads); + sorted_ = true; + } + bool sorted_ = true; + std::vector addresses_; + std::vector> payloads_; +}; +} // namespace torch::unwind diff --git a/torch/csrc/profiler/unwind/sections.h b/torch/csrc/profiler/unwind/sections.h new file mode 100644 index 0000000000000..bb984cde9b397 --- /dev/null +++ b/torch/csrc/profiler/unwind/sections.h @@ -0,0 +1,124 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include + +namespace torch::unwind { + +static std::string demangle(const std::string& mangled_name) { + int status = 0; + char* realname = + abi::__cxa_demangle(mangled_name.c_str(), nullptr, nullptr, &status); + if (status == 0) { + std::string demangled_name(realname); + // NOLINTNEXTLINE + free(realname); + return demangled_name; + } else { + return mangled_name; + } +} + +struct Sections { + Sections() = default; + void parse(const char* name) { + library_ = std::make_unique(name); + strtab = library_->getSection(".strtab", false); + + symtab = library_->getSection(".symtab", true); + debug_info = library_->getSection(".debug_info", true); + if (debug_info.size > 0) { + debug_abbrev = library_->getSection(".debug_abbrev", false); + debug_str = library_->getSection(".debug_str", false); + debug_line = library_->getSection(".debug_line", false); + // dwarf 5 + debug_line_str = library_->getSection(".debug_line_str", true); + debug_rnglists = library_->getSection(".debug_rnglists", true); + debug_addr = library_->getSection(".debug_addr", true); + // dwarf 4 + debug_ranges = library_->getSection(".debug_ranges", true); + } + parseSymtab(); + } + + Section debug_info; + Section debug_abbrev; + Section debug_str; + Section debug_line; + Section debug_line_str; + Section debug_rnglists; + Section debug_ranges; + Section debug_addr; + Section symtab; + Section strtab; + + const char* readString( + CheckedLexer& data, + uint64_t encoding, + bool is_64bit, + uint64_t str_offsets_base) { + switch (encoding) { + case DW_FORM_string: { + return data.readCString(); + } + case DW_FORM_strp: { + return debug_str.string(readSegmentOffset(data, is_64bit)); + } + case DW_FORM_line_strp: { + return debug_line_str.string(readSegmentOffset(data, is_64bit)); + } + default: + UNWIND_CHECK(false, "unsupported string encoding {:x}", encoding); + } + } + + uint64_t readSegmentOffset(CheckedLexer& data, bool is_64bit) { + return is_64bit ? 
data.read() : data.read(); + } + + unwind::optional findDebugInfoOffset(uint64_t address) { + return debug_info_offsets_.find(address); + } + size_t compilationUnitCount() { + return debug_info_offsets_.size() / 2; + } + void addDebugInfoRange( + uint64_t start, + uint64_t end, + uint64_t debug_info_offset) { + debug_info_offsets_.add(start, debug_info_offset, false); + debug_info_offsets_.add(end, std::nullopt, false); + } + optional findSubprogramName(uint64_t address) { + if (auto e = symbol_table_.find(address)) { + return demangle(strtab.string(*e)); + } + return std::nullopt; + } + + private: + void parseSymtab() { + auto L = symtab.lexer(0); + char* end = symtab.data + symtab.size; + while (L.loc() < end) { + auto symbol = L.read(); + if (symbol.st_shndx == SHN_UNDEF || + ELF64_ST_TYPE(symbol.st_info) != STT_FUNC) { + continue; + } + symbol_table_.add(symbol.st_value, symbol.st_name, false); + symbol_table_.add(symbol.st_value + symbol.st_size, std::nullopt, false); + } + } + + std::unique_ptr library_; + RangeTable debug_info_offsets_; + RangeTable symbol_table_; +}; + +} // namespace torch::unwind diff --git a/torch/csrc/profiler/unwind/unwind.cpp b/torch/csrc/profiler/unwind/unwind.cpp index f3fbde151b775..74d7877edadf1 100644 --- a/torch/csrc/profiler/unwind/unwind.cpp +++ b/torch/csrc/profiler/unwind/unwind.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #if !defined(__linux__) || !defined(__x86_64__) || !defined(__has_include) || \ !__has_include("ext/stdio_filebuf.h") @@ -11,14 +12,14 @@ std::vector unwind() { "record_context_cpp is not support on non-linux non-x86_64 platforms"); } -c10::optional> libraryFor(void* addr) { +std::optional> libraryFor(void* addr) { TORCH_CHECK( false, "record_context_cpp is not support on non-linux non-x86_64 platforms"); } #ifndef FBCODE_CAFFE2 -std::vector symbolize(const std::vector& frames) { +std::vector symbolize(const std::vector& frames, Mode mode) { TORCH_CHECK( false, "record_context_cpp is not support on non-linux non-x86_64 platforms"); @@ -48,10 +49,15 @@ Stats stats() { #include #include #include +#include #include #include #include +extern "C" void unwind_c(std::vector* result, int64_t rsp, int64_t rbp); +extern "C" void unwind_entry(std::vector* result); + +namespace torch::unwind { struct UpgradeExclusive { UpgradeExclusive(std::shared_lock& rdlock) : rdlock_(rdlock) { @@ -197,7 +203,7 @@ struct UnwindCache { Unwinder unwinder = Unwinder::unknown(); try { unwinder = libraryFor(addr).unwinderFor(addr); - } catch (UnwindError& err) { + } catch (unwind::UnwindError& err) { // because unwinders are cached this will only print // once per frame that cannot be unwound. 
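The RangeTable filled by parseSymtab() above is the workhorse for all of these address lookups: each start address maps to a payload, and a nullopt payload closes the preceding range. A small Python analogue of its upper_bound-minus-one lookup, assuming entries are added in sorted order so the lazy re-sort in the C++ version can be ignored:

```python
import bisect

class RangeTable:
    """Rough Python analogue of torch::unwind::RangeTable: sorted start
    addresses map to payloads, and a None payload marks the end of a range."""

    def __init__(self):
        self.addresses = [0]   # sentinel so find() never underflows
        self.payloads = [None]

    def add(self, address, payload):
        self.addresses.append(address)
        self.payloads.append(payload)

    def find(self, address):
        i = bisect.bisect_right(self.addresses, address) - 1
        return self.payloads[i]


# A symbol covering [0x1000, 0x1080) followed by an unmapped gap:
table = RangeTable()
table.add(0x1000, "my_function")
table.add(0x1080, None)
assert table.find(0x1010) == "my_function"
assert table.find(0x2000) is None
```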
TORCH_WARN("Unsupported unwinding pattern: ", err.what()); @@ -276,53 +282,13 @@ struct UnwindCache { static UnwindCache unwind_cache; static std::shared_timed_mutex cache_mutex_; -extern "C" void unwind_c(std::vector* result, int64_t rsp, int64_t rbp); -extern "C" void unwind_c(std::vector* result, int64_t rsp, int64_t rbp) { - std::shared_lock lock(cache_mutex_); - UnwindState state{}; - // NOLINTNEXTLINE(performance-no-int-to-ptr) - state.rip = *(int64_t*)(rsp); - // +8 because we saved rsp after the return address was already pushed - // to the stack - state.rsp = rsp + 8; - state.rbp = rbp; - unwind_cache.checkRefresh(lock); - while (true) { // unwind for _start sets rip as being undefined - // NOLINTNEXTLINE(performance-no-int-to-ptr) - result->push_back((void*)state.rip); - const Unwinder& uw = unwind_cache.unwinderFor(state.rip, lock); - if (uw.terminator()) { - if (uw.isUnknown()) { - result->push_back(nullptr); - } - break; - } - state = uw.run(state); - } -} - -extern "C" void unwind_entry(std::vector* result); - -// calling convention puts the first three pointer/int64_t arguments in -// rdi rsi rdx (all caller-saved) -// rdi already holds the pointer to the result vector -// we add arguments for current rsp and rbp and then tail call -// into unwind_c -__asm__( - ".global unwind_entry\n" - "unwind_entry:\n" - "mov %rsp, %rsi;\n" - "mov %rbp, %rdx;\n" - "jmp unwind_c;\n"); - -namespace torch::unwind { std::vector unwind() { std::vector frames; unwind_entry(&frames); return frames; } -c10::optional> libraryFor(void* addr) { +std::optional> libraryFor(void* addr) { if (!addr) { return c10::nullopt; } @@ -335,6 +301,15 @@ c10::optional> libraryFor(void* addr) { library_info->name(), (uint64_t)addr - library_info->load_bias()); } +static std::string dladdr_lookup(void* addr) { + Dl_info dlinfo; + std::string funcname = "??"; + if (dladdr(addr, &dlinfo) && dlinfo.dli_sname) { + funcname = demangle(dlinfo.dli_sname); + } + return funcname; +} + struct Symbolizer { Symbolizer() { auto envar = std::getenv("TORCH_ADDR2LINE_BINARY"); @@ -345,9 +320,6 @@ struct Symbolizer { } else { addr2line_binary_ = "addr2line"; // default } - if (torch::get_disable_addr2line()) { - addr2line_binary_ = nullptr; - } } static std::lock_guard guard() { static std::mutex mutex; @@ -367,16 +339,6 @@ struct Symbolizer { frame_map_[addr] = Frame{"??", "", 0}; return; } - if (addr2line_binary_ == nullptr) { - Dl_info dlinfo; - std::string funcname = "??"; - if (dladdr(addr, &dlinfo) && dlinfo.dli_sname) { - funcname = demangle(dlinfo.dli_sname); - } - frame_map_[addr] = Frame{ - maybe_library->first, std::move(funcname), maybe_library->second - 1}; - return; - } has_pending_results_ = true; auto& entry = getOrCreate(maybe_library->first); entry.queried.push_back(addr); @@ -448,23 +410,59 @@ struct Symbolizer { frame_map_[e.queried[e.completed]] = std::move(frame); } } - std::string demangle(const std::string& mangled_name) { - int status = 0; - char* realname = - abi::__cxa_demangle(mangled_name.c_str(), nullptr, nullptr, &status); - if (status == 0) { - std::string demangled_name(realname); - // NOLINTNEXTLINE - free(realname); - return demangled_name; - } else { - return mangled_name; +}; + +static std::vector symbolize_fast( + const std::vector& frames, + Mode mode) { + static std::mutex cache_mutex; + static std::array, 2> frame_maps; + auto& frame_map = frame_maps[mode == Mode::fast ? 
0 : 1]; + + std::vector indices_to_lookup; + std::vector results; + results.reserve(frames.size()); + { + std::lock_guard lock(cache_mutex); + for (auto i : c10::irange(frames.size())) { + void* f = frames.at(i); + auto it = frame_map.find(f); + if (it == frame_map.end()) { + indices_to_lookup.push_back(i); + results.emplace_back(Frame{"??", "??", 0}); + } else { + results.emplace_back(it->second); + } } } -}; + if (!indices_to_lookup.empty()) { + // do symbolizer work + FastSymbolizer symbolizer; + for (auto i : indices_to_lookup) { + void* addr = frames.at(i); + Frame& f = results.at(i); + auto library = libraryFor(frames.at(i)); + if (library) { + if (mode == Mode::fast) { + f = symbolizer.symbolize(library->first, library->second - 1); + } else { + f = Frame{library->first, "??", library->second - 1}; + } + } + if (f.funcname == "??") { + f.funcname = dladdr_lookup(addr); + } + } + std::lock_guard lock(cache_mutex); + for (auto i : indices_to_lookup) { + frame_map.emplace(frames.at(i), results.at(i)); + } + } + return results; +} -#ifndef FBCODE_CAFFE2 -std::vector symbolize(const std::vector& frames) { +static std::vector symbolize_addr2line( + const std::vector& frames) { auto guard = Symbolizer::guard(); Symbolizer& s = Symbolizer::get(); for (auto f : frames) { @@ -477,6 +475,16 @@ std::vector symbolize(const std::vector& frames) { } return results; } + +// fbcode will use llvm symbolize since there is an llvm dependency already +#ifndef FBCODE_CAFFE2 +std::vector symbolize(const std::vector& frames, Mode mode) { + if (mode == Mode::addr2line) { + return symbolize_addr2line(frames); + } else { + return symbolize_fast(frames, mode); + } +} #endif Stats stats() { @@ -484,4 +492,42 @@ Stats stats() { } } // namespace torch::unwind + +extern "C" void unwind_c(std::vector* result, int64_t rsp, int64_t rbp) { + std::shared_lock lock(torch::unwind::cache_mutex_); + torch::unwind::UnwindState state{}; + // NOLINTNEXTLINE(performance-no-int-to-ptr) + state.rip = *(int64_t*)(rsp); + // +8 because we saved rsp after the return address was already pushed + // to the stack + state.rsp = rsp + 8; + state.rbp = rbp; + torch::unwind::unwind_cache.checkRefresh(lock); + while (true) { // unwind for _start sets rip as being undefined + // NOLINTNEXTLINE(performance-no-int-to-ptr) + result->push_back((void*)state.rip); + const torch::unwind::Unwinder& uw = + torch::unwind::unwind_cache.unwinderFor(state.rip, lock); + if (uw.terminator()) { + if (uw.isUnknown()) { + result->push_back(nullptr); + } + break; + } + state = uw.run(state); + } +} + +// calling convention puts the first three pointer/int64_t arguments in +// rdi rsi rdx (all caller-saved) +// rdi already holds the pointer to the result vector +// we add arguments for current rsp and rbp and then tail call +// into unwind_c +__asm__( + ".global unwind_entry\n" + "unwind_entry:\n" + "mov %rsp, %rsi;\n" + "mov %rbp, %rdx;\n" + "jmp unwind_c;\n"); + #endif diff --git a/torch/csrc/profiler/unwind/unwind.h b/torch/csrc/profiler/unwind/unwind.h index 69b27f49e5b79..1c302dfca445f 100644 --- a/torch/csrc/profiler/unwind/unwind.h +++ b/torch/csrc/profiler/unwind/unwind.h @@ -1,11 +1,11 @@ #pragma once #include #include +#include #include #include -namespace torch { -namespace unwind { +namespace torch::unwind { // gather current stack, relatively fast. // gets faster once the cache of program counter locations is warm. 
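The caching in symbolize_fast() above follows the hit/miss split sketched below in Python; resolve() is a hypothetical stand-in for the FastSymbolizer / dladdr work done on a miss, not an API introduced by this diff:

```python
_frame_cache: dict = {}

def symbolize_cached(frames, resolve):
    """Serve symbolization hits from a cache and resolve only the misses,
    backfilling the cache afterwards (the miss path is the expensive one)."""
    results = [_frame_cache.get(addr) for addr in frames]
    for i, hit in enumerate(results):
        if hit is None:                      # cache miss: do the slow lookup
            results[i] = resolve(frames[i])
            _frame_cache[frames[i]] = results[i]
    return results


# Toy resolver standing in for the per-library symbolizer:
print(symbolize_cached([0x1000, 0x2000, 0x1000],
                       lambda addr: ("libfoo.so", hex(addr))))
```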
TORCH_API std::vector unwind(); @@ -16,16 +16,20 @@ struct Frame { uint64_t lineno; }; +enum class Mode { addr2line, fast, dladdr }; + // note: symbolize is really slow // it will launch an addr2line process that has to parse dwarf // information from the libraries that frames point into. // Callers should first batch up all the unique void* pointers // across a number of unwind states and make a single call to // symbolize. -TORCH_API std::vector symbolize(const std::vector& frames); +TORCH_API std::vector symbolize( + const std::vector& frames, + Mode mode); // returns path to the library, and the offset of the addr inside the library -TORCH_API c10::optional> libraryFor( +TORCH_API std::optional> libraryFor( void* addr); struct Stats { @@ -36,5 +40,4 @@ struct Stats { }; Stats stats(); -} // namespace unwind -} // namespace torch +} // namespace torch::unwind diff --git a/torch/csrc/profiler/unwind/unwind_error.h b/torch/csrc/profiler/unwind/unwind_error.h index af2e4dff01090..229c5182c4159 100644 --- a/torch/csrc/profiler/unwind/unwind_error.h +++ b/torch/csrc/profiler/unwind/unwind_error.h @@ -1,6 +1,31 @@ #pragma once +#include +#include #include +namespace torch::unwind { + struct UnwindError : public std::runtime_error { using std::runtime_error::runtime_error; }; + +#define UNWIND_CHECK(cond, fmtstring, ...) \ + do { \ + if (!(cond)) { \ + throw unwind::UnwindError(fmt::format( \ + "{}:{}: " fmtstring, __FILE__, __LINE__, ##__VA_ARGS__)); \ + } \ + } while (0) + +// #define LOG_INFO(...) fmt::print(__VA_ARGS__) +#define LOG_INFO(...) + +// #define PRINT_INST(...) LOG_INFO(__VA_ARGS__) +#define PRINT_INST(...) + +// #define PRINT_LINE_TABLE(...) LOG_INFO(__VA_ARGS__) +#define PRINT_LINE_TABLE(...) + +using c10::optional; // NOLINT + +} // namespace torch::unwind diff --git a/torch/csrc/profiler/unwind/unwind_fb.cpp b/torch/csrc/profiler/unwind/unwind_fb.cpp index 22a805036f699..f40005adae829 100644 --- a/torch/csrc/profiler/unwind/unwind_fb.cpp +++ b/torch/csrc/profiler/unwind/unwind_fb.cpp @@ -5,10 +5,9 @@ #include #include -namespace torch { -namespace unwind { +namespace torch::unwind { -std::vector symbolize(const std::vector& frames) { +std::vector symbolize(const std::vector& frames, Mode mode) { static std::mutex symbolize_mutex; static llvm::symbolize::LLVMSymbolizer symbolizer; static ska::flat_hash_map frame_map_; @@ -38,7 +37,6 @@ std::vector symbolize(const std::vector& frames) { return results; } -} // namespace unwind -} // namespace torch +} // namespace torch::unwind #endif diff --git a/torch/csrc/profiler/unwind/unwinder.h b/torch/csrc/profiler/unwind/unwinder.h index 1d0a30e2f919f..d673f47af8db2 100644 --- a/torch/csrc/profiler/unwind/unwinder.h +++ b/torch/csrc/profiler/unwind/unwinder.h @@ -4,6 +4,8 @@ #include #include +namespace torch::unwind { + struct UnwindState { int64_t rip, rbp, rsp; }; @@ -75,3 +77,5 @@ struct Unwinder { int64_t rbp_off_; bool deref_{false}; }; + +} // namespace torch::unwind diff --git a/torch/csrc/profiler/util.cpp b/torch/csrc/profiler/util.cpp index 22b645c168673..f301596fca813 100644 --- a/torch/csrc/profiler/util.cpp +++ b/torch/csrc/profiler/util.cpp @@ -10,9 +10,7 @@ #include #endif #ifdef USE_DISTRIBUTED -#ifdef USE_C10D #include -#endif // USE_C10D #endif // USE_DISTRIBUTED namespace torch { @@ -20,10 +18,10 @@ namespace profiler { namespace impl { namespace { -c10::optional soft_assert_raises_; +std::optional soft_assert_raises_; } // namespace -void setSoftAssertRaises(c10::optional value) { +void 
setSoftAssertRaises(std::optional value) { soft_assert_raises_ = value; } @@ -337,7 +335,6 @@ std::vector inputTypes(const at::RecordFunction& fn) { // -- NCCL Metadata ----------------------------------------------------------- // ---------------------------------------------------------------------------- #ifdef USE_DISTRIBUTED -#ifdef USE_C10D static constexpr auto kCommsName = "Collective name"; static constexpr auto kDtype = "dtype"; static constexpr auto kInMsgNelems = "In msg nelems"; @@ -352,14 +349,12 @@ static constexpr auto kProcessGroupDesc = "Process Group Description"; static constexpr auto kGroupRanks = "Process Group Ranks"; static constexpr int32_t kTruncatLength = 30; -#endif // USE_C10D #endif // USE_DISTRIBUTED std::unordered_map saveNcclMeta( const at::RecordFunction& fn) { std::unordered_map map; #ifdef USE_DISTRIBUTED -#ifdef USE_C10D auto debugInfo = dynamic_cast( c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::PARAM_COMMS_INFO)); if (debugInfo == nullptr) { @@ -434,7 +429,6 @@ std::unordered_map saveNcclMeta( ", "), groupRanks.back())); } -#endif // USE_C10D #endif // USE_DISTRIBUTED return map; } diff --git a/torch/csrc/profiler/util.h b/torch/csrc/profiler/util.h index e27d4084412c8..c8216c93f41c5 100644 --- a/torch/csrc/profiler/util.h +++ b/torch/csrc/profiler/util.h @@ -38,7 +38,7 @@ namespace torch { namespace profiler { namespace impl { TORCH_API bool softAssertRaises(); -TORCH_API void setSoftAssertRaises(c10::optional value); +TORCH_API void setSoftAssertRaises(std::optional value); TORCH_API void logSoftAssert( const char* func, const char* file, diff --git a/torch/csrc/tensor/python_tensor.cpp b/torch/csrc/tensor/python_tensor.cpp index 4ea523cedc942..8d18180ed9195 100644 --- a/torch/csrc/tensor/python_tensor.cpp +++ b/torch/csrc/tensor/python_tensor.cpp @@ -314,8 +314,8 @@ static void set_default_storage_type(Backend backend, ScalarType dtype) { } static void set_default_tensor_type( - c10::optional backend, - c10::optional dtype) { + std::optional backend, + std::optional dtype) { if (backend.has_value()) { TORCH_CHECK_TYPE( *backend != Backend::Undefined, "default type cannot be undefined"); diff --git a/torch/csrc/utils.h b/torch/csrc/utils.h index 5a610c28d2b1e..7552f6d0c028a 100644 --- a/torch/csrc/utils.h +++ b/torch/csrc/utils.h @@ -206,7 +206,7 @@ bool maybeThrowBackCompatKeepdimWarn(char* func); // NB: This is in torch/csrc/cuda/utils.cpp, for whatever reason #ifdef USE_CUDA -std::vector> +std::vector> THPUtils_PySequence_to_CUDAStreamList(PyObject* obj); #endif diff --git a/torch/csrc/utils/cpp_stacktraces.cpp b/torch/csrc/utils/cpp_stacktraces.cpp index a04342976e613..715271d76c826 100644 --- a/torch/csrc/utils/cpp_stacktraces.cpp +++ b/torch/csrc/utils/cpp_stacktraces.cpp @@ -47,9 +47,31 @@ bool get_cpp_stacktraces_enabled() { return enabled; } -bool get_disable_addr2line() { - static bool disabled = compute_disable_addr2line(); - return disabled; +static torch::unwind::Mode compute_symbolize_mode() { + auto envar_c = std::getenv("TORCH_SYMBOLIZE_MODE"); + if (envar_c) { + std::string envar = envar_c; + if (envar == "dladdr") { + return unwind::Mode::dladdr; + } else if (envar == "addr2line") { + return unwind::Mode::addr2line; + } else if (envar == "fast") { + return unwind::Mode::fast; + } else { + TORCH_CHECK( + false, + "expected {dladdr, addr2line, fast} for TORCH_SYMBOLIZE_MODE, got ", + envar); + } + } else { + return compute_disable_addr2line() ? 
unwind::Mode::dladdr + : unwind::Mode::addr2line; + } +} + +unwind::Mode get_symbolize_mode() { + static unwind::Mode mode = compute_symbolize_mode(); + return mode; } } // namespace torch diff --git a/torch/csrc/utils/cpp_stacktraces.h b/torch/csrc/utils/cpp_stacktraces.h index 30602b0c9b731..8c38e972faf71 100644 --- a/torch/csrc/utils/cpp_stacktraces.h +++ b/torch/csrc/utils/cpp_stacktraces.h @@ -1,8 +1,9 @@ #pragma once #include +#include namespace torch { TORCH_API bool get_cpp_stacktraces_enabled(); -TORCH_API bool get_disable_addr2line(); +TORCH_API torch::unwind::Mode get_symbolize_mode(); } // namespace torch diff --git a/torch/csrc/utils/device_lazy_init.h b/torch/csrc/utils/device_lazy_init.h index b290ae04d792e..4d736898e5359 100644 --- a/torch/csrc/utils/device_lazy_init.h +++ b/torch/csrc/utils/device_lazy_init.h @@ -33,7 +33,7 @@ static inline void maybe_initialize_device(at::Device& device) { } } -static inline void maybe_initialize_device(c10::optional& device) { +static inline void maybe_initialize_device(std::optional& device) { if (!device.has_value()) { return; } diff --git a/torch/csrc/utils/out_types.cpp b/torch/csrc/utils/out_types.cpp index 3d55b9caaf1ca..7e712f2087169 100644 --- a/torch/csrc/utils/out_types.cpp +++ b/torch/csrc/utils/out_types.cpp @@ -7,10 +7,10 @@ namespace utils { // consistent with the out tensor's options void check_out_type_matches( const at::Tensor& result, - c10::optional scalarType, + std::optional scalarType, bool scalarType_is_none, - c10::optional layout, - c10::optional device, + std::optional layout, + std::optional device, bool device_is_none) { if (scalarType_is_none && !layout && device_is_none) { // common case return; diff --git a/torch/csrc/utils/out_types.h b/torch/csrc/utils/out_types.h index 1cab00bc270f2..68bf759f30038 100644 --- a/torch/csrc/utils/out_types.h +++ b/torch/csrc/utils/out_types.h @@ -7,10 +7,10 @@ namespace utils { TORCH_API void check_out_type_matches( const at::Tensor& result, - c10::optional scalarType, + std::optional scalarType, bool scalarType_is_none, - c10::optional layout, - c10::optional device, + std::optional layout, + std::optional device, bool device_is_none); } diff --git a/torch/csrc/utils/python_arg_parser.cpp b/torch/csrc/utils/python_arg_parser.cpp index 9ea90e8911dbd..90c331488e0c9 100644 --- a/torch/csrc/utils/python_arg_parser.cpp +++ b/torch/csrc/utils/python_arg_parser.cpp @@ -267,7 +267,7 @@ static py::object dispatch_on_subclass( PyObject* torch_api_function, bool is_torch_function, const char* torch_function_name_str, - c10::optional maybe_mode_key = + std::optional maybe_mode_key = c10::nullopt) { py::object ret; for (auto& arg : overloaded_args) { @@ -1003,13 +1003,13 @@ std::string FunctionParameter::type_name() const { } } -static inline c10::optional parse_as_integer(const std::string& s) { +static inline std::optional parse_as_integer(const std::string& s) { if (s.empty()) return c10::nullopt; char* str_end = nullptr; long ans = strtol(s.c_str(), &str_end, 0); // *str_end == 0 if the entire string was parsed as an integer. - return (*str_end == 0) ? c10::optional(ans) : c10::nullopt; + return (*str_end == 0) ? 
std::optional(ans) : c10::nullopt; } /* diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h index 7bbef2f622ad6..06c32d52f0172 100644 --- a/torch/csrc/utils/python_arg_parser.h +++ b/torch/csrc/utils/python_arg_parser.h @@ -231,12 +231,12 @@ struct PythonArgs { inline bool has_torch_function(); inline std::string get_func_name(); inline at::Tensor tensor(int i); - inline c10::optional optionalTensor(int i); + inline std::optional optionalTensor(int i); inline at::Scalar scalar(int i); inline at::Scalar scalarWithDefault(int i, const at::Scalar& default_scalar); inline std::vector scalarlist(int i); inline std::vector tensorlist(int i); - inline torch::List> list_of_optional_tensors(int i); + inline torch::List> list_of_optional_tensors(int i); template inline std::array tensorlist_n(int i); inline std::vector intlist(int i); @@ -246,7 +246,7 @@ struct PythonArgs { inline std::vector intlistWithDefault( int i, std::vector default_intlist); - inline c10::optional generator(int i); + inline std::optional generator(int i); inline at::Storage storage(int i); inline at::Storage storage( int i, @@ -257,35 +257,35 @@ struct PythonArgs { inline at::ScalarType scalartypeWithDefault( int i, at::ScalarType default_scalartype); - inline c10::optional scalartypeOptional(int i); - inline c10::optional scalarOptional(int i); - inline c10::optional toInt64Optional(int i); - inline c10::optional toSymIntOptional(int i); - inline c10::optional toBoolOptional(int i); - inline c10::optional toDoubleOptional(int i); + inline std::optional scalartypeOptional(int i); + inline std::optional scalarOptional(int i); + inline std::optional toInt64Optional(int i); + inline std::optional toSymIntOptional(int i); + inline std::optional toBoolOptional(int i); + inline std::optional toDoubleOptional(int i); inline c10::OptionalArray doublelistOptional(int i); inline std::vector doublelist(int i); inline std::vector getDoublelist(int i); inline at::Layout layout(int i); inline at::Layout layoutWithDefault(int i, at::Layout default_layout); - inline c10::optional layoutOptional(int i); + inline std::optional layoutOptional(int i); inline at::Device device(int i); inline at::Device deviceWithDefault(int i, const at::Device& default_device); - inline c10::optional deviceOptional(int i); + inline std::optional deviceOptional(int i); inline at::Dimname dimname(int i); inline std::vector dimnamelist(int i); - inline c10::optional> toDimnameListOptional(int i); + inline std::optional> toDimnameListOptional(int i); inline at::MemoryFormat memoryformat(int i); - inline c10::optional memoryformatOptional(int i); + inline std::optional memoryformatOptional(int i); inline at::QScheme toQScheme(int i); inline std::string string(int i); inline std::string stringWithDefault(int i, const std::string& default_str); - inline c10::optional stringOptional(int i); + inline std::optional stringOptional(int i); inline c10::string_view stringView(int i); inline c10::string_view stringViewWithDefault( int i, const c10::string_view default_str); - inline c10::optional stringViewOptional(int i); + inline std::optional stringViewOptional(int i); inline PyObject* pyobject(int i); inline int64_t toInt64(int i); inline c10::SymInt toSymInt(int i); @@ -300,7 +300,7 @@ struct PythonArgs { inline bool toBool(int i); inline bool toBoolWithDefault(int i, bool default_bool); inline bool isNone(int i); - inline c10::optional toDispatchKeySetOptional(int i); + inline std::optional toDispatchKeySetOptional(int i); private: 
at::Tensor tensor_slow(int i); @@ -393,7 +393,7 @@ inline at::Tensor PythonArgs::tensor(int i) { return tensor_slow(i); } -inline c10::optional PythonArgs::optionalTensor(int i) { +inline std::optional PythonArgs::optionalTensor(int i) { at::Tensor t = tensor(i); // NOLINTNEXTLINE(bugprone-branch-clone) if (t.defined()) { @@ -433,7 +433,7 @@ inline at::Scalar PythonArgs::scalarWithDefault( return scalar_slow(i); } -inline c10::optional PythonArgs::scalarOptional(int i) { +inline std::optional PythonArgs::scalarOptional(int i) { if (!args[i]) return c10::nullopt; return scalar_slow(i); @@ -457,15 +457,15 @@ inline std::vector PythonArgs::tensorlist(int i) { return res; } -inline torch::List> PythonArgs:: +inline torch::List> PythonArgs:: list_of_optional_tensors(int i) { if (!args[i]) - return torch::List>(); + return torch::List>(); auto tuple = six::isTuple(args[i]); THPObjectPtr arg = six::maybeAsTuple(args[i]); // NOLINTNEXTLINE(bugprone-branch-clone) auto size = tuple ? PyTuple_GET_SIZE(arg.get()) : PyList_GET_SIZE(arg.get()); - torch::List> res; + torch::List> res; res.reserve(size); for (const auto idx : c10::irange(size)) { PyObject* obj = tuple ? PyTuple_GET_ITEM(arg.get(), idx) @@ -729,7 +729,7 @@ inline std::vector PythonArgs::doublelist(int i) { return this->getDoublelist(i); } -inline c10::optional PythonArgs::toDispatchKeySetOptional( +inline std::optional PythonArgs::toDispatchKeySetOptional( int i) { if (!args[i]) { return {}; @@ -769,7 +769,7 @@ inline at::ScalarType PythonArgs::scalartype(int i) { return toScalarType(obj); } -inline c10::optional PythonArgs::scalartypeOptional(int i) { +inline std::optional PythonArgs::scalartypeOptional(int i) { if (!args[i]) return c10::nullopt; return scalartype(i); @@ -794,7 +794,7 @@ inline at::Layout PythonArgs::layoutWithDefault( return layout(i); } -inline c10::optional PythonArgs::layoutOptional(int i) { +inline std::optional PythonArgs::layoutOptional(int i) { if (!args[i]) return c10::nullopt; return layout(i); @@ -835,7 +835,7 @@ inline at::Device PythonArgs::deviceWithDefault( return device(i); } -inline c10::optional PythonArgs::deviceOptional(int i) { +inline std::optional PythonArgs::deviceOptional(int i) { if (!args[i]) return c10::nullopt; return device(i); @@ -860,7 +860,7 @@ inline std::vector parseDimnameList(PyObject* arg) { return res; } -inline c10::optional> PythonArgs:: +inline std::optional> PythonArgs:: toDimnameListOptional(int i) { if (!args[i]) return c10::nullopt; @@ -888,7 +888,7 @@ inline at::MemoryFormat PythonArgs::memoryformat(int i) { return memory_format->memory_format; } -inline c10::optional PythonArgs::memoryformatOptional(int i) { +inline std::optional PythonArgs::memoryformatOptional(int i) { if (!args[i]) return c10::nullopt; return memoryformat(i); @@ -916,7 +916,7 @@ inline std::string PythonArgs::stringWithDefault( return THPUtils_unpackString(args[i]); } -inline c10::optional PythonArgs::stringOptional(int i) { +inline std::optional PythonArgs::stringOptional(int i) { if (!args[i]) return c10::nullopt; return THPUtils_unpackString(args[i]); @@ -934,7 +934,7 @@ inline c10::string_view PythonArgs::stringViewWithDefault( return THPUtils_unpackStringView(args[i]); } -inline c10::optional PythonArgs::stringViewOptional(int i) { +inline std::optional PythonArgs::stringViewOptional(int i) { if (!args[i]) return c10::nullopt; return THPUtils_unpackStringView(args[i]); @@ -988,26 +988,26 @@ inline int64_t PythonArgs::toInt64WithDefault(int i, int64_t default_int) { return toInt64(i); } -inline 
c10::optional PythonArgs::toInt64Optional(int i) { +inline std::optional PythonArgs::toInt64Optional(int i) { if (!args[i]) return c10::nullopt; return toInt64(i); } -inline c10::optional PythonArgs::toSymIntOptional(int i) { +inline std::optional PythonArgs::toSymIntOptional(int i) { if (!args[i]) return c10::nullopt; return toSymInt(i); } -inline c10::optional PythonArgs::toBoolOptional(int i) { +inline std::optional PythonArgs::toBoolOptional(int i) { if (!args[i]) { return c10::nullopt; } return toBool(i); } -inline c10::optional PythonArgs::toDoubleOptional(int i) { +inline std::optional PythonArgs::toDoubleOptional(int i) { if (!args[i]) { return c10::nullopt; } @@ -1069,7 +1069,7 @@ inline bool PythonArgs::isNone(int i) { return args[i] == nullptr; } -inline c10::optional PythonArgs::generator(int i) { +inline std::optional PythonArgs::generator(int i) { if (!args[i]) return c10::nullopt; return reinterpret_cast(args[i])->cdata; diff --git a/torch/csrc/utils/python_compat.h b/torch/csrc/utils/python_compat.h index 73b991cf3fbfc..b060db00db733 100644 --- a/torch/csrc/utils/python_compat.h +++ b/torch/csrc/utils/python_compat.h @@ -11,6 +11,7 @@ extern "C" { #define IS_PYTHON_3_11_PLUS PY_VERSION_HEX >= 0x030B00C1 #define IS_PYTHON_3_12_PLUS PY_VERSION_HEX >= 0x030C0000 +#define IS_PYTHON_3_13_PLUS PY_VERSION_HEX >= 0x030D0000 PYCAPI_COMPAT_STATIC_INLINE(int) PyCode_GetNCellvars(PyCodeObject* code) { @@ -32,6 +33,9 @@ PyCode_GetNFreevars(PyCodeObject* code) { #endif } +// Provided by CPython but getting the header for them is very hard +extern void _PyWeakref_ClearRef(PyWeakReference* self); + #ifdef __cplusplus } #endif diff --git a/torch/csrc/utils/python_dispatch.cpp b/torch/csrc/utils/python_dispatch.cpp index a3e71a2542e3d..e370923b398d8 100644 --- a/torch/csrc/utils/python_dispatch.cpp +++ b/torch/csrc/utils/python_dispatch.cpp @@ -826,7 +826,7 @@ void initDispatchBindings(PyObject* module) { m.def( "_parse_dispatch_key", - [](const char* dispatch_key) -> c10::optional { + [](const char* dispatch_key) -> std::optional { try { return c10::parseDispatchKey(dispatch_key); } catch (const c10::Error& err) { diff --git a/torch/csrc/utils/python_raii.h b/torch/csrc/utils/python_raii.h index 70a5ddfeb55ee..411e558715e8b 100644 --- a/torch/csrc/utils/python_raii.h +++ b/torch/csrc/utils/python_raii.h @@ -22,7 +22,7 @@ struct RAIIContextManager { } private: - c10::optional guard_; + std::optional guard_; std::tuple args_; }; @@ -55,7 +55,7 @@ struct DeprecatedRAIIContextManager { } private: - c10::optional guard_; + std::optional guard_; std::tuple args_; }; diff --git a/torch/csrc/utils/python_symnode.h b/torch/csrc/utils/python_symnode.h index c4814930507bf..f8c710cf6579f 100644 --- a/torch/csrc/utils/python_symnode.h +++ b/torch/csrc/utils/python_symnode.h @@ -140,7 +140,7 @@ class PythonSymNodeImpl : public c10::SymNodeImpl { return getPyObj().attr("int_")().cast(); } - c10::optional maybe_as_int() override { + std::optional maybe_as_int() override { py::gil_scoped_acquire acquire; const auto& r = getPyObj().attr("maybe_as_int")(); if (r.is_none()) { diff --git a/torch/csrc/utils/schema_info.cpp b/torch/csrc/utils/schema_info.cpp index 56e1c6b4a6be2..0caa5b254d279 100644 --- a/torch/csrc/utils/schema_info.cpp +++ b/torch/csrc/utils/schema_info.cpp @@ -6,7 +6,7 @@ namespace utils { void SchemaInfo::addArgumentValue( const std::string& name, const at::IValue& value) { - c10::optional index = schema_.argumentIndexWithName(name); + std::optional index = 
schema_.argumentIndexWithName(name); TORCH_INTERNAL_ASSERT( index != c10::nullopt, "Schema has no argument named ", name); value_map_[name] = value; @@ -14,7 +14,7 @@ void SchemaInfo::addArgumentValue( } void SchemaInfo::addArgumentValues( - const std::vector>& value_list) { + const std::vector>& value_list) { TORCH_INTERNAL_ASSERT( value_list.size() <= schema_.arguments().size(), "Schema does not have enough arguments for value list"); @@ -106,7 +106,7 @@ bool SchemaInfo::has_argument(c10::string_view name) { } bool SchemaInfo::is_mutable(c10::string_view name) { - c10::optional index = schema_.argumentIndexWithName(name); + std::optional index = schema_.argumentIndexWithName(name); TORCH_INTERNAL_ASSERT( index.has_value(), "Schema has no argument named ", name); @@ -144,10 +144,10 @@ bool SchemaInfo::may_alias( if (basic_check) { return true; } - c10::optional lhsAliasTypeSet = + std::optional lhsAliasTypeSet = schema_.mapTypeToAliasTypeSet( schema_.getCorrectList(lhs.type)[lhs.index].type()); - c10::optional rhsAliasTypeSet = + std::optional rhsAliasTypeSet = schema_.mapTypeToAliasTypeSet( schema_.getCorrectList(rhs.type)[rhs.index].type()); bool types_can_alias = @@ -205,10 +205,10 @@ bool SchemaInfo::may_contain_alias( bool SchemaInfo::mayContainAliasImpl( const c10::SchemaArgument& lhs, const c10::SchemaArgument& rhs) { - c10::optional lhsContainedAliasTypeSet = + std::optional lhsContainedAliasTypeSet = schema_.getAliasTypeSetContainedTypes(schema_.mapTypeToAliasTypeSet( schema_.getCorrectList(lhs.type)[lhs.index].type())); - c10::optional rhsAliasTypeSet = + std::optional rhsAliasTypeSet = schema_.mapTypeToAliasTypeSet( schema_.getCorrectList(rhs.type)[rhs.index].type()); bool types_can_alias = @@ -339,7 +339,7 @@ void SchemaInfo::initSchemaInfo() { } } } - c10::optional contained_types = + std::optional contained_types = schema_.getAliasTypeSetContainedTypes( schema_.mapTypeToAliasTypeSet(argument.type())); if (contained_types && !contained_types->empty()) { diff --git a/torch/csrc/utils/schema_info.h b/torch/csrc/utils/schema_info.h index 461f5a6f0427b..acda1bffc1538 100644 --- a/torch/csrc/utils/schema_info.h +++ b/torch/csrc/utils/schema_info.h @@ -61,7 +61,7 @@ struct TORCH_API SchemaInfo { void addArgumentValue(const std::string& name, const at::IValue& value); void addArgumentValues( - const std::vector>& value_list); + const std::vector>& value_list); void addArgumentValues( const std::unordered_map& values); diff --git a/torch/csrc/utils/tensor_new.cpp b/torch/csrc/utils/tensor_new.cpp index e1755b5b36248..4fd398d1a8faf 100644 --- a/torch/csrc/utils/tensor_new.cpp +++ b/torch/csrc/utils/tensor_new.cpp @@ -42,7 +42,7 @@ using at::ScalarType; using at::Storage; using at::Tensor; using at::TensorOptions; -using c10::optional; +using std::optional; namespace torch::utils { namespace { @@ -53,7 +53,7 @@ thread_local bool kOnlyLiftCPUTensors = false; TensorOptions build_options( c10::TensorOptions options, at::ScalarType scalar_type, - const c10::optional& device = c10::nullopt) { + const std::optional& device = c10::nullopt) { options = options.dtype(scalar_type); if (device.has_value()) { return options.device(device); @@ -172,7 +172,7 @@ ScalarType infer_scalar_type(PyObject* obj) { Py_TYPE(obj)->tp_name, "'"); if (PySequence_Check(obj)) { - c10::optional scalarType; + std::optional scalarType; auto length = PySequence_Length(obj); if (length < 0) throw python_error(); @@ -290,7 +290,7 @@ void recursive_store( Tensor internal_new_from_data( c10::TensorOptions options, 
at::ScalarType scalar_type, - c10::optional device_opt, + std::optional device_opt, PyObject* data, bool copy_variables, bool copy_numpy, @@ -489,7 +489,7 @@ Tensor internal_new_from_data( Tensor new_from_data_copy( c10::TensorOptions options, at::ScalarType scalar_type, - c10::optional device, + std::optional device, PyObject* data) { return internal_new_from_data( options, @@ -504,7 +504,7 @@ Tensor new_from_data_copy( Tensor legacy_new_from_sequence( c10::TensorOptions options, at::ScalarType scalar_type, - c10::optional device, + std::optional device, PyObject* data) { TORCH_CHECK_TYPE( PySequence_Check(data), @@ -570,7 +570,7 @@ void check_base_legacy_new( // TODO: Make this accept options instead of dispatch key void check_legacy_ctor_device( c10::DispatchKey dispatch_key, - c10::optional device) { + std::optional device) { if (device.has_value()) { TORCH_CHECK( dispatchKeyToDeviceType(dispatch_key) == device.value().type(), @@ -833,7 +833,7 @@ Tensor legacy_tensor_new( Tensor indexing_tensor_from_data( c10::TensorOptions options, at::ScalarType scalar_type, - c10::optional device, + std::optional device, PyObject* data) { // Specific to tensor indexing, converts an indexing list to an // indexing tensor (type Byte or Long) @@ -877,7 +877,7 @@ static Tensor sparse_compressed_tensor_ctor_worker( c10::DispatchKey dispatch_key, at::ScalarType scalar_type, PythonArgs& r, - c10::optional required_layout) { + std::optional required_layout) { TORCH_INTERNAL_ASSERT(!isSparseCsr(dispatchKeyToBackend(dispatch_key))); TORCH_INTERNAL_ASSERT(!isSparse(dispatchKeyToBackend(dispatch_key))); enum { @@ -971,7 +971,7 @@ static Tensor sparse_compressed_tensor_ctor_worker( /*copy_variables=*/false, /*copy_numpy=*/true, /*type_inference=*/true); - c10::optional layout = + std::optional layout = (required_layout ? r.layoutWithDefault(ARG_LAYOUT, required_layout.value()) : r.layoutOptional(ARG_LAYOUT)); @@ -1027,7 +1027,7 @@ static Tensor sparse_compressed_tensor_ctor_worker( /*copy_variables=*/false, /*copy_numpy=*/true, /*type_inference=*/true); - c10::optional layout = + std::optional layout = (required_layout ? 
r.layoutWithDefault(ARG_LAYOUT1, required_layout.value()) : r.layoutOptional(ARG_LAYOUT1)); @@ -1054,7 +1054,7 @@ Tensor sparse_compressed_tensor_ctor( c10::DispatchKey dispatch_key, at::ScalarType scalar_type, PythonArgs& r) { - c10::optional required_layout{}; + std::optional required_layout{}; return sparse_compressed_tensor_ctor_worker( "sparse_compressed_tensor", dispatch_key, @@ -1067,7 +1067,7 @@ Tensor sparse_csr_tensor_ctor( c10::DispatchKey dispatch_key, at::ScalarType scalar_type, PythonArgs& r) { - c10::optional required_layout(c10::Layout::SparseCsr); + std::optional required_layout(c10::Layout::SparseCsr); return sparse_compressed_tensor_ctor_worker( "sparse_csr_tensor", dispatch_key, scalar_type, r, required_layout); } @@ -1076,7 +1076,7 @@ Tensor sparse_csc_tensor_ctor( c10::DispatchKey dispatch_key, at::ScalarType scalar_type, PythonArgs& r) { - c10::optional required_layout(c10::Layout::SparseCsc); + std::optional required_layout(c10::Layout::SparseCsc); return sparse_compressed_tensor_ctor_worker( "sparse_csc_tensor", dispatch_key, scalar_type, r, required_layout); } @@ -1085,7 +1085,7 @@ Tensor sparse_bsr_tensor_ctor( c10::DispatchKey dispatch_key, at::ScalarType scalar_type, PythonArgs& r) { - c10::optional required_layout(c10::Layout::SparseBsr); + std::optional required_layout(c10::Layout::SparseBsr); return sparse_compressed_tensor_ctor_worker( "sparse_bsr_tensor", dispatch_key, scalar_type, r, required_layout); } @@ -1094,7 +1094,7 @@ Tensor sparse_bsc_tensor_ctor( c10::DispatchKey dispatch_key, at::ScalarType scalar_type, PythonArgs& r) { - c10::optional required_layout(c10::Layout::SparseBsc); + std::optional required_layout(c10::Layout::SparseBsc); return sparse_compressed_tensor_ctor_worker( "sparse_bsc_tensor", dispatch_key, scalar_type, r, required_layout); } @@ -1660,9 +1660,9 @@ Tensor tensor_fromDLPack(PyObject* data) { Tensor asarray( PyObject* obj, - c10::optional dtype, - c10::optional device, - c10::optional copy, + std::optional dtype, + std::optional device, + std::optional copy, bool requires_grad) { Tensor tensor; diff --git a/torch/csrc/utils/tensor_new.h b/torch/csrc/utils/tensor_new.h index a1c34bd448882..70a4fbca0bac3 100644 --- a/torch/csrc/utils/tensor_new.h +++ b/torch/csrc/utils/tensor_new.h @@ -44,7 +44,7 @@ at::Tensor legacy_tensor_new( at::Tensor indexing_tensor_from_data( c10::TensorOptions options, at::ScalarType scalar_type, - c10::optional device, + std::optional device, PyObject* data); at::Tensor sparse_coo_tensor_ctor( c10::DispatchKey dispatch_key, @@ -130,9 +130,9 @@ at::Tensor tensor_frombuffer( at::Tensor tensor_fromDLPack(PyObject* data); at::Tensor asarray( PyObject* obj, - c10::optional dtype, - c10::optional device, - c10::optional copy, + std::optional dtype, + std::optional device, + std::optional copy, bool requires_grad); } // namespace utils } // namespace torch diff --git a/torch/csrc/utils/tensor_numpy.cpp b/torch/csrc/utils/tensor_numpy.cpp index a94ed7783dfd5..9b07b9d32f1c0 100644 --- a/torch/csrc/utils/tensor_numpy.cpp +++ b/torch/csrc/utils/tensor_numpy.cpp @@ -473,7 +473,7 @@ at::Tensor tensor_from_cuda_array_interface(PyObject* obj) { } } - const auto target_device = [&]() -> c10::optional { + const auto target_device = [&]() -> std::optional { // note(crcrpar): zero-size arrays come with nullptr. 
// ref: // https://numba.readthedocs.io/en/stable/cuda/cuda_array_interface.html#cuda-array-interface-version-3 diff --git a/torch/csrc/utils/torch_dispatch_mode.h b/torch/csrc/utils/torch_dispatch_mode.h index 79173aeb3e007..d1c1392e37d63 100644 --- a/torch/csrc/utils/torch_dispatch_mode.h +++ b/torch/csrc/utils/torch_dispatch_mode.h @@ -35,7 +35,7 @@ struct StashTorchDispatchModeGuard { private: std::shared_ptr saved_mode_; - c10::optional saved_mode_key_; + std::optional saved_mode_key_; }; struct StashTorchDispatchStackGuard { diff --git a/torch/csrc/utils/variadic.h b/torch/csrc/utils/variadic.h index 9c021d9f5cd3d..78ffe29971423 100644 --- a/torch/csrc/utils/variadic.h +++ b/torch/csrc/utils/variadic.h @@ -18,7 +18,7 @@ struct CountTensors : IterArgs { void operator()(const at::Tensor& x) { out += 1; } - void operator()(const c10::optional& x) { + void operator()(const std::optional& x) { out += x.has_value(); } void operator()(at::ArrayRef xs) { diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py index 1344de8b9fde9..b8929179fd4a3 100644 --- a/torch/cuda/__init__.py +++ b/torch/cuda/__init__.py @@ -154,6 +154,14 @@ def _sleep(cycles): torch._C._cuda_sleep(cycles) +def _extract_arch_version(arch_string: str): + """Extracts the architecture string from a CUDA version""" + base = arch_string.split("_")[1] + if base.endswith("a"): + base = base[:-1] + return int(base) + + def _check_capability(): incorrect_binary_warn = """ Found GPU%d %s which requires CUDA_VERSION >= %d to @@ -177,7 +185,7 @@ def _check_capability(): name = get_device_name(d) current_arch = major * 10 + minor min_arch = min( - (int(arch.split("_")[1]) for arch in torch.cuda.get_arch_list()), + (_extract_arch_version(arch) for arch in torch.cuda.get_arch_list()), default=35, ) if current_arch < min_arch: @@ -198,7 +206,7 @@ def _check_cubins(): arch_list = get_arch_list() if len(arch_list) == 0: return - supported_sm = [int(arch.split("_")[1]) for arch in arch_list if "sm_" in arch] + supported_sm = [_extract_arch_version(arch) for arch in arch_list if "sm_" in arch] for idx in range(device_count()): cap_major, cap_minor = get_device_capability(idx) # NVIDIA GPU compute architectures are backward compatible within major version diff --git a/torch/cuda/streams.py b/torch/cuda/streams.py index 22d541f4e2879..d361213815865 100644 --- a/torch/cuda/streams.py +++ b/torch/cuda/streams.py @@ -36,7 +36,7 @@ def __new__(cls, device=None, priority=0, **kwargs): with torch.cuda.device(device): return super().__new__(cls, priority=priority, **kwargs) - def wait_event(self, event): + def wait_event(self, event) -> None: r"""Make all future work submitted to the stream wait for an event. Args: @@ -53,7 +53,7 @@ def wait_event(self, event): """ event.wait(self) - def wait_stream(self, stream): + def wait_stream(self, stream) -> None: r"""Synchronize with another stream. All future work submitted to this stream will wait until all kernels @@ -82,7 +82,7 @@ def record_event(self, event=None): event.record(self) return event - def query(self): + def query(self) -> bool: r"""Check if all the work submitted has been completed. Returns: @@ -90,7 +90,7 @@ def query(self): """ return super().query() - def synchronize(self): + def synchronize(self) -> None: r"""Wait for all the kernels in this stream to complete. .. 
note:: This is a wrapper around ``cudaStreamSynchronize()``: see @@ -102,7 +102,7 @@ def synchronize(self): def _as_parameter_(self): return ctypes.c_void_p(self.cuda_stream) - def __eq__(self, o): + def __eq__(self, o) -> bool: if isinstance(o, Stream): return super().__eq__(o) return False @@ -128,7 +128,7 @@ class ExternalStream(Stream): stream_ptr(int): Integer representation of the `cudaStream_t` value. allocated externally. device(torch.device or int, optional): the device where the stream - was originally allocated. if device is specified incorrectly, + was originally allocated. If device is specified incorrectly, subsequent launches using this stream may fail. """ @@ -183,7 +183,7 @@ def record(self, stream=None): stream = torch.cuda.current_stream() super().record(stream) - def wait(self, stream=None): + def wait(self, stream=None) -> None: r"""Make all future work submitted to the given stream wait for this event. Use ``torch.cuda.current_stream()`` if no stream is specified. @@ -212,7 +212,7 @@ def elapsed_time(self, end_event): """ return super().elapsed_time(end_event) - def synchronize(self): + def synchronize(self) -> None: r"""Wait for the event to complete. Waits until the completion of all work currently captured in this event. @@ -234,7 +234,7 @@ def ipc_handle(self): def _as_parameter_(self): return ctypes.c_void_p(self.cuda_event) - def __repr__(self): + def __repr__(self) -> str: if self.cuda_event: return f"" else: diff --git a/torch/custom_class_detail.h b/torch/custom_class_detail.h index 736d5aacdaa32..e27721c349864 100644 --- a/torch/custom_class_detail.h +++ b/torch/custom_class_detail.h @@ -61,7 +61,7 @@ struct arg { // IValue's default constructor makes it None, which is not distinguishable // from an actual, user-provided default value that is None. This boolean // helps distinguish between the two cases. - c10::optional value_; + std::optional value_; }; namespace detail { diff --git a/torch/distributed/__init__.py b/torch/distributed/__init__.py index 47e0e78a6be27..eb7a690fa9589 100644 --- a/torch/distributed/__init__.py +++ b/torch/distributed/__init__.py @@ -127,6 +127,7 @@ def breakpoint(rank: int = 0): ) from .remote_device import _remote_device + from .device_mesh import init_device_mesh, DeviceMesh set_debug_level_from_env() diff --git a/torch/distributed/_tensor/__init__.py b/torch/distributed/_tensor/__init__.py index 3e5e628b0522b..f7afe41e753c5 100644 --- a/torch/distributed/_tensor/__init__.py +++ b/torch/distributed/_tensor/__init__.py @@ -10,6 +10,8 @@ from torch.distributed._tensor.ops.utils import normalize_to_torch_size from torch.distributed._tensor.placement_types import Placement, Replicate, Shard from torch.distributed.device_mesh import _mesh_resources, DeviceMesh, init_device_mesh +from torch.optim.optimizer import _foreach_supported_types + # All public APIs from dtensor package __all__ = [ @@ -23,6 +25,12 @@ ] +# Append DTensor to the list of supported types for foreach implementation of optimizer +# so that we will try to use foreach over the for-loop implementation on CUDA. 
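+# (_foreach_supported_types defaults to [torch.Tensor, torch.nn.Parameter]; the optimizer's
+# _default_to_fused_or_foreach helper only auto-selects the multi-tensor path for types in this
+# list, so registering DTensor here is what lets DTensor parameters use the foreach fast path.)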
+if DTensor not in _foreach_supported_types: + _foreach_supported_types.append(DTensor) + + def _dtensor_init_helper( init_op, size: torch.Size, diff --git a/torch/distributed/_tensor/_collective_utils.py b/torch/distributed/_tensor/_collective_utils.py index cd62a76307f26..93052d6ddd622 100644 --- a/torch/distributed/_tensor/_collective_utils.py +++ b/torch/distributed/_tensor/_collective_utils.py @@ -177,6 +177,21 @@ def unpad_tensor(tensor: torch.Tensor, pad_dim: int, pad_size: int) -> torch.Ten ) +def fill_empty_tensor_to_shards( + shards: List[torch.Tensor], shard_dim: int, num_empty_tensors: int +) -> List[torch.Tensor]: + if num_empty_tensors == 0: + return shards + tensor_size = list(shards[0].size()) + tensor_size = [ + size if idx != shard_dim else 0 for idx, size in enumerate(tensor_size) + ] + tensor = shards[0].new_zeros(tensor_size) + for _ in range(num_empty_tensors): + shards.append(tensor) + return shards + + def spec_to_bytes(spec: "placement_types.DTensorSpec") -> int: assert spec.tensor_meta is not None, "spec should have tensor meta defined!" return spec.tensor_meta.dtype.itemsize * math.prod(spec.shape) diff --git a/torch/distributed/_tensor/placement_types.py b/torch/distributed/_tensor/placement_types.py index 6eb19de18abe0..0fb569c4fac20 100644 --- a/torch/distributed/_tensor/placement_types.py +++ b/torch/distributed/_tensor/placement_types.py @@ -7,6 +7,7 @@ import torch.distributed._functional_collectives as funcol from torch.distributed._tensor._collective_utils import ( + fill_empty_tensor_to_shards, mesh_broadcast, mesh_scatter, pad_tensor, @@ -60,9 +61,21 @@ def _split_tensor( self.dim <= tensor.ndim ), f"Sharding dim {self.dim} greater than tensor ndim {tensor.ndim}" - # chunk tensor over dimension `dim` into n slices with padding if necessary + # chunk tensor over dimension `dim` into n slices tensor_list = list(torch.chunk(tensor, num_chunks, dim=self.dim)) - # compute the chunk size inline with ``torch.chunk`` + num_empty_tensors = num_chunks - len(tensor_list) + + # if no need to have padding or tensor dim size is evenly sharded already + # we can return early. + if not with_padding or tensor.size(self.dim) % num_chunks == 0: + if contiguous: + tensor_list = [t.contiguous() for t in tensor_list] + return ( + fill_empty_tensor_to_shards(tensor_list, self.dim, num_empty_tensors), + [], + ) + + # compute the chunk size inline with ``torch.chunk`` to calculate padding full_chunk_size = (tensor.size(self.dim) + num_chunks - 1) // num_chunks # Compute chunk size for each chunk for ``self.dim`` @@ -74,26 +87,17 @@ def _split_tensor( pad_sizes = [full_chunk_size - chunk_size for chunk_size in chunk_sizes] # Reuse tensor to fill empty chunk with empty tensor - num_empty_tensors = num_chunks - len(tensor_list) - tensor_size = list(tensor_list[0].size()) - tensor_size = [ - size if idx != self.dim else 0 for idx, size in enumerate(tensor_size) - ] - tensor = tensor.new_zeros(tensor_size) - for _ in range(num_empty_tensors): - tensor_list.append(tensor) - - if with_padding or contiguous: - shard_list = [] - for shard, pad_size in zip(tensor_list, pad_sizes): - # Fill the empty tensor with zeroes with padding. 
- if with_padding and pad_size > 0: - shard = pad_tensor(shard, self.dim, pad_size) - shard = shard.contiguous() if contiguous else shard - shard_list.append(shard) - return shard_list, pad_sizes - else: - return tensor_list, pad_sizes + tensor_list = fill_empty_tensor_to_shards( + tensor_list, self.dim, num_empty_tensors + ) + shard_list = [] + for shard, pad_size in zip(tensor_list, pad_sizes): + # Fill the empty tensor with zeroes with padding. + if with_padding and pad_size > 0: + shard = pad_tensor(shard, self.dim, pad_size) + shard = shard.contiguous() if contiguous else shard + shard_list.append(shard) + return shard_list, pad_sizes @staticmethod def _local_shard_size_on_dim( @@ -141,13 +145,13 @@ def _shard_tensor( tensor, num_chunks, with_padding=True, contiguous=True ) - output = torch.empty_like(scatter_list[my_coordinate[mesh_dim]]) + mesh_dim_local_rank = my_coordinate[mesh_dim] + output = torch.empty_like(scatter_list[mesh_dim_local_rank]) mesh_scatter(output, scatter_list, mesh, mesh_dim=mesh_dim) # Only unpad if the local_tensor was padded on the dimension. - pad_size = pad_sizes[my_coordinate[mesh_dim]] - if pad_size > 0: - output = unpad_tensor(output, self.dim, pad_size) + if pad_sizes and pad_sizes[mesh_dim_local_rank] > 0: + output = unpad_tensor(output, self.dim, pad_sizes[mesh_dim_local_rank]) return output def _reduce_shard_tensor( diff --git a/torch/distributed/checkpoint/default_planner.py b/torch/distributed/checkpoint/default_planner.py index 0249f4bdf7b19..c9590c38d3e61 100644 --- a/torch/distributed/checkpoint/default_planner.py +++ b/torch/distributed/checkpoint/default_planner.py @@ -258,7 +258,7 @@ def _should_include_key(self, key: str, metadata: Metadata) -> bool: for unflattened_key in planner_data: if unflattened_keys: unflattened_keys.append( - ".".join([unflattened_keys[-1], unflattened_key]) + ".".join([unflattened_keys[-1], str(unflattened_key)]) ) else: diff --git a/torch/distributed/checkpoint/format_utils.py b/torch/distributed/checkpoint/format_utils.py index aca8c454db09e..41ebaf8be61bf 100644 --- a/torch/distributed/checkpoint/format_utils.py +++ b/torch/distributed/checkpoint/format_utils.py @@ -222,8 +222,8 @@ def torch_save_to_dcp( Given the location of a torch save file, converts it into a DCP checkpoint. Args: - torch_save_path: Filename to store the converted Torch save file. - dcp_checkpoint_dir: Directory containing the DCP checkpoint. + torch_save_path: Filename of the Torch save file. + dcp_checkpoint_dir: Directory to store the DCP checkpoint. .. warning:: To avoid OOM, it's recommended to only run this function on a single rank. diff --git a/torch/distributed/elastic/multiprocessing/api.py b/torch/distributed/elastic/multiprocessing/api.py index 72c3955e7d1e5..acfba81899c04 100644 --- a/torch/distributed/elastic/multiprocessing/api.py +++ b/torch/distributed/elastic/multiprocessing/api.py @@ -670,9 +670,13 @@ def _poll(self) -> Optional[RunProcsResult]: if self._is_done(): # we should ALWAYS have ALL the return values when all the processes are done self._worker_finished_event.set() - # Wait untill all processes are finished. At this point workers finished executing - # user function - self._pc.join() + + # At this point workers finished running the user function + # But the child process might still have not exited. Wait for them. + # pc.join() blocks [forever] until "a" proc exits. Loop until all of them exits. 
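+ # (torch.multiprocessing's ProcessContext.join() returns True only once every
+ # child process has exited, and False while some are still running.)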
+ while not self._pc.join(): + logger.debug("entrypoint fn finished, waiting for all child procs to exit...") + _validate_full_rank( self._return_values, self.nprocs, "return_value queue" ) diff --git a/torch/distributed/fsdp/_flat_param.py b/torch/distributed/fsdp/_flat_param.py index 2f344d19e9305..ed141465155c0 100644 --- a/torch/distributed/fsdp/_flat_param.py +++ b/torch/distributed/fsdp/_flat_param.py @@ -1184,12 +1184,14 @@ def init_flat_param_attributes(self) -> None: flat_param._local_shard = flat_param.data if self._offload_params: # Pin the memory for faster H2D transfer - flat_param._local_shard = flat_param._local_shard.pin_memory() + flat_param._local_shard = flat_param._local_shard.pin_memory( + device=self.device + ) # Pre-allocate the sharded gradient on CPU to enable non-blocking # D2H transfer during the backward pass flat_param._cpu_grad = torch.zeros_like( flat_param._local_shard, device=cpu_device - ).pin_memory() + ).pin_memory(device=self.device) if self._uses_param_mixed_precision: # For parameter mixed precision, we maintain a low precision # sharded tensor on the compute device to be all-gathered (for diff --git a/torch/distributed/fsdp/_runtime_utils.py b/torch/distributed/fsdp/_runtime_utils.py index 89d9638217f1d..f1e579adae009 100644 --- a/torch/distributed/fsdp/_runtime_utils.py +++ b/torch/distributed/fsdp/_runtime_utils.py @@ -387,7 +387,7 @@ def _pre_forward( if handle and handle._offload_params and handle.flat_param._cpu_grad is None: handle.flat_param._cpu_grad = torch.zeros_like( handle.flat_param._local_shard, device=torch.device("cpu") - ).pin_memory() + ).pin_memory(device=state.compute_device) should_cast_forward_inputs = ( state._handle and not state._handle._force_full_precision diff --git a/torch/distributed/optim/functional_adagrad.py b/torch/distributed/optim/functional_adagrad.py index dfd50db175913..96e075c8216ca 100644 --- a/torch/distributed/optim/functional_adagrad.py +++ b/torch/distributed/optim/functional_adagrad.py @@ -30,7 +30,6 @@ def __init__( eps: float = 1e-10, coalesce_grad: bool = True, foreach: bool = False, - fused: bool = False, maximize: bool = False, _allow_empty_param_list: bool = False, ): @@ -45,7 +44,6 @@ def __init__( } self.coalesce_grad = coalesce_grad self.foreach = foreach - self.fused = fused self.maximize = maximize self.state = torch.jit.annotate(Dict[torch.Tensor, Dict[str, torch.Tensor]], {}) @@ -103,7 +101,4 @@ def step(self, gradients: List[Optional[Tensor]]): foreach=self.foreach, maximize=self.maximize, has_complex=has_complex, - fused=self.fused, - grad_scale=None, - found_inf=None, ) diff --git a/torch/distributed/pipelining/_PipelineStage.py b/torch/distributed/pipelining/_PipelineStage.py index b30d99366caf5..db0340677b172 100644 --- a/torch/distributed/pipelining/_PipelineStage.py +++ b/torch/distributed/pipelining/_PipelineStage.py @@ -7,6 +7,7 @@ import torch import torch.distributed as dist import torch.fx as fx +import torch.nn as nn from torch._subclasses.fake_tensor import FakeTensor from torch.distributed._composable.fsdp.fully_shard import FSDPModule from torch.fx.node import map_aggregate @@ -55,11 +56,11 @@ def __repr__(self): def _make_tensor_from_meta( - example: FakeTensor, + example: Union[torch.Tensor, FakeTensor], device: torch.device, ) -> torch.Tensor: """ - Create a real tensor from a fake tensor. + Create a real tensor from a tensor. 
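+ Only the example's metadata (shape, dtype, etc.) is read; the returned tensor's
+ memory is uninitialized (it comes from ``torch.empty``).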
""" return torch.empty( example.size(), @@ -142,7 +143,7 @@ def __init__( self.log_prefix = f"[Stage {self.stage_index}]" # Forward infra - self.args_recv_info: Dict[int, Tuple[InputInfo]] = {} + self.args_recv_info: Dict[int, Tuple[InputInfo, ...]] = {} self.set_requires_grad: Dict[int, bool] = {} self.act_send_info: Dict[int, List] = {} @@ -211,7 +212,7 @@ def _create_grad_recv_info( def _get_recv_ops( self, - recv_infos: Tuple[InputInfo], + recv_infos: Tuple[InputInfo, ...], ) -> List[dist.P2POp]: """ Helper function shared by `get_fwd_recv_ops` and `get_bwd_recv_ops`. @@ -239,7 +240,7 @@ def get_fwd_recv_ops(self) -> List[dist.P2POp]: Returns a list of ops that are needed to receive the input arguments for this stage. """ - recv_infos: Tuple[InputInfo] = self.args_recv_info[self.fwd_chunk_id] + recv_infos: Tuple[InputInfo, ...] = self.args_recv_info[self.fwd_chunk_id] # In case there is backward pass, set requires_grad for receive buffers # before first forward @@ -360,7 +361,7 @@ def clear_runtime_states(self) -> None: def _map_tensor_from_recv_info( self, - recv_infos: Tuple[InputInfo], + recv_infos: Tuple[InputInfo, ...], ): """ Map tensors from recv infos to a list. @@ -819,3 +820,399 @@ def __init__( # Get my pipe info pipe_info = pipe.info() super().__init__(stage_module, stage_index, pipe_info, device, group) + + +# Manual PipelineStage functions and definition + +METADATA_TENSOR_LEN = 100 +PLACEHOLDER_VAL = -1 + + +def create_empty_tensors( + tensor: Union[torch.Tensor, List[torch.Tensor]], device: torch.device +) -> List[torch.Tensor]: + """ + Creates a list of empty tensors with the same properties (like shape and dtype) as the input tensor(s), + and places them on the specified device. + Args: + tensor (Union[torch.Tensor, List[torch.tensor]]): The input tensor(s). + device (torch.device): The device where the new tensors will be placed. + Returns: + List[torch.Tensor]: A list of empty tensors with the same properties as the input tensor(s). + """ + if isinstance(tensor, torch.Tensor): + return [torch.empty_like(tensor, device=device)] + elif isinstance(tensor, (list, tuple)): + return [torch.empty_like(t, device=device) for t in tensor] + raise TypeError(f"Unsupported type {type(tensor)} cannot create empty tensors") + + +def create_metadata_tensor( + tensors: Optional[List[torch.Tensor]] = None, + device: Optional[torch.device] = torch.device("cpu"), +) -> torch.Tensor: + """ + Create a metadata tensor that can be sent over the wire. + This tensor contains the number of dimensions and the shape of each tensor being sent. + + The data is of format [num_dims, dim1, dim2, ...]. + If the tensor is None, a tensor of only placeholder values will be returned. + + Inputs: + tensors: A list of tensors, the tensors will converted into its shape dimensions and + these dimensions will be concatenated. + device: The device where the metadata tensor will be created. + If the tensor is None, then this tensor will contain PLACEHOLDER_VALs. + + """ + metadata_tensor = torch.full( + (METADATA_TENSOR_LEN,), + PLACEHOLDER_VAL, + dtype=torch.int32, + device=device, + ) + if tensors: + # Create a list of tensors containing the number of dimensions and the shape of each tensor + data = [ + # data is of format [num_dims, dim1, dim2, ...] 
+ torch.tensor( + [len(tensor.shape)] + list(tensor.shape), + dtype=torch.int32, + device=device, + ) + for tensor in tensors + ] + # Concatenate the data into a single tensor + data_tensor = torch.cat(data) + dt_shape = data_tensor.shape[0] + if dt_shape > METADATA_TENSOR_LEN: + raise ValueError( + f"Metadata tensor size ({dt_shape}) exceeds maximum allowed length ({METADATA_TENSOR_LEN})." + ) + metadata_tensor[:dt_shape] = data_tensor + return metadata_tensor + + +def extract_metadata_from_tensor(tensor: torch.Tensor) -> List[torch.Size]: + """ + Extract the number of dimensions and the shape of each tensor from a metadata tensor. + """ + metadata: List[torch.Size] = [] + i = 0 + while i < len(tensor) and tensor[i] != PLACEHOLDER_VAL: + num_dims = int(tensor[i].item()) + shape = torch.Size(tensor[i + 1 : i + 1 + num_dims].tolist()) + metadata.append(shape) + i += num_dims + 1 + return metadata + + +def get_stage_shapes( + stage_modules: List[nn.Module], + stage_ids: List[int], + num_stages: int, + rank: int, + world_size: int, + device: torch.device, + microbatch: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None, +): + """ + Performs a dry run through all the pipeline stages (a rank can have multiple pipeline stages in the case of + virtual pipelining) and returns the shape of the inputs and outputs of the module. + Only the first stage must pass in a microbatch. + + Each rank must call get_stage_shapes or the program will hang. + + Args: + stage_modules: The chunks assigned to this rank. Rhe length should be 1 for any + non-interleaved schedules and >1 for any interleaved schedules. + stage_ids: The id of the stages assigned to this rank. + num_stages: Total number of stages. + rank: Rank of the current process. + world_size: Number of processes participating in the pipeline. + device: Device where the tensors are allocated. 
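+ microbatch: Example input for the first stage; only ranks that own stage 0 need to provide it.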
+ + Returns a dictionary containing the following keys: + "inputs": Shape of the inputs to the module + "outputs": Shape of the outputs of the module + """ + + stage_id_to_shapes: Dict[int, Dict[str, list[torch.Size]]] = {} + for stage_id, model in zip(stage_ids, stage_modules): + input_shape_metadata_tensor = create_metadata_tensor(device=device) + # TODO: Assumes prev_stage == rank - 1 and next_stage == rank + 1 + prev_rank = (rank - 1) % world_size + next_rank = (rank + 1) % world_size + shapes = {} + + # first stage doesn't receive anything and uses a microbatch + if stage_id == 0: + if microbatch is None: + raise RuntimeError("Microbatch is required for first stage") + example_fwd_inputs = microbatch + if isinstance(example_fwd_inputs, torch.Tensor): + example_fwd_inputs = [example_fwd_inputs] + else: + # other stages must receive shape information + # TODO: send/recv should take a group, rather than use the default group + dist.recv(input_shape_metadata_tensor, prev_rank) + metadata = extract_metadata_from_tensor(input_shape_metadata_tensor) + example_fwd_inputs = [ + torch.empty(shape_list, device=device) for shape_list in metadata + ] + shapes["inputs"] = [fwd_input.shape for fwd_input in example_fwd_inputs] + + # perform forward + # TODO: if forward fails raise a more descriptive error explaining which stage failed + fwd_outputs = model(*example_fwd_inputs) + fwd_outputs = create_empty_tensors(fwd_outputs, device) + shapes["outputs"] = [fwd_output.shape for fwd_output in fwd_outputs] + + # send shape dims + if stage_id != num_stages - 1: + output_shape_metadata_tensor = create_metadata_tensor( + fwd_outputs, device=device + ) + dist.send(output_shape_metadata_tensor, next_rank) + stage_id_to_shapes[stage_id] = shapes + logger.info(stage_id_to_shapes) + return stage_id_to_shapes + + +class ManualPipelineStage(PipelineStageBase): + """ + A class representing a pipeline stage in a pipeline parallelism setup. + This class is created manually by providing a example input (and optionally output) + as opposed to the PipelineStage class that is outputed from pipeline(). + This class extends the `PipelineStageBase` class and can similarly be used + in `PipelineScheule`. + Args: + submodule (nn.Module): The PyTorch module wrapped by this stage. + stage_index (int): The ID of this stage. + num_stages (int): The total number of stages. + device (torch.device): The device where this stage is located. + num_microbatches (int): The number of microbatches to use. + input_args (Union[torch.Tensor, List[torch.tensor]], optional): The input arguments for the submodule. + output_args (Union[torch.Tensor, List[torch.tensor]], optional): The output arguments for the submodule. + group (dist.ProcessGroup, optional): The process group for distributed training. If None, default group. 
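+ Example (illustrative sketch; the variable names are placeholders, not part of this PR)::
+ >>> stage = ManualPipelineStage(
+ ...     submodule, stage_index=rank, num_stages=world_size,
+ ...     device=device, num_microbatches=4, input_args=example_microbatch,
+ ... )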
+ """ + + def __init__( + self, + submodule: nn.Module, + stage_index: int, + num_stages: int, + device: torch.device, + num_microbatches: int, + input_args: Union[torch.Tensor, List[torch.Tensor]], + output_args: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None, + group: Optional[dist.ProcessGroup] = None, + ): + super().__init__( + submodule, stage_index, num_stages, device, num_microbatches, group + ) + self.submod.to(self.device) + # When we materialize the model partition on cuda, we call reset_parameters() if it is available + # logger.info(f"input args {input_args=}") + self.inputs: List[torch.Tensor] = [] + self.outputs: List[torch.Tensor] = [] + + self.inputs = create_empty_tensors(input_args, device) + + if output_args is None: + logger.info("output_args not provided, performing forward using input_args") + self.outputs = self.submod(*self.inputs) + # create buffers for the output so that the data is in the correct + # shape in order to use in p2p op (send) + self.outputs = create_empty_tensors(self.outputs, device) + else: + self.outputs = create_empty_tensors(output_args, device) + + # these are the buffers used in backwards send/recv, they are allocated later + self.outputs_grad: List[torch.Tensor] = [] + + def stage_global_rank(peer_rank): + return ( + peer_rank + if self.group is None + else dist.get_global_rank(self.group, peer_rank) + ) + + self.prev_stage = stage_global_rank((self.group_rank - 1) % self.group_size) + self.next_stage = stage_global_rank((self.group_rank + 1) % self.group_size) + + # Receive info during forward + # TODO: create args_recv_info lazily? (same needed for PipelineStage) + for chunk_id in range(self.chunks): + self.set_requires_grad[chunk_id] = False + if not self.is_first: + # We assume that we always receive from stage - 1 + recv_infos = tuple( + [ + RecvInfo( + f"recv_for_{self.stage_index}_from_{self.stage_index - 1}", + self.stage_index - 1, + _make_tensor_from_meta(inp, self.device), + ) + for inp in self.inputs + ] + ) + + self.args_recv_info[chunk_id] = recv_infos + else: + self.args_recv_info[chunk_id] = tuple( + [RootArgPlaceholder() for _ in self.inputs] + ) + + # Send info during forward for each activation + # only need the rank that is being sent to + self.act_send_info: Dict[int, List] = {} + for idx in range(len(self.outputs)): + # We assume we always send to stage + 1 + if not self.is_last: + self.act_send_info[idx] = [self.stage_index + 1] + else: + self.act_send_info[idx] = [] + + logger.debug( + f"finished pipeline stage init, {self.stage_index=}, {self.is_first=}, " # noqa: G004 + f"{self.is_last=}, {self.num_stages=}, " + f"inputs: {[inp.shape for inp in self.inputs]}, " + f"output: {[output.shape for output in self.outputs]}" + ) + + def _create_grad_recv_info( + self, + act_send_info: Dict, + ) -> Tuple[RecvInfo, ...]: + grad_recv_info: Tuple[RecvInfo, ...] = () + if not self.is_last: + # Receiving gradients from multiple sources is not supported + # hence we only take the first destination + grad_recv_info = tuple( + [ + RecvInfo( + f"recv_grad_for_{self.stage_index}_from_{dst_list[0]}", + dst_list[0], + _make_tensor_from_meta(self.outputs[idx], self.device), + ) + for idx, dst_list in act_send_info.items() + ] + ) + return grad_recv_info + + def init_p2p_neighbors(self): + """ + Set up p2p communitors between previous and next stages + by sending a dummy tensor. + + If this is used, must be called for all pipeline stages. 
+ """ + ops = [] + recv_tensor = torch.zeros(1, device="cuda") + send_tensor = torch.ones(1, device="cuda") + # forward + if not self.is_first: + ops.append(dist.P2POp(dist.irecv, recv_tensor, self.prev_stage, self.group)) + if not self.is_last: + ops.append(dist.P2POp(dist.isend, send_tensor, self.next_stage, self.group)) + + # backward + if not self.is_first: + ops.append(dist.P2POp(dist.isend, send_tensor, self.prev_stage, self.group)) + if not self.is_last: + ops.append(dist.P2POp(dist.irecv, recv_tensor, self.next_stage, self.group)) + + return True + + +def validate_stage_shapes(pipeline_stages: List[ManualPipelineStage]): + """ + Check that the buffer shapes match between stages was expected by performing an all_gather between + all stages. + """ + if len(pipeline_stages) == 0: + raise ValueError("No pipeline stages provided.") + + virtual_pipeline_size = len(pipeline_stages) + all_inputs = [] + all_outputs = [] + world_size = pipeline_stages[0].group_size + num_stages = pipeline_stages[0].num_stages + + # perform all gathers between all stages + for virtual_id, stage in enumerate(pipeline_stages): + world_size = stage.group_size + stage_id: int = stage.stage_index + rank = stage.group_rank + # check that world_size and num_stages are consistent across all stages + if stage.group_size != world_size: + raise ValueError( + f"Stage id {stage_id} has world size ({stage.group_size}) \ + which does not match world size ({world_size}) of other stages." + ) + if stage.num_stages != num_stages: + raise ValueError( + f"Stage id {stage_id} has num stages ({stage.num_stages}) \ + which does not match num stages ({num_stages}) of other stages." + ) + + pg_rank = dist.get_rank(stage.group) + if rank != pg_rank: + raise ValueError( + f"Rank {rank} is not equal to process group rank {pg_rank}" + ) + + if (num_stages := stage.num_stages) % world_size != 0: + raise ValueError( + f"Number of stages ({num_stages}) must be a multiple of the world_size ({world_size})" + ) + + # all gather each ranks inputs + tensor_list = [ + create_metadata_tensor(device=stage.device) for _ in range(stage.group_size) + ] + expected_inputs = stage.inputs + stage_input = create_metadata_tensor(expected_inputs, device=stage.device) + dist.all_gather(tensor_list, stage_input) + stage_input_shapes = [ + extract_metadata_from_tensor(tensor) for tensor in tensor_list + ] + + # all gather each ranks outputs + tensor_list = [ + create_metadata_tensor(device=stage.device) for _ in range(stage.group_size) + ] + expected_outputs = stage.outputs + stage_output = create_metadata_tensor(expected_outputs, device=stage.device) + dist.all_gather(tensor_list, stage_output) + stage_output_shapes = [ + extract_metadata_from_tensor(tensor) for tensor in tensor_list + ] + + logger.debug( + f"Rank: {pg_rank}" # noqa: G004 + f"Stage id: {stage_id}" + f"Stage num stages: {stage.num_stages}" + f"Stage rank: {rank}" + f"Stage world size: {world_size}" + f"Stage {virtual_id * world_size}-{(virtual_id + 1) * world_size - 1} input shapes: {stage_input_shapes}" # noqa: G003 + f"Stage {virtual_id * world_size}-{(virtual_id + 1) * world_size - 1} output shapes: {stage_output_shapes}" # noqa: G003 + ) + + all_inputs.extend(stage_input_shapes) + all_outputs.extend(stage_output_shapes) + + # log only rank 0's view, they will all be equivalent + if pg_rank == 0: + logger.info( + f"all stage inputs: {all_inputs}" # noqa: G004 + f"all stage outputs: {all_outputs}" + ) + + # Check if the output for stage 0 matches the input at stage 1, and so forth + for i in 
range(virtual_pipeline_size * world_size - 1): + if (out := all_outputs[i]) != (inp := all_inputs[i + 1]): + raise ValueError( + f"Stage_id {i} output shape {out} at does not match stage_id {i + 1} input shape {inp}." + ) diff --git a/torch/export/_trace.py b/torch/export/_trace.py index 1ac7fd6b5e9e3..728bdf25a981f 100644 --- a/torch/export/_trace.py +++ b/torch/export/_trace.py @@ -246,6 +246,12 @@ def _get_param_buffer_mapping( for name, buffer in original_module.named_buffers(remove_duplicate=False): buffer_lookup.setdefault(id(buffer), []).append(name) + # reverse lists so FQN assignment is FIFO wrt model structure + for name, fqns in param_lookup.items(): + param_lookup[name] = fqns[::-1] + for name, fqns in buffer_lookup.items(): + buffer_lookup[name] = fqns[::-1] + param_buffer_table: Dict[str, str] = {} for dynamo_name, dynamo_param in traced_module.named_parameters( remove_duplicate=False diff --git a/torch/export/unflatten.py b/torch/export/unflatten.py index 14f91ee64679d..31701d9fb685f 100644 --- a/torch/export/unflatten.py +++ b/torch/export/unflatten.py @@ -172,32 +172,93 @@ def __init__( self.range_constraints = export_module.range_constraints self.equality_constraints: List = [] + # aliasing/unused param or buffer issues: + # in strict-mode export, dynamo export will deduplicate aliased tensors, + # and ignore unused tensors. For aliasing, this causes issues when some aliases + # are unused, and we're unable to match the placeholder node to the correct FQN. + # This leads to the graph signature potentially having the wrong target FQN, + # and downstream issues where parameters are assigned to the wrong target attribute, + # mismatching the relevant placeholder node in the unflattened module. + # To resolve this we restore (_assign_attr) all aliased/unused tensors in + # the state_dict as module attributes, but only keep the used tensors in the + # graph's forward pass (_sink_params). 
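+ # e.g. with weight tying (`self.tied = self.weight`), dynamo keeps a single placeholder
+ # for the shared tensor; the loops below first attach the FQNs named in the graph
+ # signature, then restore the remaining aliased/unused FQNs as plain module attributes
+ # so the unflattened module still exposes both names.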
state_dict = export_module.state_dict - for name in self.graph_signature.parameters: - cloned = torch.nn.Parameter(state_dict[name].clone()) + assigned_params: Set[str] = set() # tracking unused params + id_to_param: Dict[int, torch.nn.Parameter] = {} # handling weight-sharing + for name in self.graph_signature.parameters: # this loop adds used params + param = state_dict[name] + if id(param) not in id_to_param: + id_to_param[id(param)] = torch.nn.Parameter(param.clone()) + _assign_attr( - cloned, + id_to_param[id(param)], self, name, attr_kind=_AttrKind.PARAMETER, ) + assigned_params.add(name) non_persistent_buffers = set(self.graph_signature.non_persistent_buffers) - for name in self.graph_signature.buffers: + assigned_buffers: Set[str] = set() # tracking unused buffers + id_to_buffer: Dict[ + int, Tuple[torch.nn.Parameter, bool] + ] = {} # handle weight-sharing + for name in self.graph_signature.buffers: # this loop adds used buffers if name in non_persistent_buffers: persistent = False - cloned = export_module.constants[name].clone() + buffer = export_module.constants[name] else: persistent = True - cloned = state_dict[name].clone() + buffer = state_dict[name] + + if id(buffer) not in id_to_buffer: + id_to_buffer[id(buffer)] = (buffer.clone(), persistent) _assign_attr( - cloned, + id_to_buffer[id(buffer)][0], self, name, attr_kind=_AttrKind.BUFFER, persistent=persistent, ) + assigned_buffers.add(name) + + # restore aliased/unused params and buffers + # these appear in state dict but not graph signature + for name, tensor in state_dict.items(): + if name in assigned_params or name in assigned_buffers: # already assigned + continue + + is_buffer = False + if id(tensor) in id_to_buffer or not isinstance( + tensor, torch.nn.Parameter + ): # aliased buffer + is_buffer = True + + if is_buffer: + if ( + id(tensor) not in id_to_buffer + ): # this is completely unused (not weight-sharing) + id_to_buffer[id(tensor)] = ( + tensor, + True, + ) # assign to respect original model + _assign_attr( + id_to_buffer[id(tensor)][0], + self, + name, + attr_kind=_AttrKind.BUFFER, + persistent=True, + ) + else: + if id(tensor) not in id_to_param: # this is unused + id_to_param[id(tensor)] = tensor + _assign_attr( + id_to_param[id(tensor)], + self, + name, + attr_kind=_AttrKind.PARAMETER, + ) # use id map so we don't double-clone aliased constants id_to_const: Dict[int, Union[torch.Tensor, torch._C.ScriptObject]] = {} @@ -223,6 +284,7 @@ def add_to_consts_map(obj_id, node_name, target_name): name_list = consts_map[obj_id] name_list.append((node_name, target_name)) + added_params_buffers: Set[str] = set() # track aliased/unused params, buffers for s in self.graph_signature.input_specs: if s.kind == InputKind.PARAMETER or ( s.kind == InputKind.BUFFER and s.persistent @@ -233,6 +295,7 @@ def add_to_consts_map(obj_id, node_name, target_name): id(export_module.state_dict[s.target]), s.arg.name, s.target ) consts_targets.add(s.target) + added_params_buffers.add(s.target) elif ( (s.kind == InputKind.BUFFER and not s.persistent) or s.kind == InputKind.CONSTANT_TENSOR @@ -253,6 +316,18 @@ def add_to_consts_map(obj_id, node_name, target_name): ), "Constants should be either aliased or appear in graph signature" ph_name, _ = consts_map[id(const)][0] add_to_consts_map(id(const), ph_name, const_name) + added_params_buffers.add(s.target) + + # add aliased/unused params and buffers that don't appear in graph signature + for fqn, tensor in export_module.state_dict.items(): + if fqn not in added_params_buffers: + if id(tensor) 
not in consts_map: + # completely unused (no weight-sharing), ignore. + # this weight doesn't appear in graph module, + # so won't cause FQN assignment issues + continue + ph_name, _ = consts_map[id(tensor)][0] + add_to_consts_map(id(tensor), ph_name, fqn) # node name -> list of possible targets inputs_to_state: Dict[str, List[str]] = {} diff --git a/torch/fx/experimental/proxy_tensor.py b/torch/fx/experimental/proxy_tensor.py index 07905b0348473..9976c4e9beca2 100644 --- a/torch/fx/experimental/proxy_tensor.py +++ b/torch/fx/experimental/proxy_tensor.py @@ -59,6 +59,7 @@ CONSTANT_NUMEL_LIMIT = 1 +null_ctx_type = type(nullcontext) # We currently convert all SymInt to proxies before we use them. # This could plausibly be handled at the Dynamo level. pytree.register_pytree_node( @@ -214,6 +215,8 @@ def try_set_proxy_slot(outer_s, proxy_callable, *args): set_proxy_slot(tensor, tracer, _ProxyTensor(proxy, constant)) def track_tensor_tree(inner_res, proxy_res, *, constant, tracer): + _set_unbacked_bindings(inner_res, proxy_res) + def wrap_with_proxy(e, proxy, constant): if isinstance(e, torch.Tensor): track_tensor(e, proxy, tracer=tracer, constant=constant) @@ -521,21 +524,6 @@ def can_handle_tensor(x): else: constant = None - from .symbolic_shapes import compute_unbacked_bindings - # Can't use detect_fake_mode here, - # - # python test/distributed/_tensor/test_dtensor_compile.py -k - # test_tp_compile_fullgraph_is_seq_parallel_False - # - # will fail. Very strange, it probably isn't right for them to be using - # two fake modes there... - fake_mode = torch._C._get_dispatch_mode( - torch._C._TorchDispatchModeKey.FAKE - ) - if fake_mode and fake_mode.shape_env: - if symbol_to_path := compute_unbacked_bindings(fake_mode.shape_env, out): - proxy_out.node.meta["unbacked_bindings"] = symbol_to_path - track_tensor_tree(out, proxy_out, constant=constant, tracer=tracer) return out @@ -1138,145 +1126,283 @@ def create_node(self, *args, **kwargs): return node +class _MakefxTracer: + + def __init__( + self, + decomposition_table: Optional[Dict[Callable, Callable]], + tracing_mode: str, + _allow_non_fake_inputs: bool, + pre_dispatch: bool, + record_module_stack: bool, + _allow_fake_constant: bool, + _error_on_data_dependent_ops: bool + ): + # Configurations that are used to initialize the context managers and their states. + # Should not modify them during tracing. + self.decomposition_table: Dict[Callable, Callable] = decomposition_table or {} + self.decomposition_table.setdefault(torch.ops.aten.sym_numel.default, torch._decomp.decompositions.sym_numel) + self.tracing_mode: str = tracing_mode + self._allow_non_fake_inputs: bool = _allow_non_fake_inputs + self.pre_dispatch: bool = pre_dispatch + self.record_module_stack: bool = record_module_stack + self._allow_fake_constant: bool = _allow_fake_constant + self._error_on_data_dependent_ops: bool = _error_on_data_dependent_ops + + # All context managers and their states should be initialized before tracing based on the inputs + # and configurations. After tracing, their states should be cleaned except for shape_env. + # Remember to specify how to intialize it from user inputs and from parent tracer whenever + # adding new modes in _MakefxTracer. 
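+ # A plain nullcontext() doubles as the "not enabled" placeholder for each of these,
+ # so _trace_inner can enter every mode unconditionally in a single `with` block.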
+ self.fake_tensor_mode: Union[null_ctx_type, FakeTensorMode] = nullcontext() + self.proxy_mode: Union[null_ctx_type, ProxyTorchDispatchMode] = nullcontext() + self.proxy_function_mode: Union[null_ctx_type, PreDispatchTorchFunctionMode] = nullcontext() + self.fx_tracer: Union[null_ctx_type, Tracer] = nullcontext() + self.python_dispatcher_mode: Union[null_ctx_type, Any] = nullcontext() + self.torch_fn_metadata_mode: Union[null_ctx_type, TorchFunctionMetadataMode] = nullcontext() + + def _checkpoint_modes(self) -> List[Any]: + return [ + self.fake_tensor_mode, + self.proxy_mode, + self.proxy_function_mode, + self.fx_tracer, + self.python_dispatcher_mode, + self.torch_fn_metadata_mode + ] + + def _restore_modes( + self, + prev_fake_tensor_mode: Union[null_ctx_type, FakeTensorMode], + prev_proxy_mode: Union[null_ctx_type, ProxyTorchDispatchMode], + prev_proxy_function_mode: Union[null_ctx_type, PreDispatchTorchFunctionMode], + prev_fx_tracer: Union[null_ctx_type, Tracer], + prev_python_dispatcher_mode: Union[null_ctx_type, Any], + prev_torch_fn_metadata_mode : Union[null_ctx_type, TorchFunctionMetadataMode], + ) -> None: + self.fake_tensor_mode = prev_fake_tensor_mode + self.proxy_mode = prev_proxy_mode + self.proxy_function_mode = prev_proxy_function_mode + self.fx_tracer = prev_fx_tracer + self.python_dispatcher_mode = prev_python_dispatcher_mode + self.torch_fn_metadata_mode = prev_torch_fn_metadata_mode -def make_fx(f, - decomposition_table=None, - tracing_mode="real", - _allow_non_fake_inputs=False, - *, - pre_dispatch=False, - record_module_stack=False, - _allow_fake_constant=False, - _error_on_data_dependent_ops=True): - assert tracing_mode in ["real", "fake", "symbolic"] + @contextmanager + def _init_modes_from_inputs(self, f, args): + prev_modes = self._checkpoint_modes() + try: + # Avoid importing sympy at a module level + from .symbolic_shapes import ShapeEnv + if hasattr(f, "_orig_mod") and self.record_module_stack: + scope_root = f._orig_mod + self.fx_tracer = _ModuleStackTracer(scope_root) + else: + self.fx_tracer = PythonKeyTracer() + + if self.tracing_mode == "fake": + import torch._dynamo + fake_tensor_mode = torch._dynamo.utils.detect_fake_mode(args) + if fake_tensor_mode is None: + import torch._functorch.config as _config + with _config.patch(fake_tensor_allow_unsafe_data_ptr_access=False): + fake_tensor_mode = FakeTensorMode( + allow_fallback_kernels=True, + allow_non_fake_inputs=self._allow_non_fake_inputs, + shape_env=ShapeEnv(), + static_shapes=True, + ) + self.fake_tensor_mode = fake_tensor_mode + elif self.tracing_mode == "symbolic": + import torch._dynamo + fake_tensor_mode = torch._dynamo.utils.detect_fake_mode(args) + if fake_tensor_mode is None: + shape_env = ShapeEnv() + import torch._functorch.config as _config + with _config.patch(fake_tensor_allow_unsafe_data_ptr_access=False): + fake_tensor_mode = FakeTensorMode( + allow_fallback_kernels=False, + allow_non_fake_inputs=self._allow_non_fake_inputs, + shape_env=shape_env) + assert fake_tensor_mode.shape_env is not None, "shape_env should be set if tracing with 'symbolic'" + self.fake_tensor_mode = fake_tensor_mode + else: + if not self.tracing_mode == "real": + raise AssertionError(f"Unexpected tracing type: {self.tracing_mode}") - if decomposition_table is None: - decomposition_table = {} + self._construct_modes_with_fx_tracer(self.fx_tracer) + yield + finally: + self._restore_modes(*prev_modes) + + def _construct_modes_with_fx_tracer(self, fx_tracer): + self.proxy_mode = ProxyTorchDispatchMode( + fx_tracer, + 
self.tracing_mode, + pre_dispatch=self.pre_dispatch, + _allow_fake_constant=self._allow_fake_constant, + _error_on_data_dependent_ops=self._error_on_data_dependent_ops + ) - if torch.ops.aten.sym_numel.default not in decomposition_table: - decomposition_table = { - **decomposition_table, - torch.ops.aten.sym_numel.default: torch._decomp.decompositions.sym_numel - } + if self.pre_dispatch: + self.proxy_function_mode = PreDispatchTorchFunctionMode(fx_tracer) - @functools.wraps(f) - def wrapped(*args): - # Avoid importing sympy at a module level - from .symbolic_shapes import ShapeEnv + # pre-autograd tracing uses per-dispatch-key modes, + # which requires the python dispatcher + if self.tracing_mode == "symbolic" or self.pre_dispatch: + self.python_dispatcher_mode = enable_python_dispatcher() - phs = pytree.tree_map(lambda _: fx.PH, args) # type: ignore[attr-defined] + self.torch_fn_metadata_mode = TorchFunctionMetadataMode(fx_tracer) - if hasattr(f, "_orig_mod") and record_module_stack: - scope_root = f._orig_mod - fx_tracer = _ModuleStackTracer(scope_root) - else: - fx_tracer = PythonKeyTracer() - fake_tensor_mode: Any = nullcontext() - if tracing_mode == "real": - fake_tensor_mode = nullcontext() - elif tracing_mode == "fake": - import torch._dynamo - fake_tensor_mode = torch._dynamo.utils.detect_fake_mode(args) - if fake_tensor_mode is None: - import torch._functorch.config as _config - with _config.patch(fake_tensor_allow_unsafe_data_ptr_access=False): - fake_tensor_mode = FakeTensorMode( - allow_fallback_kernels=True, - allow_non_fake_inputs=_allow_non_fake_inputs, - shape_env=ShapeEnv(), - static_shapes=True, - ) - elif tracing_mode == "symbolic": - import torch._dynamo - fake_tensor_mode = torch._dynamo.utils.detect_fake_mode(args) - if fake_tensor_mode is None: - shape_env = ShapeEnv() - import torch._functorch.config as _config - with _config.patch(fake_tensor_allow_unsafe_data_ptr_access=False): - fake_tensor_mode = FakeTensorMode( - allow_fallback_kernels=False, - allow_non_fake_inputs=_allow_non_fake_inputs, - shape_env=shape_env) - else: - shape_env = fake_tensor_mode.shape_env - assert shape_env is not None, "shape_env should be set if tracing with 'symbolic'" + @contextmanager + def _init_modes_from_parent(self, parent_tracer): + # By default, subtracer creates new modes based on parent tracer's config. + # However, there are cases where we want to share the same modes with parent tracer + # For example, fake_tensor_mode, we want the example value's fake_mode of parent graph and subgraphs to be the same. 
+ prev_modes = self._checkpoint_modes() + try: + self.fake_tensor_mode = parent_tracer.fake_tensor_mode - else: - raise AssertionError(f"Unexpected tracing type: {tracing_mode}") + def _create_sub_fx_tracer(parent_tracer): + if type(parent_tracer) == PythonKeyTracer: + sub_tracer = PythonKeyTracer() + elif type(parent_tracer) == _ModuleStackTracer: + sub_tracer = _ModuleStackTracer(parent_tracer.scope_root) + else: + raise RuntimeError(f"Unexpected tracer type: {type(parent_tracer)}.") - python_dispatcher_mode: Any = nullcontext() - # pre-autograd tracing uses per-dispatch-key modes, - # which requires the python dispatcher - if tracing_mode == "symbolic" or pre_dispatch: - python_dispatcher_mode = enable_python_dispatcher() - - proxy_function_mode: Any = nullcontext() - if pre_dispatch: - proxy_function_mode = PreDispatchTorchFunctionMode(fx_tracer) - - proxy_mode = ProxyTorchDispatchMode(fx_tracer, - tracing_mode, - pre_dispatch=pre_dispatch, - _allow_fake_constant=_allow_fake_constant, - _error_on_data_dependent_ops=_error_on_data_dependent_ops) - - arg_count = 0 - - def wrap_fake(x): - nonlocal arg_count - # TODO: it would be nice to line these up with the names - # FX will choose for the placeholders, but we don't - # actually know what the names will be at this point yet - # NB: the Source here is actually meaningless - from torch._dynamo.source import ConstantSource - source = ConstantSource(f"input{arg_count}") - if isinstance(x, torch.Tensor): - arg_count += 1 - return fake_tensor_mode.from_tensor(x, source=source) # type: ignore[attr-defined] - # NB: don't match on bools - elif type(x) is int and tracing_mode == "symbolic": - return shape_env.create_symintnode(shape_env.create_symbol(x, source, positive=None), hint=x, source=source) - elif isinstance(x, torch.ScriptObject): - return torch._library.fake_class_registry.to_fake_obj(fake_tensor_mode, x) - - assert not isinstance(x, FakeScriptObject), f"ScriptObject {x} has been fakified. Cannot wrap_fake it again." - return x - - sym_mode = proxy_mode.sym_mode - - wrap_fn_map = { - "real": lambda x: x, - "fake": wrap_fake, - "symbolic": wrap_fake, - } - args = pytree.tree_map(wrap_fn_map[tracing_mode], args) - - if not hasattr(inspect.unwrap(f), '__code__') or inspect.unwrap(f).__code__.co_flags & inspect.CO_VARARGS: - # FX doesn't support varargs, so we gotta fake up a wrapper - # TODO: Would be nice to fix this at the source... 
- func = fake_signature(f, len(phs)) - else: - func = f + return sub_tracer + + self.fx_tracer = _create_sub_fx_tracer(parent_tracer.fx_tracer) + self._construct_modes_with_fx_tracer(self.fx_tracer) + yield + finally: + self._restore_modes(*prev_modes) - torch_fn_metadata_mode = TorchFunctionMetadataMode(fx_tracer) + def _trace_inner(self, f, *args): + phs = pytree.tree_map(lambda _: fx.PH, args) # type: ignore[attr-defined] + + def _wrap_fake(args: Tuple[Any]) -> Tuple[Any]: + arg_count = 0 + + def inner_wrap_fake(x): + nonlocal arg_count + # TODO: it would be nice to line these up with the names + # FX will choose for the placeholders, but we don't + # actually know what the names will be at this point yet + # NB: the Source here is actually meaningless + from torch._dynamo.source import ConstantSource + source = ConstantSource(f"input{arg_count}") + if isinstance(x, torch.Tensor): + arg_count += 1 + return self.fake_tensor_mode.from_tensor(x, source=source) # type: ignore[attr-defined] + # NB: don't match on bools + elif type(x) is int and self.tracing_mode == "symbolic": + return self.fake_tensor_mode.shape_env.create_symintnode( + self.fake_tensor_mode.shape_env.create_symbol(x, source, positive=None), + hint=x, + source=source + ) + elif isinstance(x, torch.ScriptObject): + return torch._library.fake_class_registry.to_fake_obj(self.fake_tensor_mode, x) + + assert not isinstance(x, FakeScriptObject), f"ScriptObject {x} has been fakified. Cannot wrap_fake it again." + return x + + wrap_fn_map = { + "real": lambda x: x, + "fake": inner_wrap_fake, + "symbolic": inner_wrap_fake, + } + return pytree.tree_map(wrap_fn_map[self.tracing_mode], args) + + def _wrap_func(f, phs): + if not hasattr(inspect.unwrap(f), '__code__') or inspect.unwrap(f).__code__.co_flags & inspect.CO_VARARGS: + # FX doesn't support varargs, so we gotta fake up a wrapper + # TODO: Would be nice to fix this at the source... + return fake_signature(f, len(phs)) + return f + + args = _wrap_fake(args) + func = _wrap_func(f, phs) # We disable the autocast cache as the autocast cache causes type conversions on parameters to # check a cache, which introduces untracked tensors into the graph # # We also disable tracing by any other tensor proxy-based tracers except the current. The # purpose of `make_fx` is to produce graphmodules as a side effect; its internal execution is # thus irrelevant to any external functional trace. 
- with decompose(decomposition_table), fake_tensor_mode, python_dispatcher_mode, proxy_function_mode, \ - sym_mode, torch_fn_metadata_mode, proxy_mode, disable_autocast_cache(): - t = dispatch_trace(wrap_key(func, args, fx_tracer, pre_dispatch), tracer=fx_tracer, concrete_args=tuple(phs)) + with decompose(self.decomposition_table), self.fake_tensor_mode, self.python_dispatcher_mode, self.proxy_function_mode, \ + self.proxy_mode.sym_mode, self.torch_fn_metadata_mode, \ + self.proxy_mode, disable_autocast_cache(), _set_make_fx_tracer(self): + t = dispatch_trace( + wrap_key(func, args, self.fx_tracer, self.pre_dispatch), + tracer=self.fx_tracer, + concrete_args=tuple(phs) + ) # TODO: kind of a bad way to do it, should maybe figure out a better way - if tracing_mode == "symbolic": - t.shape_env = shape_env # type: ignore[assignment] + if self.tracing_mode == "symbolic": + t.shape_env = self.fake_tensor_mode.shape_env # type: ignore[assignment] return t - return wrapped + def trace(self, f, *args) -> torch.fx.GraphModule: + with self._init_modes_from_inputs(f, args): + return self._trace_inner(f, *args) + + def trace_subgraph(self, f, *args): + # Create a new tracer based on parent's config + sub_tracer = _MakefxTracer( + self.decomposition_table, + self.tracing_mode, + self._allow_non_fake_inputs, + self.pre_dispatch, + self.record_module_stack, + self._allow_fake_constant, + self._error_on_data_dependent_ops + ) + with sub_tracer._init_modes_from_parent(self): + return sub_tracer._trace_inner(f, *args) + +_CURRENT_MAKE_FX_TRACER : Optional[_MakefxTracer] = None + +@contextmanager +def _set_make_fx_tracer(tracer: _MakefxTracer) -> None: + global _CURRENT_MAKE_FX_TRACER + prev_tracer = _CURRENT_MAKE_FX_TRACER + try: + _CURRENT_MAKE_FX_TRACER = tracer + yield + finally: + _CURRENT_MAKE_FX_TRACER = prev_tracer + +def make_fx( + f, + decomposition_table=None, + tracing_mode="real", + _allow_non_fake_inputs=False, + *, + pre_dispatch=False, + record_module_stack=False, + _allow_fake_constant=False, + _error_on_data_dependent_ops=True): + + assert tracing_mode in ["real", "fake", "symbolic"] + + make_fx_tracer = _MakefxTracer( + decomposition_table, + tracing_mode, + _allow_non_fake_inputs, + pre_dispatch, + record_module_stack, + _allow_fake_constant, + _error_on_data_dependent_ops + ) + + @functools.wraps(f) + def wrapped(*args): + return make_fx_tracer.trace(f, *args) + + return wrapped def get_torch_dispatch_modes(): return torch.utils._python_dispatch._get_current_dispatch_mode_stack() @@ -1310,3 +1436,22 @@ def get_isolated_graphmodule(func, args, kwargs, tracing_mode="real"): with disable_proxy_modes_tracing(): gm = make_fx(wrapped, tracing_mode=tracing_mode)(all_args) return gm + + +def _set_unbacked_bindings(out, out_proxy): + """A helper function for setting up unbacked_bindings on the destination FX graph.""" + from .symbolic_shapes import compute_unbacked_bindings + + # Can't use detect_fake_mode here, + # + # python test/distributed/_tensor/test_dtensor_compile.py -k + # test_tp_compile_fullgraph_is_seq_parallel_False + # + # will fail. Very strange, it probably isn't right for them to be using + # two fake modes there... 
+ fake_mode = torch._C._get_dispatch_mode( + torch._C._TorchDispatchModeKey.FAKE + ) + if fake_mode and fake_mode.shape_env: + if symbol_to_path := compute_unbacked_bindings(fake_mode.shape_env, out): + out_proxy.node.meta["unbacked_bindings"] = symbol_to_path diff --git a/torch/fx/experimental/sym_node.py b/torch/fx/experimental/sym_node.py index 8ec9b816beac9..98cba67a73a18 100644 --- a/torch/fx/experimental/sym_node.py +++ b/torch/fx/experimental/sym_node.py @@ -164,6 +164,25 @@ def maybe_as_int(self): else: return None + # NB: This does conversions, not sure if this is good or not + def maybe_as_float(self): + import sympy + + if isinstance(self.expr, sympy.Float): + return float(self.expr) + else: + return None + + def maybe_as_bool(self): + import sympy + + if self.expr is sympy.true: + return True + elif self.expr is sympy.false: + return False + else: + return None + def is_int(self): return self.pytype is int diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py index e44350276cbe9..918707399270d 100644 --- a/torch/fx/experimental/symbolic_shapes.py +++ b/torch/fx/experimental/symbolic_shapes.py @@ -83,6 +83,9 @@ class GuardOnDataDependentSymNode(RuntimeError): pass +class PendingUnbackedSymbolNotFound(RuntimeError): + pass + import sympy from sympy.printing.str import StrPrinter from sympy.printing.precedence import precedence, PRECEDENCE @@ -602,14 +605,19 @@ def free_unbacked_symbols_with_path( return r symbol_to_path = free_unbacked_symbols_with_path(example_value, ()) - assert not pending, ( - f"pending {pending} not in {example_value} " + - ( + if not peek and pending: + extra = ( repr((example_value.stride(), example_value.storage_offset())) if isinstance(example_value, torch.Tensor) else "" ) - ) + raise PendingUnbackedSymbolNotFound( + f"Pending unbacked symbols {pending} not in returned outputs {example_value} {extra}.\n" + "Did you accidentally call new_dynamic_size() or item() more times " + "than you needed to in your fake implementation?\n" + "For more help, see https://docs.google.com/document/d/1RWrH-3wLEpzR9kCS6gGBNen_-Fs-8PVbWWFE5AcgeWE/edit" + ) + # Why do we have to do some rebinding here? If the original FX node # wasn't a binding site because you had a memo hit, but post # translation you aren't a memo hit anymore, there's now a new binding @@ -3016,6 +3024,38 @@ def create_symintnode( out = SymInt(SymNode(sym, self, int, hint, fx_node=fx_node)) return out + @record_shapeenv_event() + def create_symfloatnode( + self, + sym: "sympy.Expr", + *, + hint: Optional[int], + source: Optional[Source] = None, + ): + """Create a SymFloat value from a symbolic expression""" + source_name = source.name() if source else None + + if self._translation_validation_enabled and source is not None: + # Create a new symbol for this source. + symbol = self._create_symbol_for_source(source) + assert symbol is not None + + # Create a new FX placeholder and Z3 variable for 'symbol'. + fx_node = self._create_fx_placeholder_and_z3var(symbol, float) + + # Add an equality assertion for the newly created symbol and 'sym'. 
+ self._add_assertion(sympy.Eq(symbol, sym)) + else: + fx_node = None + + if isinstance(sym, sympy.Float): + if hint is not None: + assert float(sym) == hint + out = float(sym) + else: + out = SymFloat(SymNode(sym, self, float, hint, fx_node=fx_node)) + return out + @record_shapeenv_event() def create_unspecified_symint_and_symbol(self, value, source, dynamic_dim): """Create a SymInt wrapping a new unspecified symbol""" @@ -3200,10 +3240,15 @@ def create_symbol( # If we're not duck shaping, we always create a new symbol # Even if we're duck shaping, if we haven't seen this particular # value before, we also create a new symbol - sympy_expr = make_symbol(SymT.SIZE, len(self.var_to_val), positive=positive, integer=True) + if type(val) is int: + sympy_expr = make_symbol(SymT.SIZE, len(self.var_to_val), positive=positive, integer=True) + else: + sympy_expr = make_symbol(SymT.FLOAT, len(self.var_to_val), positive=positive, real=True) # We always associate vars to vals if isinstance(val, int): self.var_to_val[sympy_expr] = sympy.Integer(val) + elif isinstance(val, float): + self.var_to_val[sympy_expr] = sympy.Float(val) else: # Only used for jagged layout nested tensors self.var_to_val[sympy_expr] = SingletonInt(val.node.nested_int(), coeff=val.node.nested_int_coeff()) @@ -3238,6 +3283,9 @@ def create_symbol( if val not in vr: raise ConstraintViolationError(f"{val} not in range [{vr.lower}, {vr.upper}]") + range_str = f"[{vr.lower}, {vr.upper}]" + elif isinstance(val, float): + self.var_to_range[sympy_expr] = vr = ValueRanges(-sympy.oo, sympy.oo) range_str = f"[{vr.lower}, {vr.upper}]" else: # Skip var_range logic for SingletonInt @@ -3384,7 +3432,7 @@ def _create_no_constraints_context(t): if context is None: input_contexts[i] = _create_no_constraints_context(t) else: - assert isinstance(t, (SymInt, int)) + assert isinstance(t, (SymInt, int, SymFloat, float)) assert not isinstance(context, list) # It took a lot of sweat to figure out the algorithm here. Let's @@ -3592,6 +3640,22 @@ def hint(s): ) record_constraint_violation(constraint.warn_only, self._debug_name(source), msg) + def track_symfloat(source, val): + log.debug("track_symfloat %s %s", LazyString(source.name), val) + assert not isinstance(val, SymFloat) or is_symbolic(val) + + if isinstance(val, SymFloat) and val.node.maybe_as_float() is not None: + val = val.node.maybe_as_float() + + if isinstance(val, SymFloat): + s = val.node.expr + if isinstance(s, sympy.Symbol): + symbol_to_source[s].append(source) + input_guards.append((source, s)) + else: + s = sympy.Float(val) + input_guards.append((source, s)) + for t, source, context in zip(placeholders, sources, input_contexts): if isinstance(source, str): from torch._dynamo.source import LocalSource @@ -3602,6 +3666,9 @@ def hint(s): if isinstance(t, (SymInt, int)): track_symint(source, t) continue + elif isinstance(t, (SymFloat, float)): + track_symfloat(source, t) + continue assert isinstance(t, Tensorlike) if is_traceable_wrapper_subclass(t): from torch._dynamo.source import AttrSource @@ -3788,7 +3855,6 @@ def issue_guard(guard: ShapeGuard) -> None: r = self.var_to_range[symbol] assert sources - assert symbol.is_integer bounds = [] if r.lower != -sympy.oo: if any(is_dim(source) for source in sources): @@ -3834,6 +3900,12 @@ def issue_guard(guard: ShapeGuard) -> None: self._debug_name(source), msg, ) + # We NaN specialize, which means similar to 0/1 specialization we + # should assume that the float is NOT nan. 
This is load bearing + # if you have something like an equality guard, nan will play + # merry hell with the reasoning. + if symbol_is_type(symbol, SymT.FLOAT): + exprs.append(f"not __math_isnan({source_ref(sources[0])})") if constraint_violations: warn_msgs = [] @@ -4498,6 +4570,7 @@ def _smart_symbol_sort(x): floor_div_atoms = lhs.atoms(FloorDiv).union(rhs.atoms(FloorDiv)) if len(floor_div_atoms) > 0 and any(a.divisor != 1 for a in floor_div_atoms): raise NotImplementedError + # Never replace unbacked symbols with other unbacked symbols. # This is error prone because you can cause references to # unbacked symbols to time travel backwards. E.g., @@ -4512,10 +4585,20 @@ def _smart_symbol_sort(x): # references u2 and u3 prior to them actually being bound at # runtime. It's pretty inconvenient to setup control # dependencies for substitutions, so ban it entirely. - if isinstance(lhs, sympy.Symbol) and free_unbacked_symbols(lhs) and not free_unbacked_symbols(rhs): - # short-circuit when no solving is needed + def trivial_solve(lhs, rhs): + if isinstance(lhs, sympy.Symbol): + if free_unbacked_symbols(lhs) and not free_unbacked_symbols(rhs): + return True + if symbol_is_type(lhs, SymT.FLOAT): + return True + # TODO: Maybe trivial solutions for int should also be + # done? + return False + + # short-circuit when no solving is needed + if trivial_solve(lhs, rhs): self._set_replacement(lhs, self._find(rhs), "trivial_lhs") - elif isinstance(rhs, sympy.Symbol) and free_unbacked_symbols(rhs) and not free_unbacked_symbols(lhs): + elif trivial_solve(rhs, lhs): self._set_replacement(rhs, self._find(lhs), "trivial_rhs") else: r = try_solve(expr, free[0], floordiv_inequality=False) @@ -4791,21 +4874,11 @@ def compute_concrete_val(): # Turn this into a boolean expression, no longer need to consult # concrete_val - suppress_maybe_guard_rel = False if concrete_val is sympy.true: g = expr elif concrete_val is sympy.false: g = sympy.Not(expr) else: - # WARNING: we cannot actually do simplifications on guards - # on floating point values, because Sympy generally does not - # think expressions on integers can ever be equal to floating - # point (e.g., sympy.Eq(s0/6, 0.5) evaluates to False). Without - # very clear algebraic laws that hold for floating point, such - # simplifications are error prone anyway, so be sure not to - # maybe_guard_rel in those cases. 
- if not isinstance(concrete_val, sympy.Integer): - suppress_maybe_guard_rel = True g = sympy.Eq(expr, concrete_val) # type: ignore[arg-type] if isinstance(g, sympy.Rel): diff --git a/torch/fx/passes/runtime_assert.py b/torch/fx/passes/runtime_assert.py index 870348af6f69e..0d45defe8a48c 100644 --- a/torch/fx/passes/runtime_assert.py +++ b/torch/fx/passes/runtime_assert.py @@ -120,12 +120,13 @@ def add_runtime_asserts(ras): ), ) - for node in graph.nodes: + nodes = list(graph.nodes) + for i, node in enumerate(nodes[:-1]): # Placeholders can match symbols, but when we destructure them # with size we have to make sure we insert the nodes after all # the placeholders with graph.inserting_before( - node.next if node not in placeholders else last_placeholder.next + nodes[i + 1] if node not in placeholders else last_placeholder.next ): # Unfortunately, this logic still must remain because manual # make_fx calls may not explicitly bind all symbolic ints as @@ -150,12 +151,24 @@ def match_symbol(symint, cb): match_symbol(example_value, lambda: node) if isinstance(t := example_value, torch.Tensor): for i, s in enumerate(t.size()): - match_symbol(s, lambda: graph.call_method("size", (node, i))) + match_symbol( + s, + lambda: graph.call_function( + torch.ops.aten.sym_size.int, (node, i) + ), + ) for i, s in enumerate(t.stride()): - match_symbol(s, lambda: graph.call_method("stride", (node, i))) + match_symbol( + s, + lambda: graph.call_function( + torch.ops.aten.sym_stride.int, (node, i) + ), + ) match_symbol( t.storage_offset(), - lambda: graph.call_method("storage_offset", (node,)), + lambda: graph.call_function( + torch.ops.aten.sym_storage_offset.default, (node,) + ), ) # Handle asserts that aren't associated with any symbol. This diff --git a/torch/library.h b/torch/library.h index c38179a6eea1d..3c1d0c415106f 100644 --- a/torch/library.h +++ b/torch/library.h @@ -299,9 +299,9 @@ class TORCH_API CppFunction final { } private: - c10::optional dispatch_key_; + std::optional dispatch_key_; c10::KernelFunction func_; - c10::optional cpp_signature_; + std::optional cpp_signature_; std::unique_ptr schema_; std::string debug_; @@ -316,7 +316,7 @@ class TORCH_API CppFunction final { CppFunction( c10::KernelFunction func, - c10::optional cpp_signature, + std::optional cpp_signature, std::unique_ptr schema); }; @@ -555,7 +555,7 @@ class TORCH_API Library final { Library( Kind kind, std::string ns, - c10::optional k, + std::optional k, const char* file, uint32_t line); @@ -847,9 +847,9 @@ class TORCH_API Library final { private: Kind kind_; - c10::optional ns_; - c10::optional dispatch_key_; - c10::optional> python_module_; + std::optional ns_; + std::optional dispatch_key_; + std::optional> python_module_; const char* file_; uint32_t line_; @@ -889,7 +889,7 @@ class TorchLibraryInit final { Library::Kind kind, InitFn* fn, const char* ns, - c10::optional k, + std::optional k, const char* file, uint32_t line) : lib_(kind, ns, k, file, line) { diff --git a/torch/nn/attention/__init__.py b/torch/nn/attention/__init__.py index fc4835f046e6c..039d76a32f4b0 100644 --- a/torch/nn/attention/__init__.py +++ b/torch/nn/attention/__init__.py @@ -6,6 +6,8 @@ from torch.backends.cuda import ( can_use_efficient_attention, can_use_flash_attention, + cudnn_sdp_enabled, + enable_cudnn_sdp, enable_flash_sdp, enable_math_sdp, enable_mem_efficient_sdp, @@ -99,19 +101,23 @@ def sdpa_kernel(backends: Union[List[SDPBackend], SDPBackend]): backends = [backends] backends = set(backends) + previous_cudnn: bool = cudnn_sdp_enabled() 
previous_flash: bool = flash_sdp_enabled() previous_mem_efficient: bool = mem_efficient_sdp_enabled() previous_math: bool = math_sdp_enabled() try: + enable_cudnn = SDPBackend.CUDNN_ATTENTION in backends enable_flash = SDPBackend.FLASH_ATTENTION in backends enable_mem_efficient = SDPBackend.EFFICIENT_ATTENTION in backends enable_math = SDPBackend.MATH in backends + enable_cudnn_sdp(enable_cudnn) enable_flash_sdp(enable_flash) enable_mem_efficient_sdp(enable_mem_efficient) enable_math_sdp(enable_math) yield {} finally: + enable_cudnn_sdp(previous_cudnn) enable_flash_sdp(previous_flash) enable_mem_efficient_sdp(previous_mem_efficient) enable_math_sdp(previous_math) diff --git a/torch/nn/attention/_flex_attention.py b/torch/nn/attention/_flex_attention.py index ee131dfac8524..1acfab57a62ce 100644 --- a/torch/nn/attention/_flex_attention.py +++ b/torch/nn/attention/_flex_attention.py @@ -83,6 +83,10 @@ def score_mod( """ if torch.compiler.is_dynamo_compiling(): + # mark head_dim and dim always to be static + for x in [query, key, value]: + torch._dynamo.mark_static(x, 1) + torch._dynamo.mark_static(x, -1) out, _ = flex_attention_hop(query, key, value, score_mod) return out diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py index a4dc9a17089c5..f5206d425b4d8 100644 --- a/torch/onnx/utils.py +++ b/torch/onnx/utils.py @@ -350,11 +350,6 @@ def export( %3 : Float = onnx::Mul(%2, %0) return (%3) - If PyTorch was built with Caffe2 (i.e. with ``BUILD_CAFFE2=1``), then - Caffe2-specific behavior will be enabled, including special support - for ops are produced by the modules described in - `Quantization `_. - .. warning:: Models exported this way are probably runnable only by Caffe2. @@ -1802,9 +1797,8 @@ def _add_output_to_block(block: _C.Block, value: _C.Value) -> int: def _should_aten_fallback( name: str, opset_version: int, operator_export_type: _C_onnx.OperatorExportTypes ): - # For BUILD_CAFFE2=0 builds, if domain=="aten" and operator_export_type==ONNX_ATEN, + # For all builds, if domain=="aten" and operator_export_type==ONNX_ATEN, # an aten::ATen operator is created regardless of symbolics existence - # For BUILD_CAFFE2=1, the same applies only if there is no symbolic available is_exportable_aten_op = registration.registry.is_registered_op(name, opset_version) is_onnx_aten_export = operator_export_type == _C_onnx.OperatorExportTypes.ONNX_ATEN diff --git a/torch/optim/__init__.py b/torch/optim/__init__.py index 5e836b4047ddf..58d9c948416b8 100644 --- a/torch/optim/__init__.py +++ b/torch/optim/__init__.py @@ -22,17 +22,17 @@ from .sgd import SGD from .sparse_adam import SparseAdam -del adadelta # noqa: F821 -del adagrad # noqa: F821 -del adam # noqa: F821 -del adamw # noqa: F821 -del sparse_adam # noqa: F821 -del adamax # noqa: F821 -del asgd # noqa: F821 -del sgd # noqa: F821 -del radam # noqa: F821 -del rprop # noqa: F821 -del rmsprop # noqa: F821 -del optimizer # noqa: F821 -del nadam # noqa: F821 -del lbfgs # noqa: F821 +del adadelta # type: ignore[name-defined] # noqa: F821 +del adagrad # type: ignore[name-defined] # noqa: F821 +del adam # type: ignore[name-defined] # noqa: F821 +del adamw # type: ignore[name-defined] # noqa: F821 +del sparse_adam # type: ignore[name-defined] # noqa: F821 +del adamax # type: ignore[name-defined] # noqa: F821 +del asgd # type: ignore[name-defined] # noqa: F821 +del sgd # type: ignore[name-defined] # noqa: F821 +del radam # type: ignore[name-defined] # noqa: F821 +del rprop # type: ignore[name-defined] # noqa: F821 +del rmsprop # type: ignore[name-defined] 
# noqa: F821 +del optimizer # type: ignore[name-defined] # noqa: F821 +del nadam # type: ignore[name-defined] # noqa: F821 +del lbfgs # type: ignore[name-defined] # noqa: F821 diff --git a/torch/optim/__init__.pyi b/torch/optim/__init__.pyi deleted file mode 100644 index 8d35bab14c207..0000000000000 --- a/torch/optim/__init__.pyi +++ /dev/null @@ -1,15 +0,0 @@ -from . import lr_scheduler as lr_scheduler, swa_utils as swa_utils -from .adadelta import Adadelta as Adadelta -from .adagrad import Adagrad as Adagrad -from .adam import Adam as Adam -from .adamax import Adamax as Adamax -from .adamw import AdamW as AdamW -from .asgd import ASGD as ASGD -from .lbfgs import LBFGS as LBFGS -from .nadam import NAdam as NAdam -from .optimizer import Optimizer as Optimizer -from .radam import RAdam as RAdam -from .rmsprop import RMSprop as RMSprop -from .rprop import Rprop as Rprop -from .sgd import SGD as SGD -from .sparse_adam import SparseAdam as SparseAdam diff --git a/torch/optim/adadelta.py b/torch/optim/adadelta.py index b9fcafbbcd9aa..097c8040b63e1 100644 --- a/torch/optim/adadelta.py +++ b/torch/optim/adadelta.py @@ -9,6 +9,7 @@ _differentiable_doc, _disable_dynamo_if_unsupported, _foreach_doc, + _get_capturable_supported_devices, _get_scalar_dtype, _maximize_doc, _use_grad_for_differentiable, @@ -24,10 +25,10 @@ class Adadelta(Optimizer): def __init__( self, params: ParamsT, - lr=1.0, - rho=0.9, - eps=1e-6, - weight_decay=0, + lr: float = 1.0, + rho: float = 0.9, + eps: float = 1e-6, + weight_decay: float = 0, foreach: Optional[bool] = None, *, capturable: bool = False, @@ -254,9 +255,14 @@ def _single_tensor_adadelta( ): # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: + capturable_supported_devices = _get_capturable_supported_devices( + supports_xla=False + ) assert all( - p.is_cuda and step.is_cuda for p, step in zip(params, state_steps) - ), "If capturable=True, params and state_steps must be CUDA tensors." + p.device.type == step.device.type + and p.device.type in capturable_supported_devices + for p, step in zip(params, state_steps) + ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}." for param, grad, square_avg, acc_delta, step in zip( params, grads, square_avgs, acc_deltas, state_steps @@ -305,9 +311,14 @@ def _multi_tensor_adadelta( # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: + capturable_supported_devices = _get_capturable_supported_devices( + supports_xla=False + ) assert all( - p.is_cuda and step.is_cuda for p, step in zip(params, state_steps) - ), "If capturable=True, params and state_steps must be CUDA tensors." + p.device.type == step.device.type + and p.device.type in capturable_supported_devices + for p, step in zip(params, state_steps) + ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}." 
if len(params) == 0: return diff --git a/torch/optim/adagrad.py b/torch/optim/adagrad.py index 8dcfd75001120..0ed8acfac1c61 100644 --- a/torch/optim/adagrad.py +++ b/torch/optim/adagrad.py @@ -2,7 +2,7 @@ import torch from torch import Tensor -from torch.utils._foreach_utils import _get_fused_kernels_supported_devices + from .optimizer import ( _default_to_fused_or_foreach, _differentiable_doc, @@ -23,16 +23,15 @@ class Adagrad(Optimizer): def __init__( self, params: ParamsT, - lr=1e-2, - lr_decay=0, - weight_decay=0, - initial_accumulator_value=0, - eps=1e-10, + lr: float = 1e-2, + lr_decay: float = 0, + weight_decay: float = 0, + initial_accumulator_value: float = 0, + eps: float = 1e-10, foreach: Optional[bool] = None, *, maximize: bool = False, differentiable: bool = False, - fused: Optional[bool] = None, ): if not 0.0 <= lr: raise ValueError(f"Invalid learning rate: {lr}") @@ -56,41 +55,13 @@ def __init__( foreach=foreach, maximize=maximize, differentiable=differentiable, - fused=fused, ) super().__init__(params, defaults) - if fused: - if differentiable: - raise RuntimeError("`fused` does not support `differentiable`") - self._step_supports_amp_scaling = True - fused_supported_devices = _get_fused_kernels_supported_devices() - # Not support CUDA yet - fused_supported_devices.remove("cuda") - if not all( - p.device.type in fused_supported_devices and torch.is_floating_point(p) - for pg in self.param_groups - for p in pg["params"] - ): - raise RuntimeError( - "`fused=True` requires all the params to be floating point Tensors of " - f"supported devices: {fused_supported_devices}." - ) - if foreach: - raise RuntimeError("`fused` and `foreach` cannot be `True` together.") - for group in self.param_groups: for p in group["params"]: state = self.state[p] - state["step"] = ( - torch.zeros( - (), - dtype=_get_scalar_dtype(is_fused=group["fused"]), - device=p.device, - ) - if group["fused"] - else torch.tensor(0.0, dtype=_get_scalar_dtype()) - ) + state["step"] = torch.tensor(0.0, dtype=_get_scalar_dtype()) init_value = ( complex(initial_accumulator_value, initial_accumulator_value) if torch.is_complex(p) @@ -102,14 +73,10 @@ def __init__( def __setstate__(self, state): super().__setstate__(state) - # define "fused" for - # MYPY error: Name "fused" may be undefined - fused = None for group in self.param_groups: group.setdefault("foreach", None) group.setdefault("maximize", False) group.setdefault("differentiable", False) - fused = group.setdefault("fused", None) state_values = list(self.state.values()) step_is_tensor = (len(state_values) != 0) and torch.is_tensor( @@ -117,9 +84,7 @@ def __setstate__(self, state): ) if not step_is_tensor: for s in state_values: - s["step"] = torch.tensor( - float(s["step"]), dtype=_get_scalar_dtype(is_fused=fused) - ) + s["step"] = torch.tensor(float(s["step"]), dtype=_get_scalar_dtype()) def share_memory(self): for group in self.param_groups: @@ -179,9 +144,6 @@ def step(self, closure=None): maximize=group["maximize"], differentiable=group["differentiable"], has_complex=has_complex, - fused=group["fused"], - grad_scale=getattr(self, "grad_scale", None), - found_inf=getattr(self, "found_inf", None), ) return loss @@ -228,10 +190,7 @@ def step(self, closure=None): {_foreach_doc} {_maximize_doc} {_differentiable_doc} - fused (bool, optional): whether the fused implementation (CPU only) is used. - Currently, `torch.float64`, `torch.float32`, `torch.float16`, and `torch.bfloat16` - are supported. (default: None). 
Please note that the fused implementations does not - support sparse or complex gradients. + .. _Adaptive Subgradient Methods for Online Learning and Stochastic Optimization: http://jmlr.org/papers/v12/duchi11a.html @@ -244,9 +203,6 @@ def adagrad( grads: List[Tensor], state_sums: List[Tensor], state_steps: List[Tensor], - fused: Optional[bool] = None, - grad_scale: Optional[Tensor] = None, - found_inf: Optional[Tensor] = None, # kwonly args with defaults are not supported by functions compiled with torchscript issue #70627 # setting these as kwargs for now as functional API is compiled by torch/distributed/optim has_sparse_grad: bool = False, @@ -269,28 +225,15 @@ def adagrad( "API has changed, `state_steps` argument must contain a list of singleton tensors" ) - # Respect when the user inputs False/True for foreach or fused. We only want to change - # the default when neither have been user-specified. Note that we default to foreach - # and pass False to use_fused. This is not a mistake--we want to give the fused impl - # bake-in time before making it the default, even if it is typically faster. - if fused is None and foreach is None: + if foreach is None: _, foreach = _default_to_fused_or_foreach( params, differentiable, use_fused=False ) - if fused is None: - fused = False - if foreach is None: - foreach = False - if foreach and torch.jit.is_scripting(): raise RuntimeError("torch.jit.script not supported with foreach optimizers") - if fused and torch.jit.is_scripting(): - raise RuntimeError("torch.jit.script not supported with fused optimizers") - if fused and not torch.jit.is_scripting(): - func = _fused_adagrad - elif foreach and not torch.jit.is_scripting(): + if foreach and not torch.jit.is_scripting(): func = _multi_tensor_adagrad else: func = _single_tensor_adagrad @@ -308,8 +251,6 @@ def adagrad( maximize=maximize, differentiable=differentiable, has_complex=has_complex, - grad_scale=grad_scale, - found_inf=found_inf, ) @@ -325,8 +266,6 @@ def _single_tensor_adagrad( grads: List[Tensor], state_sums: List[Tensor], state_steps: List[Tensor], - grad_scale: Optional[Tensor], - found_inf: Optional[Tensor], *, lr: float, weight_decay: float, @@ -337,7 +276,6 @@ def _single_tensor_adagrad( differentiable: bool, has_complex: bool, ): - assert grad_scale is None and found_inf is None for param, grad, state_sum, step_t in zip(params, grads, state_sums, state_steps): # update step step_t += 1 @@ -386,8 +324,6 @@ def _multi_tensor_adagrad( grads: List[Tensor], state_sums: List[Tensor], state_steps: List[Tensor], - grad_scale: Optional[Tensor], - found_inf: Optional[Tensor], *, lr: float, weight_decay: float, @@ -399,7 +335,6 @@ def _multi_tensor_adagrad( has_complex: bool, ): assert not differentiable, "_foreach ops don't support autograd" - assert grad_scale is None and found_inf is None # Foreach functions will throw errors if given empty lists if len(params) == 0: @@ -432,8 +367,6 @@ def _multi_tensor_adagrad( maximize=maximize, differentiable=differentiable, has_complex=has_complex, - grad_scale=grad_scale, - found_inf=found_inf, ) continue @@ -481,76 +414,3 @@ def _multi_tensor_adagrad( numerator = torch._foreach_mul(device_grads, minus_clr) # type: ignore[assignment] torch._foreach_addcdiv_(device_params, numerator, std) - - -def _fused_adagrad( - params: List[Tensor], - grads: List[Tensor], - state_sums: List[Tensor], - state_steps: List[Tensor], - grad_scale: Optional[Tensor], - found_inf: Optional[Tensor], - *, - lr: float, - weight_decay: float, - lr_decay: float, - eps: float, - 
has_sparse_grad: bool, - maximize: bool, - differentiable: bool, - has_complex: bool, -) -> None: - if not params: - return - if has_sparse_grad or has_complex: - raise RuntimeError("`fused` does not support sparse grad or complex param") - - if differentiable: - raise RuntimeError( - "adagrad with fused=True does not support differentiable=True" - ) - - grad_scale_dict = ( - {grad_scale.device: grad_scale} if grad_scale is not None else None - ) - found_inf_dict = {found_inf.device: found_inf} if found_inf is not None else None - - grouped_tensors = Optimizer._group_tensors_by_device_and_dtype( - [params, grads, state_sums, state_steps] - ) - for (device, _), ( - ( - device_params, - device_grads, - device_state_sums, - device_state_steps, - ), - _, - ) in grouped_tensors.items(): - device_grad_scale, device_found_inf = None, None - if grad_scale is not None and grad_scale_dict is not None: - if device not in grad_scale_dict: - grad_scale_dict[device] = grad_scale.to(device, non_blocking=True) # type: ignore[index] - device_grad_scale = grad_scale_dict[device] # type: ignore[index] - if found_inf is not None and found_inf_dict is not None: - if found_inf not in found_inf_dict: - found_inf_dict[device] = found_inf.to(device, non_blocking=True) # type: ignore[index] - device_found_inf = found_inf_dict[device] # type: ignore[index] - torch._foreach_add_(device_state_steps, 1) - torch._fused_adagrad_( - device_params, - device_grads, - device_state_sums, - device_state_steps, - lr=lr, - lr_decay=lr_decay, - weight_decay=weight_decay, - eps=eps, - maximize=maximize, - grad_scale=device_grad_scale, - found_inf=device_found_inf, - ) - if device_found_inf is not None: - torch._foreach_sub_( - device_state_steps, [device_found_inf] * len(device_state_steps) - ) diff --git a/torch/optim/adam.py b/torch/optim/adam.py index 04c93989576b0..fba4b2027b05d 100644 --- a/torch/optim/adam.py +++ b/torch/optim/adam.py @@ -11,12 +11,14 @@ _dispatch_sqrt, _foreach_doc, _fused_doc, + _get_capturable_supported_devices, _get_scalar_dtype, _get_value, _maximize_doc, _stack_if_compiling, _use_grad_for_differentiable, _view_as_real, + DeviceDict, Optimizer, ParamsT, ) @@ -202,12 +204,12 @@ def step(self, closure=None): loss = closure() for group in self.param_groups: - params_with_grad = [] - grads = [] - exp_avgs = [] - exp_avg_sqs = [] - max_exp_avg_sqs = [] - state_steps = [] + params_with_grad: List[Tensor] = [] + grads: List[Tensor] = [] + exp_avgs: List[Tensor] = [] + exp_avg_sqs: List[Tensor] = [] + max_exp_avg_sqs: List[Tensor] = [] + state_steps: List[Tensor] = [] beta1, beta2 = group["betas"] has_complex = self._init_group( @@ -352,9 +354,11 @@ def _single_tensor_adam( # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: - assert (param.is_cuda and step_t.is_cuda) or ( - param.is_xla and step_t.is_xla - ), "If capturable=True, params and state_steps must be CUDA or XLA tensors." + capturable_supported_devices = _get_capturable_supported_devices() + assert ( + param.device.type == step_t.device.type + and param.device.type in capturable_supported_devices + ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}." 
# update step step_t += 1 @@ -463,9 +467,14 @@ def _multi_tensor_adam( # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: + capturable_supported_devices = _get_capturable_supported_devices( + supports_xla=False + ) assert all( - p.is_cuda and step.is_cuda for p, step in zip(params, state_steps) - ), "If capturable=True, params and state_steps must be CUDA tensors." + p.device.type == step.device.type + and p.device.type in capturable_supported_devices + for p, step in zip(params, state_steps) + ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}." assert grad_scale is None and found_inf is None @@ -498,7 +507,7 @@ def _multi_tensor_adam( ) if maximize: - device_grads = torch._foreach_neg(device_grads) + device_grads = torch._foreach_neg(device_grads) # type: ignore[assignment] # Update steps # If steps are on CPU, foreach will fall back to the slow path, which is a for-loop calling t.add(1) over @@ -516,7 +525,7 @@ def _multi_tensor_adam( if maximize: torch._foreach_add_(device_grads, device_params, alpha=weight_decay) else: - device_grads = torch._foreach_add( + device_grads = torch._foreach_add( # type: ignore[assignment] device_grads, device_params, alpha=weight_decay ) @@ -531,6 +540,9 @@ def _multi_tensor_adam( # Delete the local intermediate since it won't be used anymore to save on peak memory del device_grads + bias_correction1: Union[Tuple[Tensor, ...], List[Tensor]] + bias_correction2: Union[Tuple[Tensor, ...], List[Tensor]] + bias_correction2_sqrt: Union[Tuple[Tensor, ...], List[Tensor]] if capturable: bias_correction1 = torch._foreach_pow(beta1, device_state_steps) bias_correction2 = torch._foreach_pow(beta2, device_state_steps) @@ -577,7 +589,7 @@ def _multi_tensor_adam( step_size = _stack_if_compiling([(lr / bc) * -1 for bc in bias_correction1]) - bias_correction2_sqrt = [_dispatch_sqrt(bc) for bc in bias_correction2] + bias_correction2_sqrt = [_dispatch_sqrt(bc) for bc in bias_correction2] # type: ignore[arg-type] if amsgrad: # Maintains the maximum of all 2nd moment running avg. till now @@ -591,7 +603,7 @@ def _multi_tensor_adam( torch._foreach_div_(exp_avg_sq_sqrt, bias_correction2_sqrt) torch._foreach_add_(exp_avg_sq_sqrt, eps) torch._foreach_addcdiv_( - device_params, device_exp_avgs, exp_avg_sq_sqrt, step_size + device_params, device_exp_avgs, exp_avg_sq_sqrt, step_size # type: ignore[arg-type] ) @@ -621,17 +633,18 @@ def _fused_adam( if differentiable: raise RuntimeError("Adam with fused=True does not support differentiable=True") - grad_scale_dict = ( - {grad_scale.device: grad_scale} if grad_scale is not None else None + grad_scale_dict: DeviceDict = ( + {grad_scale.device: grad_scale} if grad_scale is not None else {} + ) + found_inf_dict: DeviceDict = ( + {found_inf.device: found_inf} if found_inf is not None else {} ) - found_inf_dict = {found_inf.device: found_inf} if found_inf is not None else None # We only shuffle around the lr when it is a Tensor and on CUDA, otherwise, we prefer # treating it as a scalar. 
- lr_dict = ( + lr_dict: Optional[DeviceDict] = ( {lr.device: lr} if isinstance(lr, Tensor) and str(lr.device) != "cpu" else None ) - grouped_tensors = Optimizer._group_tensors_by_device_and_dtype( [params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps] ) @@ -648,15 +661,15 @@ def _fused_adam( ) in grouped_tensors.items(): device_grad_scale, device_found_inf = None, None if grad_scale is not None: - if device not in grad_scale_dict: - grad_scale_dict[device] = grad_scale.to(device, non_blocking=True) - device_grad_scale = grad_scale_dict[device] + device_grad_scale = grad_scale_dict.setdefault( + device, grad_scale.to(device, non_blocking=True) + ) if found_inf is not None: - if found_inf not in found_inf_dict: - found_inf_dict[device] = found_inf.to(device, non_blocking=True) - device_found_inf = found_inf_dict[device] + device_found_inf = found_inf_dict.setdefault( + device, found_inf.to(device, non_blocking=True) + ) if lr_dict is not None and device not in lr_dict: - lr_dict[device] = lr.to(device=device, non_blocking=True) + lr_dict[device] = lr.to(device=device, non_blocking=True) # type: ignore[union-attr] lr = lr_dict[device] torch._foreach_add_(device_state_steps, 1) torch._fused_adam_( diff --git a/torch/optim/adam.pyi b/torch/optim/adam.pyi deleted file mode 100644 index aef8ed69a9c99..0000000000000 --- a/torch/optim/adam.pyi +++ /dev/null @@ -1,22 +0,0 @@ -from typing import Optional, Tuple, Union - -from torch import Tensor - -from .optimizer import Optimizer, ParamsT - -class Adam(Optimizer): - def __init__( - self, - params: ParamsT, - lr: Union[float, Tensor] = 1e-3, - betas: Tuple[float, float] = (0.9, 0.999), - eps: float = 1e-8, - weight_decay: float = 0, - amsgrad: bool = False, - *, - foreach: Optional[bool] = None, - maximize: bool = False, - capturable: bool = False, - differentiable: bool = False, - fused: Optional[bool] = None, - ) -> None: ... 
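The `DeviceDict` caching used in `_fused_adam` above replaces the explicit `if device not in dict` membership checks with `dict.setdefault`, so each scaling tensor is moved to a given device at most once and reused afterwards. A minimal sketch of the same pattern, outside the optimizer internals (the names `to_device` and `cache` are illustrative, not part of the patch):

    from typing import Dict, Optional

    import torch

    # DeviceDict as defined by this patch in torch/optim/optimizer.py.
    DeviceDict = Dict[Optional[torch.device], torch.Tensor]

    def to_device(cache: DeviceDict, t: torch.Tensor, device: torch.device) -> torch.Tensor:
        # First request for `device` stores a copy; later requests reuse the cached entry.
        return cache.setdefault(device, t.to(device, non_blocking=True))

    grad_scale = torch.tensor(2.0)
    cache: DeviceDict = {grad_scale.device: grad_scale}
    cpu = torch.device("cpu")
    # Repeated lookups hand back the same cached tensor object.
    assert to_device(cache, grad_scale, cpu) is to_device(cache, grad_scale, cpu)

Note that, as in the patch, the candidate copy is evaluated eagerly and simply discarded when the device is already cached; the win is the simpler control flow, not lazy conversion.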
diff --git a/torch/optim/adamax.py b/torch/optim/adamax.py index 6fa335de4d8b8..8af468ba83869 100644 --- a/torch/optim/adamax.py +++ b/torch/optim/adamax.py @@ -1,4 +1,4 @@ -from typing import List, Optional +from typing import List, Optional, Tuple, Union import torch from torch import Tensor @@ -9,12 +9,14 @@ _differentiable_doc, _disable_dynamo_if_unsupported, _foreach_doc, + _get_capturable_supported_devices, _get_scalar_dtype, _get_value, _maximize_doc, _use_grad_for_differentiable, _view_as_real, Optimizer, + ParamsT, ) __all__ = ["Adamax", "adamax"] @@ -23,11 +25,11 @@ class Adamax(Optimizer): def __init__( self, - params, - lr=2e-3, - betas=(0.9, 0.999), - eps=1e-8, - weight_decay=0, + params: ParamsT, + lr: float = 2e-3, + betas: Tuple[float, float] = (0.9, 0.999), + eps: float = 1e-8, + weight_decay: float = 0, foreach: Optional[bool] = None, *, maximize: bool = False, @@ -127,11 +129,11 @@ def step(self, closure=None): loss = closure() for group in self.param_groups: - params_with_grad = [] - grads = [] - exp_avgs = [] - exp_infs = [] - state_steps = [] + params_with_grad: List[Tensor] = [] + grads: List[Tensor] = [] + exp_avgs: List[Tensor] = [] + exp_infs: List[Tensor] = [] + state_steps: List[Tensor] = [] beta1, beta2 = group["betas"] eps = group["eps"] @@ -242,9 +244,11 @@ def _single_tensor_adamax( # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: - assert (param.is_cuda and step_t.is_cuda) or ( - param.is_xla and step_t.is_xla - ), "If capturable=True, params and state_steps must be CUDA or XLA tensors." + capturable_supported_devices = _get_capturable_supported_devices() + assert ( + param.device.type == step_t.device.type + and param.device.type in capturable_supported_devices + ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}." # update step step_t += 1 @@ -295,11 +299,11 @@ def _multi_tensor_adamax( exp_infs: List[Tensor], state_steps: List[Tensor], *, + eps: float, beta1: float, beta2: float, lr: float, weight_decay: float, - eps: float, maximize: bool, differentiable: bool, capturable: bool, @@ -311,14 +315,15 @@ def _multi_tensor_adamax( return # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] - if ( - not torch._utils.is_compiling() - and capturable - and not all(p.is_cuda and step.is_cuda for p, step in zip(params, state_steps)) - ): - raise RuntimeError( - "If capturable=True, params and state_steps must be CUDA tensors." + if not torch._utils.is_compiling() and capturable: + capturable_supported_devices = _get_capturable_supported_devices( + supports_xla=False ) + assert all( + p.device.type == step.device.type + and p.device.type in capturable_supported_devices + for p, step in zip(params, state_steps) + ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}." 
grouped_tensors = Optimizer._group_tensors_by_device_and_dtype( [params, grads, exp_avgs, exp_infs, state_steps] @@ -336,7 +341,7 @@ def _multi_tensor_adamax( ) if maximize: - grouped_grads = torch._foreach_neg(grouped_grads) + grouped_grads = torch._foreach_neg(grouped_grads) # type: ignore[assignment] # Update steps # If steps are on CPU, foreach will fall back to the slow path, which is a for-loop calling t.add(1) over @@ -354,7 +359,7 @@ def _multi_tensor_adamax( # Re-use the intermediate memory (grouped_grads) already allocated for maximize torch._foreach_add_(grouped_grads, grouped_params, alpha=weight_decay) else: - grouped_grads = torch._foreach_add( + grouped_grads = torch._foreach_add( # type: ignore[assignment] grouped_grads, grouped_params, alpha=weight_decay ) @@ -367,13 +372,14 @@ def _multi_tensor_adamax( # in this case, we need to introduce a copy of the grads # since one has not been introduced previously if not maximize and weight_decay == 0: - grouped_grads = torch._foreach_abs(grouped_grads) + grouped_grads = torch._foreach_abs(grouped_grads) # type: ignore[assignment] else: torch._foreach_abs_(grouped_grads) torch._foreach_add_(grouped_grads, eps) torch._foreach_maximum_(grouped_exp_infs, grouped_grads) + bias_corrections: Union[Tuple[Tensor, ...], List[Tensor]] if capturable: bias_corrections = torch._foreach_pow(beta1, grouped_state_steps) # foreach_sub doesn't allow a scalar as the first arg diff --git a/torch/optim/adamax.pyi b/torch/optim/adamax.pyi deleted file mode 100644 index d38cfaefe388c..0000000000000 --- a/torch/optim/adamax.pyi +++ /dev/null @@ -1,13 +0,0 @@ -from typing import Tuple - -from .optimizer import Optimizer, ParamsT - -class Adamax(Optimizer): - def __init__( - self, - params: ParamsT, - lr: float = ..., - betas: Tuple[float, float] = ..., - eps: float = ..., - weight_decay: float = ..., - ) -> None: ... diff --git a/torch/optim/adamw.py b/torch/optim/adamw.py index aa46c7a537e77..e58b28244083a 100644 --- a/torch/optim/adamw.py +++ b/torch/optim/adamw.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Tuple, Union +from typing import cast, List, Optional, Tuple, Union import torch from torch import Tensor @@ -11,12 +11,14 @@ _dispatch_sqrt, _foreach_doc, _fused_doc, + _get_capturable_supported_devices, _get_scalar_dtype, _get_value, _maximize_doc, _stack_if_compiling, _use_grad_for_differentiable, _view_as_real, + DeviceDict, Optimizer, ParamsT, ) @@ -201,14 +203,14 @@ def step(self, closure=None): loss = closure() for group in self.param_groups: - params_with_grad = [] - grads = [] - exp_avgs = [] - exp_avg_sqs = [] - max_exp_avg_sqs = [] - state_steps = [] - amsgrad = group["amsgrad"] - beta1, beta2 = group["betas"] + params_with_grad: List[Tensor] = [] + grads: List[Tensor] = [] + exp_avgs: List[Tensor] = [] + exp_avg_sqs: List[Tensor] = [] + max_exp_avg_sqs: List[Tensor] = [] + state_steps: List[Tensor] = [] + amsgrad: bool = group["amsgrad"] + beta1, beta2 = cast(Tuple[float, float], group["betas"]) has_complex = self._init_group( group, @@ -353,9 +355,11 @@ def _single_tensor_adamw( # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: - assert (param.is_cuda and step_t.is_cuda) or ( - param.is_xla and step_t.is_xla - ), "If capturable=True, params and state_steps must be CUDA or XLA tensors." 
+ capturable_supported_devices = _get_capturable_supported_devices() + assert ( + param.device.type == step_t.device.type + and param.device.type in capturable_supported_devices + ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}." if torch.is_complex(param): grad = torch.view_as_real(grad) @@ -464,9 +468,14 @@ def _multi_tensor_adamw( # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: + capturable_supported_devices = _get_capturable_supported_devices( + supports_xla=False + ) assert all( - p.is_cuda and step.is_cuda for p, step in zip(params, state_steps) - ), "If capturable=True, params and state_steps must be CUDA tensors." + p.device.type == step.device.type + and p.device.type in capturable_supported_devices + for p, step in zip(params, state_steps) + ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}." assert not differentiable, "_foreach ops don't support autograd" @@ -498,7 +507,7 @@ def _multi_tensor_adamw( ) if maximize: - device_grads = torch._foreach_neg(device_grads) + device_grads = torch._foreach_neg(device_grads) # type: ignore[assignment] # Update steps # If steps are on CPU, foreach will fall back to the slow path, which is a for-loop calling t.add(1) over @@ -526,6 +535,10 @@ def _multi_tensor_adamw( # Delete the local intermediate since it won't be used anymore to save on peak memory del device_grads + bias_correction1: Union[Tuple[Tensor, ...], List[Tensor]] + bias_correction2: Union[Tuple[Tensor, ...], List[Tensor]] + bias_correction2_sqrt: Union[Tuple[Tensor, ...], List[Tensor]] + if capturable: bias_correction1 = torch._foreach_pow(beta1, device_state_steps) bias_correction2 = torch._foreach_pow(beta2, device_state_steps) @@ -572,7 +585,9 @@ def _multi_tensor_adamw( step_size = _stack_if_compiling([(lr / bc) * -1 for bc in bias_correction1]) - bias_correction2_sqrt = [_dispatch_sqrt(bc) for bc in bias_correction2] + bias_correction2_sqrt = [ + _dispatch_sqrt(bc) for bc in bias_correction2 # type: ignore[arg-type] + ] if amsgrad: # Maintains the maximum of all 2nd moment running avg. till now @@ -586,7 +601,10 @@ def _multi_tensor_adamw( torch._foreach_div_(exp_avg_sq_sqrt, bias_correction2_sqrt) torch._foreach_add_(exp_avg_sq_sqrt, eps) torch._foreach_addcdiv_( - device_params, device_exp_avgs, exp_avg_sq_sqrt, step_size + device_params, + device_exp_avgs, + exp_avg_sq_sqrt, + step_size, # type: ignore[arg-type] ) @@ -603,27 +621,29 @@ def _fused_adamw( amsgrad: bool, beta1: float, beta2: float, - lr: Union[float, Tensor], + lr: Union[Tensor, float], weight_decay: float, eps: float, maximize: bool, capturable: bool, # Needed for consistency. differentiable: bool, - has_complex: bool, + has_complex: bool, # Needed for consistency. ) -> None: if not params: return if differentiable: raise RuntimeError("Adam with fused=True does not support differentiable=True") - grad_scale_dict = ( - {grad_scale.device: grad_scale} if grad_scale is not None else None + grad_scale_dict: DeviceDict = ( + {grad_scale.device: grad_scale} if grad_scale is not None else {} + ) + found_inf_dict: DeviceDict = ( + {found_inf.device: found_inf} if found_inf is not None else {} ) - found_inf_dict = {found_inf.device: found_inf} if found_inf is not None else None # We only shuffle around the lr when it is a Tensor and on CUDA, otherwise, we prefer # treating it as a scalar. 
- lr_dict = ( + lr_dict: Optional[DeviceDict] = ( {lr.device: lr} if isinstance(lr, Tensor) and str(lr.device) != "cpu" else None ) @@ -643,16 +663,17 @@ def _fused_adamw( ) in grouped_tensors.items(): device_grad_scale, device_found_inf = None, None if grad_scale is not None: - if device not in grad_scale_dict: - grad_scale_dict[device] = grad_scale.to(device, non_blocking=True) - device_grad_scale = grad_scale_dict[device] + device_grad_scale = grad_scale_dict.setdefault( + device, grad_scale.to(device, non_blocking=True) + ) if found_inf is not None: - if found_inf not in found_inf_dict: - found_inf_dict[device] = found_inf.to(device, non_blocking=True) - device_found_inf = found_inf_dict[device] + device_found_inf = found_inf_dict.setdefault( + device, found_inf.to(device, non_blocking=True) + ) if lr_dict is not None and device not in lr_dict: - lr_dict[device] = lr.to(device=device, non_blocking=True) - lr = lr_dict[device] + lr = lr_dict.setdefault( + device, lr.to(device=device, non_blocking=True) # type: ignore[union-attr] + ) torch._foreach_add_(device_state_steps, 1) torch._fused_adamw_( device_params, diff --git a/torch/optim/adamw.pyi b/torch/optim/adamw.pyi deleted file mode 100644 index 17c35ebec8a6a..0000000000000 --- a/torch/optim/adamw.pyi +++ /dev/null @@ -1,22 +0,0 @@ -from typing import Optional, Tuple, Union - -from torch import Tensor - -from .optimizer import Optimizer, ParamsT - -class AdamW(Optimizer): - def __init__( - self, - params: ParamsT, - lr: Union[float, Tensor] = 1e-3, - betas: Tuple[float, float] = (0.9, 0.999), - eps: float = 1e-8, - weight_decay: float = 1e-2, - amsgrad: bool = False, - *, - maximize: bool = False, - foreach: Optional[bool] = None, - capturable: bool = False, - differentiable: bool = False, - fused: Optional[bool] = None, - ) -> None: ... 
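The capturable assertions rewritten throughout these optimizer files all take the same shape: each parameter and its step counter must share a device type, and that device type must come from `_get_capturable_supported_devices()`, the helper added to `torch/optim/optimizer.py` later in this patch. A standalone sketch of the check, assuming that private helper is importable (it is internal API, so the import is illustrative only):

    from typing import List

    import torch
    from torch import Tensor
    from torch.optim.optimizer import _get_capturable_supported_devices  # private helper

    def assert_capturable_devices(params: List[Tensor], state_steps: List[Tensor]) -> None:
        # Mirrors the foreach-path check: every param/step pair must live on the same,
        # capturable-supported device type (e.g. "cuda"; "xla" only when allowed).
        supported = _get_capturable_supported_devices(supports_xla=False)
        assert all(
            p.device.type == step.device.type and p.device.type in supported
            for p, step in zip(params, state_steps)
        ), f"If capturable=True, params and state_steps must be on supported devices: {supported}"

Compared with the old hard-coded `is_cuda`/`is_xla` checks, this keeps the error message and the device list in one place and lets privateuse1 backends qualify automatically.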
diff --git a/torch/optim/asgd.py b/torch/optim/asgd.py index 5714a82d5f19b..a87aadc81803c 100644 --- a/torch/optim/asgd.py +++ b/torch/optim/asgd.py @@ -1,4 +1,4 @@ -from typing import List, Optional +from typing import List, Optional, Tuple, Union import torch from torch import Tensor @@ -9,12 +9,14 @@ _differentiable_doc, _disable_dynamo_if_unsupported, _foreach_doc, + _get_capturable_supported_devices, _get_scalar_dtype, _get_value, _maximize_doc, _use_grad_for_differentiable, _view_as_real, Optimizer, + ParamsT, ) __all__ = ["ASGD", "asgd"] @@ -30,12 +32,12 @@ def _to_tensor(x, device=None): class ASGD(Optimizer): def __init__( self, - params, - lr=1e-2, - lambd=1e-4, - alpha=0.75, - t0=1e6, - weight_decay=0, + params: ParamsT, + lr: float = 1e-2, + lambd: float = 1e-4, + alpha: float = 0.75, + t0: float = 1e6, + weight_decay: float = 0, foreach: Optional[bool] = None, maximize: bool = False, differentiable: bool = False, @@ -135,12 +137,12 @@ def step(self, closure=None): loss = closure() for group in self.param_groups: - params_with_grad = [] - grads = [] - mus = [] - axs = [] - etas = [] - state_steps = [] + params_with_grad: List[Tensor] = [] + grads: List[Tensor] = [] + mus: List[Tensor] = [] + axs: List[Tensor] = [] + etas: List[Tensor] = [] + state_steps: List[Tensor] = [] has_complex = self._init_group( group, params_with_grad, grads, mus, axs, etas, state_steps @@ -220,11 +222,17 @@ def _single_tensor_asgd( # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: + capturable_supported_devices = _get_capturable_supported_devices() assert ( - param.is_cuda and mu.is_cuda and eta.is_cuda and step_t.is_cuda - ) or ( - param.is_xla and mu.is_xla and eta.is_xla and step_t.is_xla - ), "If capturable=True, params, mus, etas, and state_steps must be CUDA or XLA tensors." + param.device.type + == mu.device.type + == eta.device.type + == step_t.device.type + and param.device.type in capturable_supported_devices + ), ( + f"If capturable=True, params, mus, etas, and state_steps must be " + f"on supported devices: {capturable_supported_devices}." + ) if torch.is_complex(param): grad = torch.view_as_real(grad) @@ -287,10 +295,14 @@ def _multi_tensor_asgd( # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: + capturable_supported_devices = _get_capturable_supported_devices( + supports_xla=False + ) assert all( - p.is_cuda and mu.is_cuda and eta.is_cuda and step.is_cuda + p.device.type == mu.device.type == eta.device.type == step.device.type + and p.device.type in capturable_supported_devices for p, mu, eta, step in zip(params, mus, etas, state_steps) - ), "If capturable=True, params, mus, etas, and state_steps must be CUDA tensors." + ), f"If capturable=True, params, mus, etas, and state_steps must be on supported devices: {capturable_supported_devices}." 
grouped_tensors = Optimizer._group_tensors_by_device_and_dtype( [params, grads, axs, mus, etas, state_steps] @@ -310,7 +322,7 @@ def _multi_tensor_asgd( _view_as_real(grouped_params, grouped_grads, grouped_axs) if maximize: - grouped_grads = torch._foreach_neg(grouped_grads) + grouped_grads = torch._foreach_neg(grouped_grads) # type: ignore[assignment] # Update steps # If steps are on CPU, foreach will fall back to the slow path, which is a for-loop calling t.add(1) over @@ -324,6 +336,7 @@ def _multi_tensor_asgd( torch._foreach_add_(grouped_state_steps, 1) # intermediate = grad + param * lambd + intermediate: Union[Tuple[Tensor, ...], List[Tensor]] if weight_decay != 0: if maximize: torch._foreach_add_(grouped_grads, grouped_params, alpha=weight_decay) @@ -358,6 +371,8 @@ def _multi_tensor_asgd( torch._foreach_addcmul_(grouped_axs, intermediate, grouped_mus) del intermediate + new_etas: Union[Tuple[Tensor, ...], List[Tensor]] + new_mus: Union[Tuple[Tensor, ...], List[Tensor]] if capturable: # update grouped_mus new_mus = torch._foreach_sub(grouped_state_steps, t0) diff --git a/torch/optim/asgd.pyi b/torch/optim/asgd.pyi deleted file mode 100644 index 634b0d162cebd..0000000000000 --- a/torch/optim/asgd.pyi +++ /dev/null @@ -1,12 +0,0 @@ -from .optimizer import Optimizer, ParamsT - -class ASGD(Optimizer): - def __init__( - self, - params: ParamsT, - lr: float = ..., - lambd: float = ..., - alpha: float = ..., - t0: float = ..., - weight_decay: float = ..., - ) -> None: ... diff --git a/torch/optim/lbfgs.py b/torch/optim/lbfgs.py index 1e0f5738ad637..e8818cca538c9 100644 --- a/torch/optim/lbfgs.py +++ b/torch/optim/lbfgs.py @@ -1,5 +1,7 @@ +from typing import Optional + import torch -from .optimizer import Optimizer +from .optimizer import Optimizer, ParamsT __all__ = ["LBFGS"] @@ -99,17 +101,17 @@ def _strong_wolfe( # exact point satisfying the criteria insuf_progress = False # find high and low points in bracket - low_pos, high_pos = (0, 1) if bracket_f[0] <= bracket_f[-1] else (1, 0) + low_pos, high_pos = (0, 1) if bracket_f[0] <= bracket_f[-1] else (1, 0) # type: ignore[possibly-undefined] while not done and ls_iter < max_ls: # line-search bracket is so small - if abs(bracket[1] - bracket[0]) * d_norm < tolerance_change: + if abs(bracket[1] - bracket[0]) * d_norm < tolerance_change: # type: ignore[possibly-undefined] break # compute new trial value t = _cubic_interpolate( bracket[0], bracket_f[0], - bracket_gtd[0], + bracket_gtd[0], # type: ignore[possibly-undefined] bracket[1], bracket_f[1], bracket_gtd[1], @@ -147,7 +149,7 @@ def _strong_wolfe( # Armijo condition not satisfied or not lower than lowest point bracket[high_pos] = t bracket_f[high_pos] = f_new - bracket_g[high_pos] = g_new.clone(memory_format=torch.contiguous_format) + bracket_g[high_pos] = g_new.clone(memory_format=torch.contiguous_format) # type: ignore[possibly-undefined] bracket_gtd[high_pos] = gtd_new low_pos, high_pos = (0, 1) if bracket_f[0] <= bracket_f[1] else (1, 0) else: @@ -158,19 +160,19 @@ def _strong_wolfe( # old high becomes new low bracket[high_pos] = bracket[low_pos] bracket_f[high_pos] = bracket_f[low_pos] - bracket_g[high_pos] = bracket_g[low_pos] + bracket_g[high_pos] = bracket_g[low_pos] # type: ignore[possibly-undefined] bracket_gtd[high_pos] = bracket_gtd[low_pos] # new point becomes new low bracket[low_pos] = t bracket_f[low_pos] = f_new - bracket_g[low_pos] = g_new.clone(memory_format=torch.contiguous_format) + bracket_g[low_pos] = g_new.clone(memory_format=torch.contiguous_format) # type: 
ignore[possibly-undefined] bracket_gtd[low_pos] = gtd_new # return stuff - t = bracket[low_pos] + t = bracket[low_pos] # type: ignore[possibly-undefined] f_new = bracket_f[low_pos] - g_new = bracket_g[low_pos] + g_new = bracket_g[low_pos] # type: ignore[possibly-undefined] return f_new, g_new, t, ls_func_evals @@ -210,14 +212,14 @@ class LBFGS(Optimizer): def __init__( self, - params, - lr=1, - max_iter=20, - max_eval=None, - tolerance_grad=1e-7, - tolerance_change=1e-9, - history_size=100, - line_search_fn=None, + params: ParamsT, + lr: float = 1, + max_iter: int = 20, + max_eval: Optional[int] = None, + tolerance_grad: float = 1e-7, + tolerance_change: float = 1e-9, + history_size: int = 100, + line_search_fn: Optional[str] = None, ): if max_eval is None: max_eval = max_iter * 5 // 4 diff --git a/torch/optim/lbfgs.pyi b/torch/optim/lbfgs.pyi deleted file mode 100644 index c7c0ac060881a..0000000000000 --- a/torch/optim/lbfgs.pyi +++ /dev/null @@ -1,16 +0,0 @@ -from typing import Optional - -from .optimizer import Optimizer, ParamsT - -class LBFGS(Optimizer): - def __init__( - self, - params: ParamsT, - lr: float = ..., - max_iter: int = ..., - max_eval: Optional[int] = ..., - tolerance_grad: float = ..., - tolerance_change: float = ..., - history_size: int = ..., - line_search_fn: Optional[str] = ..., - ) -> None: ... diff --git a/torch/optim/lr_scheduler.py b/torch/optim/lr_scheduler.py index 6d0daaf9d7184..77bdb6b46aac0 100644 --- a/torch/optim/lr_scheduler.py +++ b/torch/optim/lr_scheduler.py @@ -155,7 +155,18 @@ def print_lr( lr: float, epoch: Optional[int] = None, ): - """Display the current learning rate.""" + """Display the current learning rate. + + .. deprecated:: 2.4 + ``print_lr()`` is deprecated. Please use ``get_last_lr()`` to access the + learning rate. + """ + warnings.warn( + "`LRScheduler.print_lr()` is being deprecated. To fetch the learning rate, " + "please use `get_last_lr()` instead. 
For more details, " + "see https://github.com/pytorch/pytorch/issues/99270.", + UserWarning, + ) if is_verbose: if epoch is None: print(f"Adjusting learning rate of group {group} to {lr:.4e}.") diff --git a/torch/optim/nadam.py b/torch/optim/nadam.py index 901036897f564..cca41f5bc8427 100644 --- a/torch/optim/nadam.py +++ b/torch/optim/nadam.py @@ -1,4 +1,4 @@ -from typing import List, Optional +from typing import cast, List, Optional, Tuple, Union import torch from torch import Tensor @@ -9,12 +9,14 @@ _disable_dynamo_if_unsupported, _dispatch_sqrt, _foreach_doc, + _get_capturable_supported_devices, _get_scalar_dtype, _get_value, _stack_if_compiling, _use_grad_for_differentiable, _view_as_real, Optimizer, + ParamsT, ) __all__ = ["NAdam", "nadam"] @@ -23,12 +25,12 @@ class NAdam(Optimizer): def __init__( self, - params, - lr=2e-3, - betas=(0.9, 0.999), - eps=1e-8, - weight_decay=0, - momentum_decay=4e-3, + params: ParamsT, + lr: float = 2e-3, + betas: Tuple[float, float] = (0.9, 0.999), + eps: float = 1e-8, + weight_decay: float = 0, + momentum_decay: float = 4e-3, decoupled_weight_decay: bool = False, *, foreach: Optional[bool] = None, @@ -155,13 +157,13 @@ def step(self, closure=None): loss = closure() for group in self.param_groups: - params_with_grad = [] - grads = [] - exp_avgs = [] - exp_avg_sqs = [] - mu_products = [] - state_steps = [] - beta1, beta2 = group["betas"] + params_with_grad: List[Tensor] = [] + grads: List[Tensor] = [] + exp_avgs: List[Tensor] = [] + exp_avg_sqs: List[Tensor] = [] + mu_products: List[Tensor] = [] + state_steps: List[Tensor] = [] + beta1, beta2 = cast(Tuple[float, float], group["betas"]) has_complex = self._init_group( group, @@ -293,9 +295,14 @@ def _single_tensor_nadam( # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: - assert (param.is_cuda and mu_product.is_cuda and step_t.is_cuda) or ( - param.is_xla and mu_product.is_xla and step_t.is_xla - ), "If capturable=True, params, mu_products, and state_steps must be CUDA or XLA tensors." + capturable_supported_devices = _get_capturable_supported_devices() + assert ( + param.device.type == mu_product.device.type == step_t.device.type + and param.device.type in capturable_supported_devices + ), ( + f"If capturable=True, params, mu_products and state_steps must be " + f"on supported devices: {capturable_supported_devices}." + ) # update step step_t += 1 @@ -373,10 +380,14 @@ def _multi_tensor_nadam( # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: + capturable_supported_devices = _get_capturable_supported_devices( + supports_xla=False + ) assert all( - p.is_cuda and mp.is_cuda and step.is_cuda + p.device.type == mp.device.type == step.device.type + and p.device.type in capturable_supported_devices for p, mp, step in zip(params, mu_products, state_steps) - ), "If capturable=True, params, mu_products, and state_steps must be CUDA tensors." + ), f"If capturable=True, params, mu_products, and state_steps must be on supported devices: {capturable_supported_devices}." 
grouped_tensors = Optimizer._group_tensors_by_device_and_dtype( [params, grads, exp_avgs, exp_avg_sqs, mu_products, state_steps] @@ -411,7 +422,7 @@ def _multi_tensor_nadam( # Perform stepweight decay torch._foreach_mul_(grouped_params, 1 - lr * weight_decay) else: - grouped_grads = torch._foreach_add( + grouped_grads = torch._foreach_add( # type: ignore[assignment] grouped_grads, grouped_params, alpha=weight_decay ) @@ -425,6 +436,9 @@ def _multi_tensor_nadam( exp_avg_sq_sqrt = torch._foreach_sqrt(grouped_exp_avg_sqs) + bias_correction_sqrt: Union[Tuple[Tensor, ...], List[Tensor]] + mus: Union[Tuple[Tensor, ...], List[Tensor]] + mu_nexts: Union[Tuple[Tensor, ...], List[Tensor]] if capturable: # mus will be beta1 * (1 - 0.5 * 0.96 ** (step * momentum_decay)) exponent = torch._foreach_mul(grouped_state_steps, momentum_decay) @@ -524,10 +538,10 @@ def _multi_tensor_nadam( ) torch._foreach_addcdiv_( - grouped_params, grouped_grads, exp_avg_sq_sqrt, step_size_grads + grouped_params, grouped_grads, exp_avg_sq_sqrt, step_size_grads # type: ignore[arg-type] ) torch._foreach_addcdiv_( - grouped_params, grouped_exp_avgs, exp_avg_sq_sqrt, step_size_expavg + grouped_params, grouped_exp_avgs, exp_avg_sq_sqrt, step_size_expavg # type: ignore[arg-type] ) diff --git a/torch/optim/nadam.pyi b/torch/optim/nadam.pyi deleted file mode 100644 index f62e188b3d72b..0000000000000 --- a/torch/optim/nadam.pyi +++ /dev/null @@ -1,15 +0,0 @@ -from typing import Tuple - -from .optimizer import Optimizer, ParamsT - -class NAdam(Optimizer): - def __init__( - self, - params: ParamsT, - lr: float = ..., - betas: Tuple[float, float] = ..., - eps: float = ..., - weight_decay: float = ..., - momentum_decay: float = ..., - decoupled_weight_decay: bool = ..., - ) -> None: ... diff --git a/torch/optim/optimizer.py b/torch/optim/optimizer.py index fbcf95744bad8..1b76f6287af36 100644 --- a/torch/optim/optimizer.py +++ b/torch/optim/optimizer.py @@ -38,6 +38,8 @@ Kwargs: TypeAlias = Dict[str, Any] StateDict: TypeAlias = Dict[str, Any] TensorListList: TypeAlias = List[List[torch.Tensor]] +DeviceDict = Dict[Optional[torch.device], torch.Tensor] + GlobalOptimizerPreHook: TypeAlias = Callable[ ["Optimizer", Args, Kwargs], Optional[Tuple[Args, Kwargs]] @@ -213,6 +215,16 @@ def _get_scalar_dtype(is_fused=None): ) +def _get_capturable_supported_devices(supports_xla: bool = True) -> List[str]: + r"""Return the device type list that supports capturable optimizer.""" + capturable_supported_devices = ["cuda"] + if not torch.jit.is_scripting(): + capturable_supported_devices.append(torch._C._get_privateuse1_backend_name()) + if supports_xla: + capturable_supported_devices.append("xla") + return capturable_supported_devices + + # Common doc strings among optimizers _foreach_doc = r"""foreach (bool, optional): whether foreach implementation of optimizer is used. If unspecified by the user (so foreach is None), we will try to use @@ -222,7 +234,7 @@ def _get_scalar_dtype(is_fused=None): being a tensorlist vs just one tensor. If memory is prohibitive, batch fewer parameters through the optimizer at a time or switch this flag to False (default: None)""" -_fused_doc = r"""fused (bool, optional): whether the fused implementation is used. +_fused_doc = r"""fused (bool, optional): whether the fused implementation (CUDA only) is used. Currently, `torch.float64`, `torch.float32`, `torch.float16`, and `torch.bfloat16` are supported. 
(default: None) diff --git a/torch/optim/radam.py b/torch/optim/radam.py index 10c38a14a6aa3..18330f98ec7ae 100644 --- a/torch/optim/radam.py +++ b/torch/optim/radam.py @@ -1,4 +1,4 @@ -from typing import List, Optional +from typing import cast, List, Optional, Tuple, Union import torch from torch import Tensor @@ -10,11 +10,13 @@ _disable_dynamo_if_unsupported, _dispatch_sqrt, _foreach_doc, + _get_capturable_supported_devices, _get_scalar_dtype, _get_value, _use_grad_for_differentiable, _view_as_real, Optimizer, + ParamsT, ) __all__ = ["RAdam", "radam"] @@ -23,11 +25,11 @@ class RAdam(Optimizer): def __init__( self, - params, - lr=1e-3, - betas=(0.9, 0.999), - eps=1e-8, - weight_decay=0, + params: ParamsT, + lr: float = 1e-3, + betas: Tuple[float, float] = (0.9, 0.999), + eps: float = 1e-8, + weight_decay: float = 0, decoupled_weight_decay: bool = False, *, foreach: Optional[bool] = None, @@ -127,12 +129,12 @@ def step(self, closure=None): loss = closure() for group in self.param_groups: - params_with_grad = [] - grads = [] - exp_avgs = [] - exp_avg_sqs = [] - state_steps = [] - beta1, beta2 = group["betas"] + params_with_grad: List[Tensor] = [] + grads: List[Tensor] = [] + exp_avgs: List[Tensor] = [] + exp_avg_sqs: List[Tensor] = [] + state_steps: List[Tensor] = [] + beta1, beta2 = cast(Tuple[float, float], group["betas"]) has_complex = self._init_group( group, params_with_grad, grads, exp_avgs, exp_avg_sqs, state_steps @@ -247,8 +249,8 @@ def _single_tensor_radam( lr: float, weight_decay: float, eps: float, - differentiable: bool, decoupled_weight_decay: bool, + differentiable: bool, capturable: bool, has_complex: bool, ): @@ -260,9 +262,11 @@ def _single_tensor_radam( # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: - assert (param.is_cuda and step_t.is_cuda) or ( - param.is_xla and step_t.is_xla - ), "If capturable=True, params and state_steps must be CUDA or XLA tensors." + capturable_supported_devices = _get_capturable_supported_devices() + assert ( + param.device.type == step_t.device.type + and param.device.type in capturable_supported_devices + ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}." if torch.is_complex(param): param = torch.view_as_real(param) @@ -355,9 +359,14 @@ def _multi_tensor_radam( # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: + capturable_supported_devices = _get_capturable_supported_devices( + supports_xla=False + ) assert all( - p.is_cuda and step.is_cuda for p, step in zip(params, state_steps) - ), "If capturable=True, params and state_steps must be CUDA tensors." + p.device.type == step.device.type + and p.device.type in capturable_supported_devices + for p, step in zip(params, state_steps) + ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}." 
grouped_tensors = Optimizer._group_tensors_by_device_and_dtype( [params, grads, exp_avgs, exp_avg_sqs, state_steps] @@ -388,6 +397,9 @@ def _multi_tensor_radam( # maximum length of the approximated SMA rho_inf = 2 / (1 - beta2) - 1 # compute the length of the approximated SMA + bias_correction1: Union[Tuple[Tensor, ...], List[Tensor]] + bias_correction2: Union[Tuple[Tensor, ...], List[Tensor]] + rho_t_list: Union[Tuple[Tensor, ...], List[Tensor]] if capturable: bias_correction1 = torch._foreach_pow(beta2, grouped_state_steps) torch._foreach_neg_(bias_correction1) @@ -413,7 +425,7 @@ def _multi_tensor_radam( if decoupled_weight_decay: torch._foreach_mul_(grouped_params, 1 - lr * weight_decay) else: - grouped_grads = torch._foreach_add( + grouped_grads = torch._foreach_add( # type: ignore[assignment] grouped_grads, grouped_params, alpha=weight_decay ) @@ -469,7 +481,7 @@ def _multi_tensor_radam( else: rect = [ _dispatch_sqrt( - (rho_t - 4) + (rho_t - 4) # type: ignore[arg-type] * (rho_t - 2) * rho_inf / ((rho_inf - 4) * (rho_inf - 2) * rho_t) diff --git a/torch/optim/radam.pyi b/torch/optim/radam.pyi deleted file mode 100644 index b001376b05ef4..0000000000000 --- a/torch/optim/radam.pyi +++ /dev/null @@ -1,14 +0,0 @@ -from typing import Tuple - -from .optimizer import Optimizer, ParamsT - -class RAdam(Optimizer): - def __init__( - self, - params: ParamsT, - lr: float = ..., - betas: Tuple[float, float] = ..., - eps: float = ..., - weight_decay: float = ..., - decoupled_weight_decay: bool = ..., - ) -> None: ... diff --git a/torch/optim/rmsprop.py b/torch/optim/rmsprop.py index dc4491b553b24..b3375c338b40f 100644 --- a/torch/optim/rmsprop.py +++ b/torch/optim/rmsprop.py @@ -8,11 +8,13 @@ _differentiable_doc, _disable_dynamo_if_unsupported, _foreach_doc, + _get_capturable_supported_devices, _get_scalar_dtype, _maximize_doc, _use_grad_for_differentiable, _view_as_real, Optimizer, + ParamsT, ) __all__ = ["RMSprop", "rmsprop"] @@ -21,12 +23,12 @@ class RMSprop(Optimizer): def __init__( self, - params, - lr=1e-2, - alpha=0.99, - eps=1e-8, - weight_decay=0, - momentum=0, + params: ParamsT, + lr: float = 1e-2, + alpha: float = 0.99, + eps: float = 1e-8, + weight_decay: float = 0, + momentum: float = 0, centered=False, capturable=False, foreach: Optional[bool] = None, @@ -146,12 +148,12 @@ def step(self, closure=None): loss = closure() for group in self.param_groups: - params_with_grad = [] - grads = [] - square_avgs = [] - grad_avgs = [] - momentum_buffer_list = [] - state_steps = [] + params_with_grad: List[Tensor] = [] + grads: List[Tensor] = [] + square_avgs: List[Tensor] = [] + grad_avgs: List[Tensor] = [] + momentum_buffer_list: List[Tensor] = [] + state_steps: List[Tensor] = [] has_complex = self._init_group( group, @@ -275,9 +277,11 @@ def _single_tensor_rmsprop( # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: - assert (param.is_cuda and step.is_cuda) or ( - param.is_xla and step.is_xla - ), "If capturable=True, params and state_steps must be CUDA or XLA tensors." + capturable_supported_devices = _get_capturable_supported_devices() + assert ( + param.device.type == step.device.type + and param.device.type in capturable_supported_devices + ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}." 
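# Sketch (illustrative only) of why the Union[Tuple[Tensor, ...], List[Tensor]]
# annotations and type: ignore[assignment] comments above are needed: the grouped
# inputs may arrive as tuples, but out-of-place torch._foreach_* ops always hand
# back a fresh list of tensors.
import torch

grads = (torch.ones(2), torch.ones(3))                  # tuple in
params = (torch.full((2,), 2.0), torch.full((3,), 2.0))
out = torch._foreach_add(grads, params, alpha=0.1)      # list out
assert isinstance(out, list) and torch.allclose(out[0], torch.full((2,), 1.2))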
grad = grads[i] grad = grad if not maximize else -grad @@ -346,10 +350,12 @@ def _multi_tensor_rmsprop( # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: + capturable_supported_devices = _get_capturable_supported_devices() assert all( - (p.is_cuda and step.is_cuda) or (p.is_xla and step.is_xla) + p.device.type == step.device.type + and p.device.type in capturable_supported_devices for p, step in zip(params, state_steps) - ), "If capturable=True, params and state_steps must be CUDA tensors." + ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}." grouped_tensors = Optimizer._group_tensors_by_device_and_dtype( [params, grads, square_avgs, grad_avgs, momentum_buffer_list, state_steps] @@ -373,7 +379,7 @@ def _multi_tensor_rmsprop( _view_as_real(grouped_params, *state_and_grads) if maximize: - grouped_grads = torch._foreach_neg(grouped_grads) + grouped_grads = torch._foreach_neg(grouped_grads) # type: ignore[assignment] # Update steps # If steps are on CPU, foreach will fall back to the slow path, which is a for-loop calling t.add(1) over @@ -391,7 +397,7 @@ def _multi_tensor_rmsprop( if maximize: torch._foreach_add_(grouped_grads, grouped_params, alpha=weight_decay) else: - grouped_grads = torch._foreach_add( + grouped_grads = torch._foreach_add( # type: ignore[assignment] grouped_grads, grouped_params, alpha=weight_decay ) diff --git a/torch/optim/rmsprop.pyi b/torch/optim/rmsprop.pyi deleted file mode 100644 index f206d542dcecb..0000000000000 --- a/torch/optim/rmsprop.pyi +++ /dev/null @@ -1,13 +0,0 @@ -from .optimizer import Optimizer, ParamsT - -class RMSprop(Optimizer): - def __init__( - self, - params: ParamsT, - lr: float = ..., - alpha: float = ..., - eps: float = ..., - weight_decay: float = ..., - momentum: float = ..., - centered: bool = ..., - ) -> None: ... 
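# Sketch (not from the patch): with the constructor annotations now inline, the
# deleted rmsprop.pyi stub is redundant -- tools can read the hints directly off
# the .py definition.
import inspect
from torch.optim import RMSprop

sig = inspect.signature(RMSprop.__init__)
print(sig.parameters["lr"].annotation, sig.parameters["lr"].default)  # <class 'float'> 0.01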
diff --git a/torch/optim/rprop.py b/torch/optim/rprop.py index b252f5214cb8a..ec40aae5c90a9 100644 --- a/torch/optim/rprop.py +++ b/torch/optim/rprop.py @@ -1,4 +1,4 @@ -from typing import List, Optional +from typing import List, Optional, Tuple import torch from torch import Tensor @@ -8,11 +8,13 @@ _differentiable_doc, _disable_dynamo_if_unsupported, _foreach_doc, + _get_capturable_supported_devices, _get_scalar_dtype, _maximize_doc, _use_grad_for_differentiable, _view_as_real, Optimizer, + ParamsT, ) __all__ = ["Rprop", "rprop"] @@ -21,10 +23,10 @@ class Rprop(Optimizer): def __init__( self, - params, - lr=1e-2, - etas=(0.5, 1.2), - step_sizes=(1e-6, 50), + params: ParamsT, + lr: float = 1e-2, + etas: Tuple[float, float] = (0.5, 1.2), + step_sizes: Tuple[float, float] = (1e-6, 50), *, capturable: bool = False, foreach: Optional[bool] = None, @@ -120,11 +122,11 @@ def step(self, closure=None): loss = closure() for group in self.param_groups: - params = [] - grads = [] - prevs = [] - step_sizes = [] - state_steps = [] + params: List[Tensor] = [] + grads: List[Tensor] = [] + prevs: List[Tensor] = [] + step_sizes: List[Tensor] = [] + state_steps: List[Tensor] = [] etaminus, etaplus = group["etas"] step_size_min, step_size_max = group["step_sizes"] @@ -235,9 +237,11 @@ def _single_tensor_rprop( # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: - assert (param.is_cuda and step.is_cuda) or ( - param.is_xla and step.is_xla - ), "If capturable=True, params and state_steps must be CUDA or XLA tensors." + capturable_supported_devices = _get_capturable_supported_devices() + assert ( + param.device.type == step.device.type + and param.device.type in capturable_supported_devices + ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}." step += 1 @@ -299,10 +303,12 @@ def _multi_tensor_rprop( # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] if not torch._utils.is_compiling() and capturable: + capturable_supported_devices = _get_capturable_supported_devices() assert all( - (p.is_cuda and step.is_cuda) or (p.is_xla and step.is_xla) + p.device.type == step.device.type + and p.device.type in capturable_supported_devices for p, step in zip(params, state_steps) - ), "If capturable=True, params and state_steps must be CUDA or XLA tensors." + ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}." grouped_tensors = Optimizer._group_tensors_by_device_and_dtype( [params, grads, prevs, step_sizes, state_steps] diff --git a/torch/optim/rprop.pyi b/torch/optim/rprop.pyi deleted file mode 100644 index fd0c6ba209161..0000000000000 --- a/torch/optim/rprop.pyi +++ /dev/null @@ -1,12 +0,0 @@ -from typing import Tuple - -from .optimizer import Optimizer, ParamsT - -class Rprop(Optimizer): - def __init__( - self, - params: ParamsT, - lr: float = ..., - etas: Tuple[float, float] = ..., - step_sizes: Tuple[float, float] = ..., - ) -> None: ... 
diff --git a/torch/optim/sgd.py b/torch/optim/sgd.py index b346958204710..c0efc24430787 100644 --- a/torch/optim/sgd.py +++ b/torch/optim/sgd.py @@ -10,6 +10,7 @@ _fused_doc, _maximize_doc, _use_grad_for_differentiable, + DeviceDict, Optimizer, ) @@ -20,10 +21,10 @@ class SGD(Optimizer): def __init__( self, params, - lr=1e-3, - momentum=0, - dampening=0, - weight_decay=0, + lr: float = 1e-3, + momentum: float = 0, + dampening: float = 0, + weight_decay: float = 0, nesterov=False, *, maximize: bool = False, @@ -80,13 +81,13 @@ def __setstate__(self, state): group.setdefault("differentiable", False) group.setdefault("fused", False) - def _init_group(self, group, params_with_grad, d_p_list, momentum_buffer_list): + def _init_group(self, group, params, grads, momentum_buffer_list): has_sparse_grad = False for p in group["params"]: if p.grad is not None: - params_with_grad.append(p) - d_p_list.append(p.grad) + params.append(p) + grads.append(p.grad) if p.grad.is_sparse: has_sparse_grad = True @@ -110,17 +111,17 @@ def step(self, closure=None): loss = closure() for group in self.param_groups: - params_with_grad = [] - d_p_list = [] - momentum_buffer_list = [] + params: List[Tensor] = [] + grads: List[Tensor] = [] + momentum_buffer_list: List[Optional[Tensor]] = [] has_sparse_grad = self._init_group( - group, params_with_grad, d_p_list, momentum_buffer_list + group, params, grads, momentum_buffer_list ) sgd( - params_with_grad, - d_p_list, + params, + grads, momentum_buffer_list, weight_decay=group["weight_decay"], momentum=group["momentum"], @@ -137,7 +138,7 @@ def step(self, closure=None): if group["momentum"] != 0: # update momentum_buffers in state - for p, momentum_buffer in zip(params_with_grad, momentum_buffer_list): + for p, momentum_buffer in zip(params, momentum_buffer_list): state = self.state[p] state["momentum_buffer"] = momentum_buffer @@ -245,7 +246,7 @@ def sgd( momentum_buffer_list: List[Optional[Tensor]], # kwonly args with defaults are not supported by functions compiled with torchscript issue #70627 # setting this as kwarg for now as functional API is compiled by torch/distributed/optim - has_sparse_grad: bool = None, + has_sparse_grad: bool = False, foreach: Optional[bool] = None, fused: Optional[bool] = None, grad_scale: Optional[Tensor] = None, @@ -312,7 +313,7 @@ def sgd( def _single_tensor_sgd( params: List[Tensor], - d_p_list: List[Tensor], + grads: List[Tensor], momentum_buffer_list: List[Optional[Tensor]], grad_scale: Optional[Tensor], found_inf: Optional[Tensor], @@ -328,26 +329,26 @@ def _single_tensor_sgd( assert grad_scale is None and found_inf is None for i, param in enumerate(params): - d_p = d_p_list[i] if not maximize else -d_p_list[i] + grad = grads[i] if not maximize else -grads[i] if weight_decay != 0: - d_p = d_p.add(param, alpha=weight_decay) + grad = grad.add(param, alpha=weight_decay) if momentum != 0: buf = momentum_buffer_list[i] if buf is None: - buf = torch.clone(d_p).detach() + buf = torch.clone(grad).detach() momentum_buffer_list[i] = buf else: - buf.mul_(momentum).add_(d_p, alpha=1 - dampening) + buf.mul_(momentum).add_(grad, alpha=1 - dampening) if nesterov: - d_p = d_p.add(buf, alpha=momentum) + grad = grad.add(buf, alpha=momentum) else: - d_p = buf + grad = buf - param.add_(d_p, alpha=-lr) + param.add_(grad, alpha=-lr) def _multi_tensor_sgd( @@ -371,7 +372,7 @@ def _multi_tensor_sgd( return grouped_tensors = Optimizer._group_tensors_by_device_and_dtype( - [params, grads, momentum_buffer_list], with_indices=True + [params, grads, 
momentum_buffer_list], with_indices=True # type: ignore[list-item] ) for ( device_params, @@ -383,14 +384,14 @@ def _multi_tensor_sgd( ) if maximize: - device_grads = torch._foreach_neg(device_grads) + device_grads = torch._foreach_neg(device_grads) # type: ignore[assignment] if weight_decay != 0: # Re-use the intermediate memory (device_grads) already allocated for maximize if maximize: torch._foreach_add_(device_grads, device_params, alpha=weight_decay) else: - device_grads = torch._foreach_add( + device_grads = torch._foreach_add( # type: ignore[assignment] device_grads, device_params, alpha=weight_decay ) @@ -458,10 +459,12 @@ def _fused_sgd( return if has_sparse_grad: raise RuntimeError("`_fused_sgd` does not support sparse gradients") - grad_scale_dict = ( - {grad_scale.device: grad_scale} if grad_scale is not None else None + grad_scale_dict: DeviceDict = ( + {grad_scale.device: grad_scale} if grad_scale is not None else {} + ) + found_inf_dict: DeviceDict = ( + {found_inf.device: found_inf} if found_inf is not None else {} ) - found_inf_dict = {found_inf.device: found_inf} if found_inf is not None else None no_momentum_buffer = momentum == 0 is_first_step = ( @@ -471,21 +474,19 @@ def _fused_sgd( for i, g in enumerate(grads): momentum_buffer_list[i] = torch.empty_like(g) grouped_tensors = Optimizer._group_tensors_by_device_and_dtype( - [params, grads, momentum_buffer_list], with_indices=False + [params, grads, momentum_buffer_list], with_indices=False # type: ignore[list-item] ) - for (device, dtype), ( + for (device, _), ( (device_params, device_grads, device_momentum_buffer_list), _, ) in grouped_tensors.items(): device_grad_scale, device_found_inf = None, None if grad_scale is not None: - if device not in grad_scale_dict: - grad_scale_dict[device] = grad_scale.to(device) - device_grad_scale = grad_scale_dict[device] - if found_inf is not None: - if device not in found_inf_dict: - found_inf_dict[device] = found_inf.to(device) - device_found_inf = found_inf_dict[device] + device_grad_scale = grad_scale_dict.setdefault( + device, grad_scale.to(device) + ) + if found_inf_dict is not None and found_inf is not None: + device_found_inf = found_inf_dict.setdefault(device, found_inf.to(device)) torch._fused_sgd_( device_params, device_grads, diff --git a/torch/optim/sgd.pyi b/torch/optim/sgd.pyi deleted file mode 100644 index ba1bcd60a1b89..0000000000000 --- a/torch/optim/sgd.pyi +++ /dev/null @@ -1,12 +0,0 @@ -from .optimizer import Optimizer, ParamsT - -class SGD(Optimizer): - def __init__( - self, - params: ParamsT, - lr: float = ..., - momentum: float = ..., - dampening: float = ..., - weight_decay: float = ..., - nesterov: bool = ..., - ) -> None: ... diff --git a/torch/optim/sparse_adam.py b/torch/optim/sparse_adam.py index e3ee2db8204b0..88643d1a56461 100644 --- a/torch/optim/sparse_adam.py +++ b/torch/optim/sparse_adam.py @@ -1,13 +1,21 @@ +from typing import List, Tuple + import torch +from torch import Tensor from . 
import _functional as F -from .optimizer import _maximize_doc, Optimizer +from .optimizer import _maximize_doc, Optimizer, ParamsT __all__ = ["SparseAdam"] class SparseAdam(Optimizer): def __init__( - self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, maximize: bool = False + self, + params: ParamsT, + lr: float = 1e-3, + betas: Tuple[float, float] = (0.9, 0.999), + eps: float = 1e-8, + maximize: bool = False, ): if not 0.0 < lr: raise ValueError(f"Invalid learning rate: {lr}") @@ -56,13 +64,11 @@ def step(self, closure=None): loss = closure() for group in self.param_groups: - params_with_grad = [] - grads = [] - exp_avgs = [] - exp_avg_sqs = [] - state_steps = [] - eps = group["eps"] - lr = group["lr"] + params_with_grad: List[Tensor] = [] + grads: List[Tensor] = [] + exp_avgs: List[Tensor] = [] + exp_avg_sqs: List[Tensor] = [] + state_steps: List[int] = [] beta1, beta2 = group["betas"] maximize = group.get("maximize", False) @@ -103,10 +109,10 @@ def step(self, closure=None): exp_avgs, exp_avg_sqs, state_steps, + eps=group["eps"], beta1=beta1, beta2=beta2, lr=group["lr"], - eps=group["eps"], maximize=maximize, ) diff --git a/torch/optim/sparse_adam.pyi b/torch/optim/sparse_adam.pyi deleted file mode 100644 index a84001d590b8c..0000000000000 --- a/torch/optim/sparse_adam.pyi +++ /dev/null @@ -1,12 +0,0 @@ -from typing import Tuple - -from .optimizer import Optimizer, ParamsT - -class SparseAdam(Optimizer): - def __init__( - self, - params: ParamsT, - lr: float = ..., - betas: Tuple[float, float] = ..., - eps: float = ..., - ) -> None: ... diff --git a/torch/optim/swa_utils.py b/torch/optim/swa_utils.py index 62bb93c906358..7c2c9cdaf6f92 100644 --- a/torch/optim/swa_utils.py +++ b/torch/optim/swa_utils.py @@ -2,7 +2,7 @@ import math import warnings from copy import deepcopy -from typing import Any, Callable, cast, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Callable, Iterable, List, Literal, Optional, Tuple, Union import torch from torch import Tensor @@ -21,11 +21,7 @@ "get_swa_avg_fn", ] -from torch.utils._foreach_utils import ( - _group_tensors_by_device_and_dtype, - Indices, - TensorListList, -) +from torch.utils._foreach_utils import _group_tensors_by_device_and_dtype PARAM_LIST = Union[Tuple[Tensor, ...], List[Tensor]] @@ -192,6 +188,7 @@ class AveragedModel(Module): .. 
_Polyak averaging: https://paperswithcode.com/method/polyak-averaging """ + n_averaged: Tensor def __init__( self, @@ -231,8 +228,8 @@ def update_parameters(self, model: Module): if self.use_buffers else model.parameters() ) - self_param_detached = [] - model_param_detached = [] + self_param_detached: List[Optional[Tensor]] = [] + model_param_detached: List[Optional[Tensor]] = [] for p_averaged, p_model in zip(self_param, model_param): p_model_ = p_model.detach().to(p_averaged.device) self_param_detached.append(p_averaged.detach()) @@ -243,14 +240,7 @@ def update_parameters(self, model: Module): if self.n_averaged > 0: if self.multi_avg_fn is not None or self.avg_fn is None: grouped_tensors = _group_tensors_by_device_and_dtype( - cast(TensorListList, [self_param_detached, model_param_detached]) - ) - grouped_tensors = cast( - Dict[ - Tuple[torch.device, torch.dtype], - Tuple[List[List[Tensor]], Indices], - ], - grouped_tensors, + [self_param_detached, model_param_detached] ) for (device, _), ( [self_params, model_params], @@ -258,9 +248,12 @@ def update_parameters(self, model: Module): ) in grouped_tensors.items(): if self.multi_avg_fn: self.multi_avg_fn( - self_params, model_params, self.n_averaged.to(device) + self_params, model_params, self.n_averaged.to(device) # type: ignore[arg-type] ) - elif device.type in _get_foreach_kernels_supported_devices(): + elif ( + device is not None + and device.type in _get_foreach_kernels_supported_devices() + ): multi_avg_fn = get_swa_multi_avg_fn() multi_avg_fn( self_params, model_params, self.n_averaged.to(device) @@ -268,10 +261,10 @@ def update_parameters(self, model: Module): else: avg_fn = get_swa_avg_fn() n_averaged = self.n_averaged.to(device) - for p_averaged, p_model in zip(self_params, model_params): + for p_averaged, p_model in zip(self_params, model_params): # type: ignore[assignment] p_averaged.copy_(avg_fn(p_averaged, p_model, n_averaged)) else: - for p_averaged, p_model in zip( + for p_averaged, p_model in zip( # type: ignore[assignment] self_param_detached, model_param_detached ): n_averaged = self.n_averaged.to(p_averaged.device) @@ -394,7 +387,7 @@ def __init__( optimizer: Optimizer, swa_lr: float, anneal_epochs=10, - anneal_strategy="cos", + anneal_strategy: Literal["cos", "linear"] = "cos", last_epoch=-1, ): swa_lrs = self._format_param(optimizer, swa_lr) @@ -417,7 +410,10 @@ def __init__( super().__init__(optimizer, last_epoch) @staticmethod - def _format_param(optimizer, swa_lrs): + def _format_param( + optimizer: Optimizer, + swa_lrs: Union[float, List[float], Tuple[float, ...]], + ) -> Union[List[float], Tuple[float, ...]]: if isinstance(swa_lrs, (list, tuple)): if len(swa_lrs) != len(optimizer.param_groups): raise ValueError( diff --git a/torch/profiler/profiler.py b/torch/profiler/profiler.py index 6e40d0f68bd23..f9efd00f1bb06 100644 --- a/torch/profiler/profiler.py +++ b/torch/profiler/profiler.py @@ -148,7 +148,6 @@ def stop(self): def prepare_trace(self): if self.profiler is None: self.profiler = prof.profile( - use_cuda=(ProfilerActivity.CUDA in self.activities), use_cpu=(ProfilerActivity.CPU in self.activities), use_mtia=(ProfilerActivity.MTIA in self.activities), use_device=self.use_device, diff --git a/torch/testing/_internal/autocast_test_lists.py b/torch/testing/_internal/autocast_test_lists.py index 4e2a654562444..8527084f4afa8 100644 --- a/torch/testing/_internal/autocast_test_lists.py +++ b/torch/testing/_internal/autocast_test_lists.py @@ -184,6 +184,7 @@ def __init__(self, dev): ("cross", (torch.randn(3, 
dtype=torch.float32, device=dev), torch.randn(3, dtype=torch.float16, device=dev))), ("dot", pointwise0_fp16 + pointwise1_fp32), + ("vdot", pointwise0_fp16 + pointwise1_fp32), ("grid_sampler", (torch.randn((2, 3, 33, 22), dtype=torch.float16, device=dev), torch.randn((2, 22, 11, 2), dtype=torch.float32, device=dev), 0, 0, False)), diff --git a/torch/testing/_internal/common_fsdp.py b/torch/testing/_internal/common_fsdp.py index 31bf3fafd2379..283982b2ba445 100644 --- a/torch/testing/_internal/common_fsdp.py +++ b/torch/testing/_internal/common_fsdp.py @@ -17,11 +17,14 @@ import torch.distributed as dist import torch.nn as nn import torch.nn.functional as F +from torch.distributed._composable import checkpoint +from torch.distributed._composable.fsdp import fully_shard from torch.distributed._composable.fsdp._fsdp_param_group import ( FSDPParamGroup, RegisterPostBackwardFunction, ) from torch.distributed._tensor import distribute_tensor, DTensor, Shard +from torch.distributed.device_mesh import DeviceMesh from torch.distributed.fsdp import CPUOffload, FullyShardedDataParallel as FSDP from torch.distributed.fsdp._common_utils import TrainingState from torch.distributed.fsdp._init_utils import NO_RESHARD_AFTER_FORWARD_STRATEGIES @@ -32,6 +35,11 @@ ) from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler from torch.distributed.fsdp.wrap import always_wrap_policy, ModuleWrapPolicy, wrap +from torch.distributed.tensor.parallel import ( + ColwiseParallel, + parallelize_module, + RowwiseParallel, +) from torch.nn import TransformerDecoderLayer, TransformerEncoderLayer from torch.nn.parallel.distributed import DistributedDataParallel as DDP from torch.testing._internal.common_distributed import ( @@ -856,6 +864,47 @@ def reset_parameters(self): torch.nn.init.normal_(self.buffer) +class MLPStack(nn.Sequential): + def __init__(self, mlp_dim: int): + modules = [ + nn.LayerNorm(mlp_dim, bias=False), + # Use multiplier of 3 to exercise uneven case + MLP(mlp_dim, dim_multiplier=3), + MLP(mlp_dim), + MLP(mlp_dim, dim_multiplier=3), + ] + super().__init__(*modules) + + def parallelize( + self, + tp_mesh: DeviceMesh, + dp_mesh: DeviceMesh, + use_activation_checkpointing: bool, + reshard_after_forward: bool, + ) -> "MLPStack": + parallelize_module( + self, + device_mesh=tp_mesh, + # Leave the layer norm as implicitly replicated + parallelize_plan={ + # Pass `use_local_output=False` to keep as DTensor to preserve + # uneven activation dims + "1.in_proj": ColwiseParallel(use_local_output=False), + "1.out_proj": RowwiseParallel(use_local_output=False), + "2.in_proj": ColwiseParallel(use_local_output=False), + "2.out_proj": RowwiseParallel(use_local_output=False), + "3.in_proj": ColwiseParallel(use_local_output=False), + "3.out_proj": RowwiseParallel(), + }, + ) + for mlp in self: + if use_activation_checkpointing: + checkpoint(mlp) + fully_shard(mlp, mesh=dp_mesh, reshard_after_forward=reshard_after_forward) + fully_shard(self, mesh=dp_mesh, reshard_after_forward=reshard_after_forward) + return self + + class DoubleLinear(nn.Module): """ This can be used for returning multiple outputs from a module diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 678350789b2b4..d456bb520db06 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -14951,8 +14951,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): # RuntimeError: Cannot insert a Tensor 
that requires grad as a constant. # Consider making it a parameter or input, or detaching the gradient DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit', dtypes=(torch.float32,)), - DecorateInfo(unittest.expectedFailure, 'TestCompositeCompliance', 'test_forward_ad', - active_if=TEST_WITH_ROCM) ], sample_inputs_func=sample_inputs_instance_norm, supports_expanded_weight=True,), @@ -16273,10 +16271,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): DecorateInfo(unittest.expectedFailure, 'TestNNCOpInfo', 'test_nnc_correctness'), DecorateInfo(unittest.skip('Skipped!'), 'TestNNCOpInfo', 'test_nnc_correctness', device_type='cpu', dtypes=(torch.bfloat16, torch.float16)), - # Trying to use forward AD with miopen_batch_norm that does not support it - # because it has not been implemented yet. - DecorateInfo(unittest.expectedFailure, 'TestCompositeCompliance', 'test_forward_ad', - device_type="cuda", active_if=TEST_WITH_ROCM), DecorateInfo(toleranceOverride({torch.float32: tol(atol=5e-05, rtol=1e-05)}), 'TestCompositeCompliance', 'test_forward_ad', device_type="cpu"), )), diff --git a/torch/testing/_internal/common_optimizers.py b/torch/testing/_internal/common_optimizers.py index 5a66923373f74..61396b6226301 100644 --- a/torch/testing/_internal/common_optimizers.py +++ b/torch/testing/_internal/common_optimizers.py @@ -1146,8 +1146,7 @@ def _get_optim_inputs_including_global_cliquey_kwargs( Adagrad, optim_inputs_func=optim_inputs_func_adagrad, optim_error_inputs_func=optim_error_inputs_func_adagrad, - supported_impls=("foreach", "differentiable", "fused"), - supports_fused_on=("cpu",), + supported_impls=("foreach", "differentiable"), supports_sparse=True, metadata_for_sparse=( {"lr": 0.1, "weight_decay": 0, "lr_decay": 0}, @@ -1156,23 +1155,6 @@ def _get_optim_inputs_including_global_cliquey_kwargs( lambda opt: ReduceLROnPlateau(opt, threshold=1e-4), ], ), - decorators=( - DecorateInfo( - # Note on tolerances: - # difference comes from the fact that the non fused kernel have - # more dtype cast operations. 
We have another test test_fused_cpu_matches_cuda - # to make sure there is no discrepancies between cuda fused kernel - # and cpu fused kernel - toleranceOverride( - { - torch.bfloat16: tol(atol=5e-3, rtol=5e-3), - torch.float16: tol(atol=5e-3, rtol=5e-3), - } - ), - "TestOptimRenewed", - "test_fused_matches_forloop", - ), - ), skips=( DecorateInfo( skipIfMps, # addcdiv doesn't work for non-contiguous, see #118115 diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index 435caa69041be..1805134130936 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -1,6 +1,7 @@ # mypy: ignore-errors import copy +import json import itertools import math import os @@ -8,7 +9,7 @@ import sys import tempfile import time -from collections import namedtuple, OrderedDict +from collections import namedtuple, OrderedDict, defaultdict from contextlib import contextmanager, nullcontext from dataclasses import dataclass from datetime import timedelta @@ -204,6 +205,24 @@ def get_profiling_event(event_name, profiler, dedup_gpu_user_annotation=False): ) ] +def get_profiler_nccl_meta(prof): + """Torch profiler includes nccl metadata in an inserted operator called "record_param_comms" + We will need to test metadata obtained from profiler here""" + tf = tempfile.NamedTemporaryFile( + mode="w+t", suffix=".json", delete=False + ) + tf.close() + trace_file = tf.name + + prof.export_chrome_trace(trace_file) + with open(trace_file) as f: + events = json.load(f)["traceEvents"] + print(f"Trace saved to {trace_file}") + + # Comment to debug + os.remove(trace_file) + + return [e for e in events if e.get("name") == "record_param_comms"] # Base error message substring on unfinished reductions. 
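# Standalone sketch of the trace-inspection pattern that get_profiler_nccl_meta()
# above relies on; this single-process CPU run has no NCCL collectives, so the
# filtered list is simply empty here.
import json
import os
import tempfile

import torch
from torch.profiler import profile, ProfilerActivity

with profile(activities=[ProfilerActivity.CPU]) as prof:
    torch.ones(8) + torch.ones(8)

tf = tempfile.NamedTemporaryFile(mode="w+t", suffix=".json", delete=False)
tf.close()
prof.export_chrome_trace(tf.name)
with open(tf.name) as f:
    events = json.load(f)["traceEvents"]
os.remove(tf.name)
print(len([e for e in events if e.get("name") == "record_param_comms"]))  # 0 on CPU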
ddp_prev_reduction_unfinished_str = ( @@ -659,6 +678,33 @@ def _verify_buffers_equal(self, m1, m2): for b in gathered_bufs_m2: self.assertEqual(b, buf2) + def _sanity_check_profiler_nccl_meta(self, nccl_meta_events): + """Torch profiler includes nccl metadata in an inserted operator called "record_param_comms" + We test for basic fields in this profiler event that correspond to the nccl communication + collectives""" + per_coll_meta = defaultdict(list) + for e in nccl_meta_events: + args = e.get("args", {}) + collname = args.get("Collective name", "") + self.assertNotEqual(collname, "") + self.assertNotEqual(args.get("dtype", ""), "") + + per_coll_meta[collname].append(args) + if collname in {"wait"}: + continue + + self.assertEqual(args["Process Group Description"], "default_pg") + self.assertNotEqual(args["Process Group Ranks"], "") + + self.assertGreaterEqual(args.get("In msg nelems", -1), 0) + self.assertGreaterEqual(args.get("Out msg nelems", -1), 0) + self.assertGreaterEqual(args.get("Group size", -1), 0) + self.assertGreaterEqual(args.get("Global rank start", -1), 0) + self.assertGreaterEqual(args.get("Global rank stride", -1), 0) + + # print(per_coll_meta) + return per_coll_meta + def test_dump_DDP_relevant_env_vars(self): with captured_output() as (out, _): _dump_DDP_relevant_env_vars() @@ -1588,6 +1634,7 @@ def _test_send_recv_nccl(self, profiler_ctx=None): for event in events: self.assertTrue(event.input_shapes in expected_shapes) + @skip_if_no_gpu @skip_but_pass_in_sandcastle_if(BACKEND != "nccl", "NCCL Send Recv Only") @requires_nccl_version((2, 7, 0), "Need NCCL 2.7+ for send/recv") @@ -6880,6 +6927,8 @@ def _test_ddp_profiling(self, profiler_ctx): events = get_profiling_event("search_unused_parameters", prof) self.assertEqual(len(events), 1) + return prof + @require_backend_is_available(DistTestCases.backend_feature["gpu"]) @skip_if_lt_x_gpu(2) @skip_but_pass_in_sandcastle("Currently failing in NVIDIA internal CI") @@ -6898,7 +6947,29 @@ def test_ddp_profiling_torch_profiler(self): cpu_act = torch.profiler.ProfilerActivity.CPU cuda_act = torch.profiler.ProfilerActivity.CUDA torch_profiler_ctx = torch.profiler.profile(activities=[cpu_act, cuda_act]) - self._test_ddp_profiling(profiler_ctx=torch_profiler_ctx) + prof = self._test_ddp_profiling(profiler_ctx=torch_profiler_ctx) + + if dist.get_backend() != "nccl": + return + + # Note comment out the "os.remove(trace_file)" in `get_profiler_nccl_meta()` + # to debug any mismatches. 
+ nccl_meta_events = get_profiler_nccl_meta(prof) + self.assertGreater(len(nccl_meta_events), 0) + + nccl_meta = self._sanity_check_profiler_nccl_meta(nccl_meta_events) + + # additionally check the specific collectives in this test case + self.assertEqual(len(nccl_meta["allreduce"]), 2) + self.assertEqual(len(nccl_meta["wait"]), 1) + + # check allreduce message sizes + a0 = nccl_meta["allreduce"][0] + self.assertEqual(a0["Out msg nelems"], 100, msg=f"{a0}") + self.assertEqual(a0["dtype"], "Float", msg=f"{a0}") + a1 = nccl_meta["allreduce"][1] + self.assertEqual(a1["Out msg nelems"], 1, msg=f"{a1}") + self.assertEqual(a1["dtype"], "Int", msg=f"{a1}") @skip_if_lt_x_gpu(2) @skip_but_pass_in_sandcastle_if( diff --git a/torch/utils/_python_dispatch.py b/torch/utils/_python_dispatch.py index d22b550c6d1a5..c417f1d9d72a2 100644 --- a/torch/utils/_python_dispatch.py +++ b/torch/utils/_python_dispatch.py @@ -205,6 +205,7 @@ def _disable_current_modes(): ) from torch._subclasses.functional_tensor import FunctionalTensorMode from torch.fx.experimental.proxy_tensor import ProxyTorchDispatchMode + from torch._subclasses.schema_check_mode import SchemaCheckMode mode_len_pre_dispatch = _len_torch_dispatch_stack_pre_dispatch() old_pre_dispatch_modes = [ @@ -213,12 +214,15 @@ def _disable_current_modes(): has_proxy_mode_in_pre_dispatch = False has_functional_mode_in_pre_dispatch = False + has_schema_check_mode_in_pre_dispatch = False for i in old_pre_dispatch_modes: if isinstance(i, ProxyTorchDispatchMode): has_proxy_mode_in_pre_dispatch = True if isinstance(i, FunctionalTensorMode): has_functional_mode_in_pre_dispatch = True + if isinstance(i, SchemaCheckMode): + has_schema_check_mode_in_pre_dispatch = True mode_len = _len_torch_dispatch_stack() old_modes = [_pop_mode() for _ in range(mode_len)] @@ -235,6 +239,13 @@ def _disable_current_modes(): raise AssertionError( "Can't have ProxyTorchDispatchMode available both in PreDispatch and Python Key" ) + if ( + isinstance(old, SchemaCheckMode) + and has_schema_check_mode_in_pre_dispatch + ): + raise AssertionError( + "Can't have SchemaCheckMode available both in PreDispatch and Python Key" + ) # Manually disable proxy and fake modes, if any are active try: diff --git a/torch/utils/_sympy/symbol.py b/torch/utils/_sympy/symbol.py index ea2d2b7293f36..89908a09e1971 100644 --- a/torch/utils/_sympy/symbol.py +++ b/torch/utils/_sympy/symbol.py @@ -19,6 +19,7 @@ class SymT(Enum): SIZE = auto() + FLOAT = auto() UNBACKED_INT = auto() UNBACKED_FLOAT = auto() # Inductor: The intermediates in inner_fn tmp0, one generated per ops call. @@ -54,7 +55,11 @@ class SymT(Enum): prefix_str = { SymT.SIZE: "s", # integer SymT.UNBACKED_INT: "u", # integer - SymT.UNBACKED_FLOAT: "f", + # Prefix z here is chosen to avoid false aliasing in symbol_is_type test + # DO NOT add a "z" type. 
You also need to avoid conflicts on these + # prefixes but this is somewhat easier to manage + SymT.FLOAT: "zf", + SymT.UNBACKED_FLOAT: "zuf", SymT.TMP: "tmp", SymT.PRECOMPUTED_SIZE: "ps", SymT.INDEX: "i", diff --git a/torch/utils/_sympy/value_ranges.py b/torch/utils/_sympy/value_ranges.py index f2319e930d769..eae126b1b4dcd 100644 --- a/torch/utils/_sympy/value_ranges.py +++ b/torch/utils/_sympy/value_ranges.py @@ -872,12 +872,15 @@ def bound_sympy( # size variables can come with a lower bound of 2, as we specialise on 0 and 1 unbounded_ranges: Dict[sympy.Symbol, ValueRanges] = {} for s in unbounded_vars: - assert s.is_integer # type: ignore[attr-defined] - if s.is_positive: # type: ignore[attr-defined] - lower = 1 - elif s.is_nonnegative: # type: ignore[attr-defined] - lower = 0 + if s.is_integer: # type: ignore[attr-defined] + if s.is_positive: # type: ignore[attr-defined] + lower = 1 + elif s.is_nonnegative: # type: ignore[attr-defined] + lower = 0 + else: + lower = -math.inf # type: ignore[assignment] else: + # Don't bother trying very hard here lower = -math.inf # type: ignore[assignment] unbounded_ranges[s] = ValueRanges(lower, math.inf) # type: ignore[index] ranges = {**ranges, **unbounded_ranges} diff --git a/torch/xpu/streams.py b/torch/xpu/streams.py index 2c3c3a63d58bd..f4e35a376e7c2 100644 --- a/torch/xpu/streams.py +++ b/torch/xpu/streams.py @@ -2,6 +2,7 @@ import torch from torch._streambase import _EventBase, _StreamBase + from .._utils import _dummy_type @@ -34,7 +35,7 @@ def __new__(cls, device=None, priority=0, **kwargs): with torch.xpu.device(device): return super().__new__(cls, priority=priority, **kwargs) - def wait_event(self, event): + def wait_event(self, event) -> None: r"""Make all future work submitted to the stream wait for an event. Args: @@ -42,7 +43,7 @@ def wait_event(self, event): """ event.wait(self) - def wait_stream(self, stream): + def wait_stream(self, stream) -> None: r"""Synchronize with another stream. All future work submitted to this stream will wait until all kernels @@ -68,7 +69,7 @@ def record_event(self, event=None): event.record(self) return event - def query(self): + def query(self) -> bool: r"""Check if all the work submitted has been completed. Returns: @@ -76,7 +77,7 @@ def query(self): """ return super().query() - def synchronize(self): + def synchronize(self) -> None: r"""Wait for all the kernels in this stream to complete.""" super().synchronize() @@ -114,7 +115,7 @@ class Event(torch._C._XpuEventBase, _EventBase): def __new__(cls, enable_timing=False): return super().__new__(cls, enable_timing=enable_timing) - def record(self, stream=None): + def record(self, stream=None) -> None: r"""Record the event in a given stream. Uses ``torch.xpu.current_stream()`` if no stream is specified. The @@ -124,7 +125,7 @@ def record(self, stream=None): stream = torch.xpu.current_stream() super().record(stream) - def wait(self, stream=None): + def wait(self, stream=None) -> None: r"""Make all future work submitted to the given stream wait for this event. Use ``torch.xpu.current_stream()`` if no stream is specified. @@ -133,7 +134,7 @@ def wait(self, stream=None): stream = torch.xpu.current_stream() super().wait(stream) - def query(self): + def query(self) -> bool: r"""Check if all work currently captured by event has completed. Returns: @@ -150,7 +151,7 @@ def elapsed_time(self, end_event): """ return super().elapsed_time(end_event) - def synchronize(self): + def synchronize(self) -> None: r"""Wait for the event to complete. 
Waits until the completion of all work currently captured in this event. diff --git a/torchgen/aoti/fallback_ops.py b/torchgen/aoti/fallback_ops.py index aba595e141925..f77527a156beb 100644 --- a/torchgen/aoti/fallback_ops.py +++ b/torchgen/aoti/fallback_ops.py @@ -65,6 +65,7 @@ "aten.histogram.bin_ct", "aten._histogramdd_bin_edges.default", "aten._histogramdd_from_bin_cts.default", + "aten.index_put.default", "aten.index_reduce.default", "aten.index.Tensor", "aten.kthvalue.default", @@ -82,7 +83,9 @@ "aten.mm.out", "aten.mode.default", "aten.mul.Scalar", + "aten.mul.Tensor", "aten.nanmedian.default", + "aten.native_dropout.default", "aten.nonzero.default", "aten.ormqr.default", "aten._pdist_backward.default", @@ -93,6 +96,8 @@ "aten.rand.default", "aten.rand.generator", "aten.randint.default", + "aten.randint.generator", + "aten.randint.low_out", "aten.randn.default", "aten.randn.generator", "aten.randperm.default", @@ -110,9 +115,11 @@ "aten._scaled_mm.default", "aten.scatter_reduce.two_out", "aten.scatter.src_out", + "aten.scatter.value_out", "aten.searchsorted.default", "aten._segment_reduce_backward.default", "aten.segment_reduce.default", + "aten.slice.Tensor", "aten.soft_margin_loss_backward.default", "aten.sort.default", "aten.sort.stable", diff --git a/torchgen/gen.py b/torchgen/gen.py index 28e46c3536e6d..d715361146ea0 100644 --- a/torchgen/gen.py +++ b/torchgen/gen.py @@ -49,8 +49,8 @@ from torchgen.gen_aoti_c_shim import ( gen_aoti_c_shim, gen_static_dispatch_backend_call_signature, - get_backend_index_for_aoti, get_fallback_op_name, + get_header_for_aoti, ) from torchgen.gen_functionalization_type import ( gen_functionalization_definition, @@ -2353,54 +2353,28 @@ def operator_headers() -> List[str]: else: raise AssertionError(f"unrecognized {dispatch_key} for ufunc") - structured_func_group_dict = { - f"{func_group.functional.namespace}.{func_group.functional.func.name}": func_group - for func_group in structured_native_functions - } + structured_func_group_dict = dict() + for func_group in structured_native_functions: + for func in func_group.functions(): + if func.structured_delegate is not None: + structured_func_group_dict[func.structured_delegate] = func_group + break + if dispatch_key in (DispatchKey.CPU, DispatchKey.CUDA): fallbacks = dict() for func in native_functions: op_name = get_fallback_op_name(func) if op_name in inductor_fallback_ops: - fallbacks[op_name] = ( - func, - structured_func_group_dict.get( - f"{func.namespace}.{func.func.name.name}", None - ), - ) + fallbacks[op_name] = func fallback_native_functions = tuple( value for _, value in sorted(fallbacks.items()) ) - def get_header( - func: NativeFunction, - func_group: Optional[NativeFunctionsGroup], - ) -> Optional[str]: - backend_index = get_backend_index_for_aoti( - func, func_group, dispatch_key, backend_indices - ) - return ( - None - if backend_index is None - else f"#include " - ) - - def headers_for_aoti() -> str: - headers = [] - for func, func_group in fallback_native_functions: - header = get_header(func, func_group) - if header is not None: - headers.append(header) - return "\n".join(sorted(set(headers))) - - extra_headers = ( - extra_cuda_headers if is_cuda_dispatch_key(dispatch_key) else "" - ) - # header files were checked in for ABI-compatiblilty checking header_file_name = f"c_shim_{dispatch_key.lower()}.h" new_header = gen_aoti_c_shim( fallback_native_functions, + structured_func_group_dict, dispatch_key, backend_indices, header=True, @@ -2442,10 +2416,25 @@ def headers_for_aoti() -> str: 
) # cpp files are always generated on-the-fly + def headers_for_aoti() -> str: + headers = [] + for func in fallback_native_functions: + header = get_header_for_aoti( + func, structured_func_group_dict, dispatch_key, backend_indices + ) + if header is not None: + headers.append(header) + return "\n".join(sorted(set(headers))) + + extra_headers = ( + extra_cuda_headers if is_cuda_dispatch_key(dispatch_key) else "" + ) + aoti_fm.write( f"c_shim_{dispatch_key.lower()}.cpp", lambda: gen_aoti_c_shim( fallback_native_functions, + structured_func_group_dict, dispatch_key, backend_indices, header=False, diff --git a/torchgen/gen_aoti_c_shim.py b/torchgen/gen_aoti_c_shim.py index 0d31bd14a5e6f..1f99e3a9f3fae 100644 --- a/torchgen/gen_aoti_c_shim.py +++ b/torchgen/gen_aoti_c_shim.py @@ -16,6 +16,7 @@ ListType, NativeFunction, NativeFunctionsGroup, + OperatorName, OptionalType, Type, ) @@ -209,7 +210,11 @@ def convert_return(typ: BaseType, val: str) -> str: ret_pointer_can_be_null = False unambiguous_name = schema.name.unambiguous_name() - for name in ["_scaled_dot_product_flash_attention", "convolution_backward"]: + for name in [ + "_scaled_dot_product_flash_attention", + "_scaled_dot_product_efficient_attention", + "convolution_backward", + ]: if name in unambiguous_name: ret_pointer_can_be_null = True break @@ -302,15 +307,17 @@ def gen_static_dispatch_backend_call( def get_backend_index_for_aoti( func: NativeFunction, - func_group: Optional[NativeFunctionsGroup], + func_group_mapping: Dict[OperatorName, NativeFunctionsGroup], dispatch_key: DispatchKey, backend_indices: Dict[DispatchKey, BackendIndex], ) -> Optional[BackendIndex]: backend_index = None if backend_indices[dispatch_key].has_kernel(func) or ( func.structured_delegate is not None - and func_group is not None - and backend_indices[dispatch_key].has_kernel(func_group) + and func.structured_delegate in func_group_mapping + and backend_indices[dispatch_key].has_kernel( + func_group_mapping[func.structured_delegate] + ) ): backend_index = backend_indices[dispatch_key] elif backend_indices[DispatchKey.CompositeExplicitAutograd].has_kernel(func): @@ -327,6 +334,22 @@ def get_backend_index_for_aoti( return backend_index +def get_header_for_aoti( + func: NativeFunction, + func_group_mapping: Dict[OperatorName, NativeFunctionsGroup], + dispatch_key: DispatchKey, + backend_indices: Dict[DispatchKey, BackendIndex], +) -> Optional[str]: + backend_index = get_backend_index_for_aoti( + func, func_group_mapping, dispatch_key, backend_indices + ) + return ( + None + if backend_index is None + else f"#include " + ) + + def get_fallback_op_name(func: NativeFunction) -> str: return ( f"{func.namespace}.{func.func.name.name}.{func.func.name.overload_name}" @@ -337,13 +360,13 @@ def get_fallback_op_name(func: NativeFunction) -> str: def gen_c_shim( func: NativeFunction, - func_group: Optional[NativeFunctionsGroup], + func_group_mapping: Dict[OperatorName, NativeFunctionsGroup], dispatch_key: DispatchKey, backend_indices: Dict[DispatchKey, BackendIndex], header: bool, ) -> Optional[str]: backend_index = get_backend_index_for_aoti( - func, func_group, dispatch_key, backend_indices + func, func_group_mapping, dispatch_key, backend_indices ) if backend_index is None: return None @@ -371,7 +394,7 @@ def gen_c_shim( @dataclass(frozen=True) class ShimGenerator: - func_group_mapping: Dict[str, Optional[NativeFunctionsGroup]] + func_group_mapping: Dict[OperatorName, NativeFunctionsGroup] dispatch_key: DispatchKey backend_indices: Dict[DispatchKey, BackendIndex] 
header: bool # True to generate .h and False to generate .cpp @@ -383,7 +406,7 @@ def __call__( ) -> Optional[str]: result = gen_c_shim( func, - self.func_group_mapping.get(get_fallback_op_name(func), None), + self.func_group_mapping, self.dispatch_key, self.backend_indices, self.header, @@ -392,22 +415,20 @@ def __call__( def gen_aoti_c_shim( - native_functions: Sequence[Tuple[NativeFunction, Optional[NativeFunctionsGroup]]], + native_functions: Sequence[NativeFunction], + func_group_mapping: Dict[OperatorName, NativeFunctionsGroup], dispatch_key: DispatchKey, backend_indices: Dict[DispatchKey, BackendIndex], header: bool, includes: str = "", ) -> str: - func_group_mapping = { - get_fallback_op_name(func): func_group for func, func_group in native_functions - } body = "\n".join( list( mapMaybe( ShimGenerator( func_group_mapping, dispatch_key, backend_indices, header ), - [func for func, _ in native_functions], + native_functions, ) ) )
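# Sketch with stand-in dataclasses (torchgen's real NativeFunction and
# NativeFunctionsGroup types are richer): this mirrors how gen.py above now keys the
# structured-group mapping by each function's structured_delegate (an OperatorName),
# so gen_aoti_c_shim() can look a fallback op's group up directly instead of being
# handed (func, group) pairs.
from dataclasses import dataclass, field
from typing import List, Optional

@dataclass(frozen=True)
class Func:
    name: str
    structured_delegate: Optional[str] = None  # stands in for OperatorName

@dataclass
class Group:
    funcs: List[Func] = field(default_factory=list)

    def functions(self) -> List[Func]:
        return self.funcs

groups = [Group([Func("add.Tensor", "add.out"), Func("add.out")])]
mapping = {}
for func_group in groups:
    for func in func_group.functions():
        if func.structured_delegate is not None:
            mapping[func.structured_delegate] = func_group
            break
print("add.out" in mapping)  # True: the group is found via the delegate's name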